From 609f7bd8fd35b1d76086d7f822deb4e1260b6e47 Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Fri, 17 Feb 2023 00:16:29 +0200 Subject: [PATCH 1/8] feat(rebalance): port apps from 4.x --- Makefile | 1 + apps/emqx/include/asserts.hrl | 31 + apps/emqx/include/emqx_channel.hrl | 42 + apps/emqx/include/emqx_hooks.hrl | 1 + apps/emqx/priv/bpapi.versions | 5 + apps/emqx/src/emqx_channel.erl | 54 +- apps/emqx/src/emqx_cm.erl | 49 +- apps/emqx/src/emqx_router_helper.erl | 12 +- .../test/emqx_dashboard_api_test_helpers.erl | 6 +- apps/emqx_machine/src/emqx_machine_boot.erl | 10 +- changes/ee/feat-10075.en.md | 1 + changes/ee/feat-10075.zh.md | 1 + lib-ee/emqx_eviction_agent/README.md | 9 + .../etc/emqx_eviction_agent.conf | 3 + .../i18n/emqx_eviction_agent_api_i18n.conf | 14 + lib-ee/emqx_eviction_agent/rebar.config | 2 + .../src/emqx_eviction_agent.app.src | 22 + .../src/emqx_eviction_agent.appup.src | 3 + .../src/emqx_eviction_agent.erl | 346 ++++++++ .../src/emqx_eviction_agent_api.erl | 85 ++ .../src/emqx_eviction_agent_app.erl | 24 + .../src/emqx_eviction_agent_channel.erl | 368 +++++++++ .../src/emqx_eviction_agent_cli.erl | 30 + .../src/emqx_eviction_agent_conn_sup.erl | 21 + .../src/emqx_eviction_agent_sup.erl | 34 + .../proto/emqx_eviction_agent_proto_v1.erl | 27 + .../test/emqx_eviction_agent_SUITE.erl | 403 ++++++++++ .../test/emqx_eviction_agent_api_SUITE.erl | 69 ++ .../emqx_eviction_agent_channel_SUITE.erl | 251 ++++++ .../test/emqx_eviction_agent_cli_SUITE.erl | 39 + .../test/emqx_eviction_agent_test_helpers.erl | 141 ++++ lib-ee/emqx_node_rebalance/README.md | 9 + .../etc/emqx_node_rebalance.conf | 3 + .../i18n/emqx_node_rebalance_api_i18n.conf | 490 ++++++++++++ .../include/emqx_node_rebalance.hrl | 33 + lib-ee/emqx_node_rebalance/rebar.config | 2 + .../src/emqx_node_rebalance.app.src | 22 + .../src/emqx_node_rebalance.appup.src | 3 + .../src/emqx_node_rebalance.erl | 438 +++++++++++ .../src/emqx_node_rebalance_agent.erl | 131 ++++ 
.../src/emqx_node_rebalance_api.erl | 738 ++++++++++++++++++ .../src/emqx_node_rebalance_app.erl | 22 + .../src/emqx_node_rebalance_cli.erl | 305 ++++++++ .../src/emqx_node_rebalance_evacuation.erl | 308 ++++++++ ...emqx_node_rebalance_evacuation_persist.erl | 120 +++ .../src/emqx_node_rebalance_status.erl | 238 ++++++ .../src/emqx_node_rebalance_sup.erl | 35 + .../emqx_node_rebalance_api_proto_v1.erl | 43 + ...mqx_node_rebalance_evacuation_proto_v1.erl | 22 + .../proto/emqx_node_rebalance_proto_v1.erl | 62 ++ .../emqx_node_rebalance_status_proto_v1.erl | 36 + .../test/emqx_node_rebalance_SUITE.erl | 229 ++++++ .../test/emqx_node_rebalance_agent_SUITE.erl | 214 +++++ .../test/emqx_node_rebalance_api_SUITE.erl | 444 +++++++++++ .../test/emqx_node_rebalance_cli_SUITE.erl | 291 +++++++ .../emqx_node_rebalance_evacuation_SUITE.erl | 271 +++++++ ...ode_rebalance_evacuation_persist_SUITE.erl | 108 +++ mix.exs | 4 +- rebar.config.erl | 4 +- 59 files changed, 6686 insertions(+), 43 deletions(-) create mode 100644 apps/emqx/include/asserts.hrl create mode 100644 apps/emqx/include/emqx_channel.hrl create mode 100644 changes/ee/feat-10075.en.md create mode 100644 changes/ee/feat-10075.zh.md create mode 100644 lib-ee/emqx_eviction_agent/README.md create mode 100644 lib-ee/emqx_eviction_agent/etc/emqx_eviction_agent.conf create mode 100644 lib-ee/emqx_eviction_agent/i18n/emqx_eviction_agent_api_i18n.conf create mode 100644 lib-ee/emqx_eviction_agent/rebar.config create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.app.src create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.appup.src create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.erl create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_api.erl create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_app.erl create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl create mode 100644 
lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl create mode 100644 lib-ee/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v1.erl create mode 100644 lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl create mode 100644 lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl create mode 100644 lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl create mode 100644 lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl create mode 100644 lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl create mode 100644 lib-ee/emqx_node_rebalance/README.md create mode 100644 lib-ee/emqx_node_rebalance/etc/emqx_node_rebalance.conf create mode 100644 lib-ee/emqx_node_rebalance/i18n/emqx_node_rebalance_api_i18n.conf create mode 100644 lib-ee/emqx_node_rebalance/include/emqx_node_rebalance.hrl create mode 100644 lib-ee/emqx_node_rebalance/rebar.config create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.appup.src create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_api.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_app.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_status.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl create mode 100644 
lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v1.erl create mode 100644 lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_evacuation_proto_v1.erl create mode 100644 lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v1.erl create mode 100644 lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v1.erl create mode 100644 lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl create mode 100644 lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl create mode 100644 lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl create mode 100644 lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl create mode 100644 lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl create mode 100644 lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl diff --git a/Makefile b/Makefile index 6741317ee..24af58ebc 100644 --- a/Makefile +++ b/Makefile @@ -179,6 +179,7 @@ clean-all: @rm -f rebar.lock @rm -rf deps @rm -rf _build + @rm -f emqx_dialyzer_*_plt .PHONY: deps-all deps-all: $(REBAR) $(PROFILES:%=deps-%) diff --git a/apps/emqx/include/asserts.hrl b/apps/emqx/include/asserts.hrl new file mode 100644 index 000000000..98d8e72fc --- /dev/null +++ b/apps/emqx/include/asserts.hrl @@ -0,0 +1,31 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +%% This file contains common macros for testing. +%% It must not be used anywhere except in test suites. + +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-define(assertWaitEvent(Code, EventMatch, Timeout), + ?assertMatch( + {_, {ok, EventMatch}}, + ?wait_async_action( + Code, + EventMatch, + Timeout + ) + ) +). diff --git a/apps/emqx/include/emqx_channel.hrl b/apps/emqx/include/emqx_channel.hrl new file mode 100644 index 000000000..d4362633a --- /dev/null +++ b/apps/emqx/include/emqx_channel.hrl @@ -0,0 +1,42 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2017-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-define(CHANNEL_METRICS, [ + recv_pkt, + recv_msg, + 'recv_msg.qos0', + 'recv_msg.qos1', + 'recv_msg.qos2', + 'recv_msg.dropped', + 'recv_msg.dropped.await_pubrel_timeout', + send_pkt, + send_msg, + 'send_msg.qos0', + 'send_msg.qos1', + 'send_msg.qos2', + 'send_msg.dropped', + 'send_msg.dropped.expired', + 'send_msg.dropped.queue_full', + 'send_msg.dropped.too_large' +]). + +-define(INFO_KEYS, [ + conninfo, + conn_state, + clientinfo, + session, + will_msg +]). 
diff --git a/apps/emqx/include/emqx_hooks.hrl b/apps/emqx/include/emqx_hooks.hrl index 1665492c5..2373b5928 100644 --- a/apps/emqx/include/emqx_hooks.hrl +++ b/apps/emqx/include/emqx_hooks.hrl @@ -34,6 +34,7 @@ -define(HP_BRIDGE, 870). -define(HP_DELAY_PUB, 860). %% apps that can stop the hooks chain from continuing +-define(HP_NODE_REBALANCE, 110). -define(HP_EXHOOK, 100). %% == Lowest Priority = 0, don't change this value as the plugins may depend on it. diff --git a/apps/emqx/priv/bpapi.versions b/apps/emqx/priv/bpapi.versions index db4765e3f..dceb38c47 100644 --- a/apps/emqx/priv/bpapi.versions +++ b/apps/emqx/priv/bpapi.versions @@ -13,6 +13,7 @@ {emqx_conf,2}. {emqx_dashboard,1}. {emqx_delayed,1}. +{emqx_eviction_agent,1}. {emqx_exhook,1}. {emqx_gateway_api_listeners,1}. {emqx_gateway_cm,1}. @@ -26,6 +27,10 @@ {emqx_mgmt_cluster,1}. {emqx_mgmt_trace,1}. {emqx_mgmt_trace,2}. +{emqx_node_rebalance,1}. +{emqx_node_rebalance_api,1}. +{emqx_node_rebalance_evacuation,1}. +{emqx_node_rebalance_status,1}. {emqx_persistent_session,1}. {emqx_plugin_libs,1}. {emqx_plugins,1}. diff --git a/apps/emqx/src/emqx_channel.erl b/apps/emqx/src/emqx_channel.erl index 862b72c06..69e0a55f7 100644 --- a/apps/emqx/src/emqx_channel.erl +++ b/apps/emqx/src/emqx_channel.erl @@ -18,6 +18,7 @@ -module(emqx_channel). -include("emqx.hrl"). +-include("emqx_channel.hrl"). -include("emqx_mqtt.hrl"). -include("logger.hrl"). -include("types.hrl"). @@ -57,6 +58,12 @@ clear_keepalive/1 ]). +%% Export for emqx_channel implementations +-export([ + maybe_nack/1, + maybe_mark_as_delivered/2 +]). + %% Exports for CT -export([set_field/3]). @@ -69,7 +76,7 @@ ] ). --export_type([channel/0, opts/0]). +-export_type([channel/0, opts/0, conn_state/0]). -record(channel, { %% MQTT ConnInfo @@ -131,33 +138,6 @@ quota_timer => expire_quota_limit }). 
--define(CHANNEL_METRICS, [ - recv_pkt, - recv_msg, - 'recv_msg.qos0', - 'recv_msg.qos1', - 'recv_msg.qos2', - 'recv_msg.dropped', - 'recv_msg.dropped.await_pubrel_timeout', - send_pkt, - send_msg, - 'send_msg.qos0', - 'send_msg.qos1', - 'send_msg.qos2', - 'send_msg.dropped', - 'send_msg.dropped.expired', - 'send_msg.dropped.queue_full', - 'send_msg.dropped.too_large' -]). - --define(INFO_KEYS, [ - conninfo, - conn_state, - clientinfo, - session, - will_msg -]). - -define(LIMITER_ROUTING, message_routing). -dialyzer({no_match, [shutdown/4, ensure_timer/2, interval/2]}). @@ -1078,10 +1058,12 @@ handle_out(unsuback, {PacketId, _ReasonCodes}, Channel) -> handle_out(disconnect, ReasonCode, Channel) when is_integer(ReasonCode) -> ReasonName = disconnect_reason(ReasonCode), handle_out(disconnect, {ReasonCode, ReasonName}, Channel); -handle_out(disconnect, {ReasonCode, ReasonName}, Channel = ?IS_MQTT_V5) -> - Packet = ?DISCONNECT_PACKET(ReasonCode), +handle_out(disconnect, {ReasonCode, ReasonName}, Channel) -> + handle_out(disconnect, {ReasonCode, ReasonName, #{}}, Channel); +handle_out(disconnect, {ReasonCode, ReasonName, Props}, Channel = ?IS_MQTT_V5) -> + Packet = ?DISCONNECT_PACKET(ReasonCode, Props), {ok, [{outgoing, Packet}, {close, ReasonName}], Channel}; -handle_out(disconnect, {_ReasonCode, ReasonName}, Channel) -> +handle_out(disconnect, {_ReasonCode, ReasonName, _Props}, Channel) -> {ok, {close, ReasonName}, Channel}; handle_out(auth, {ReasonCode, Properties}, Channel) -> {ok, ?AUTH_PACKET(ReasonCode, Properties), Channel}; @@ -1198,13 +1180,19 @@ handle_call( {takeover, 'end'}, Channel = #channel{ session = Session, - pendings = Pendings + pendings = Pendings, + conninfo = #{clientid := ClientId} } ) -> ok = emqx_session:takeover(Session), %% TODO: Should not drain deliver here (side effect) Delivers = emqx_utils:drain_deliver(), AllPendings = lists:append(Delivers, Pendings), + ?tp( + debug, + emqx_channel_takeover_end, + #{clientid => ClientId} + ), 
disconnect_and_shutdown(takenover, AllPendings, Channel); handle_call(list_authz_cache, Channel) -> {reply, emqx_authz_cache:list_authz_cache(), Channel}; @@ -1276,6 +1264,8 @@ handle_info(die_if_test = Info, Channel) -> die_if_test_compiled(), ?SLOG(error, #{msg => "unexpected_info", info => Info}), {ok, Channel}; +handle_info({disconnect, ReasonCode, ReasonName, Props}, Channel) -> + handle_out(disconnect, {ReasonCode, ReasonName, Props}, Channel); handle_info(Info, Channel) -> ?SLOG(error, #{msg => "unexpected_info", info => Info}), {ok, Channel}. diff --git a/apps/emqx/src/emqx_cm.erl b/apps/emqx/src/emqx_cm.erl index 0290b57d3..c8296f317 100644 --- a/apps/emqx/src/emqx_cm.erl +++ b/apps/emqx/src/emqx_cm.erl @@ -23,6 +23,8 @@ -include("logger.hrl"). -include("types.hrl"). -include_lib("snabbkaffe/include/snabbkaffe.hrl"). +-include_lib("stdlib/include/qlc.hrl"). +-include_lib("stdlib/include/ms_transform.hrl"). -export([start_link/0]). @@ -72,6 +74,12 @@ get_session_confs/2 ]). +%% Client management +-export([ + channel_with_session_table/1, + live_connection_table/1 +]). + %% gen_server callbacks -export([ init/1, @@ -593,6 +601,40 @@ all_channels() -> Pat = [{{'_', '$1'}, [], ['$1']}], ets:select(?CHAN_TAB, Pat). +%% @doc Get clientinfo for all clients with sessions +channel_with_session_table(ConnModules) -> + Ms = ets:fun2ms( + fun({{ClientId, _ChanPid}, Info, _Stats}) -> + {ClientId, Info} + end + ), + Table = ets:table(?CHAN_INFO_TAB, [{traverse, {select, Ms}}]), + ConnModuleMap = maps:from_list([{Mod, true} || Mod <- ConnModules]), + qlc:q([ + {ClientId, ConnState, ConnInfo, ClientInfo} + || {ClientId, #{ + conn_state := ConnState, + clientinfo := ClientInfo, + conninfo := #{clean_start := false, conn_mod := ConnModule} = ConnInfo + }} <- + Table, + maps:is_key(ConnModule, ConnModuleMap) + ]). 
+ +%% @doc Get all local connection query handle +live_connection_table(ConnModules) -> + Ms = lists:map(fun live_connection_ms/1, ConnModules), + Table = ets:table(?CHAN_CONN_TAB, [{traverse, {select, Ms}}]), + qlc:q([{ClientId, ChanPid} || {ClientId, ChanPid} <- Table, is_channel_connected(ChanPid)]). + +live_connection_ms(ConnModule) -> + {{{'$1', '$2'}, ConnModule}, [], [{{'$1', '$2'}}]}. + +is_channel_connected(ChanPid) when node(ChanPid) =:= node() -> + ets:member(?CHAN_LIVE_TAB, ChanPid); +is_channel_connected(_ChanPid) -> + false. + %% @doc Get all registered clientIDs. Debug/test interface all_client_ids() -> Pat = [{{'$1', '_'}, [], ['$1']}], @@ -693,7 +735,8 @@ code_change(_OldVsn, State, _Extra) -> %%-------------------------------------------------------------------- clean_down({ChanPid, ClientId}) -> - do_unregister_channel({ClientId, ChanPid}). + do_unregister_channel({ClientId, ChanPid}), + ok = ?tp(debug, emqx_cm_clean_down, #{client_id => ClientId}). stats_fun() -> lists:foreach(fun update_stats/1, ?CHAN_STATS). @@ -719,12 +762,12 @@ get_chann_conn_mod(ClientId, ChanPid) -> wrap_rpc(emqx_cm_proto_v1:get_chann_conn_mod(ClientId, ChanPid)). mark_channel_connected(ChanPid) -> - ?tp(emqx_cm_connected_client_count_inc, #{}), + ?tp(emqx_cm_connected_client_count_inc, #{chan_pid => ChanPid}), ets:insert_new(?CHAN_LIVE_TAB, {ChanPid, true}), ok. mark_channel_disconnected(ChanPid) -> - ?tp(emqx_cm_connected_client_count_dec, #{}), + ?tp(emqx_cm_connected_client_count_dec, #{chan_pid => ChanPid}), ets:delete(?CHAN_LIVE_TAB, ChanPid), ok. diff --git a/apps/emqx/src/emqx_router_helper.erl b/apps/emqx/src/emqx_router_helper.erl index e2d54b99e..4bff98072 100644 --- a/apps/emqx/src/emqx_router_helper.erl +++ b/apps/emqx/src/emqx_router_helper.erl @@ -167,9 +167,15 @@ handle_info(Info, State) -> {noreply, State}. 
terminate(_Reason, _State) -> - ok = ekka:unmonitor(membership), - emqx_stats:cancel_update(route_stats), - mnesia:unsubscribe({table, ?ROUTING_NODE, simple}). + try + ok = ekka:unmonitor(membership), + emqx_stats:cancel_update(route_stats), + mnesia:unsubscribe({table, ?ROUTING_NODE, simple}) + catch + exit:{noproc, {gen_server, call, [mria_membership, _]}} -> + ?SLOG(warning, #{msg => "mria_membership_down"}), + ok + end. code_change(_OldVsn, State, _Extra) -> {ok, State}. diff --git a/apps/emqx_dashboard/test/emqx_dashboard_api_test_helpers.erl b/apps/emqx_dashboard/test/emqx_dashboard_api_test_helpers.erl index 91c7729d3..25b4065de 100644 --- a/apps/emqx_dashboard/test/emqx_dashboard_api_test_helpers.erl +++ b/apps/emqx_dashboard/test/emqx_dashboard_api_test_helpers.erl @@ -20,6 +20,7 @@ set_default_config/0, set_default_config/1, set_default_config/2, + set_default_config/3, request/2, request/3, request/4, @@ -40,11 +41,14 @@ set_default_config(DefaultUsername) -> set_default_config(DefaultUsername, false). set_default_config(DefaultUsername, HAProxyEnabled) -> + set_default_config(DefaultUsername, HAProxyEnabled, #{}). + +set_default_config(DefaultUsername, HAProxyEnabled, Opts) -> Config = #{ listeners => #{ http => #{ enable => true, - bind => 18083, + bind => maps:get(bind, Opts, 18083), inet6 => false, ipv6_v6only => false, max_connections => 512, diff --git a/apps/emqx_machine/src/emqx_machine_boot.erl b/apps/emqx_machine/src/emqx_machine_boot.erl index 82b3d602f..e3f84079b 100644 --- a/apps/emqx_machine/src/emqx_machine_boot.erl +++ b/apps/emqx_machine/src/emqx_machine_boot.erl @@ -149,8 +149,14 @@ basic_reboot_apps() -> emqx_plugins ], case emqx_release:edition() of - ce -> CE; - ee -> CE ++ [] + ce -> + CE; + ee -> + CE ++ + [ + emqx_eviction_agent, + emqx_node_rebalance + ] end. 
sorted_reboot_apps() -> diff --git a/changes/ee/feat-10075.en.md b/changes/ee/feat-10075.en.md new file mode 100644 index 000000000..e6e070ddc --- /dev/null +++ b/changes/ee/feat-10075.en.md @@ -0,0 +1 @@ +Add node rebalance/node evacuation [functionality](https://github.com/emqx/eip/blob/main/active/0020-node-rebalance.md). diff --git a/changes/ee/feat-10075.zh.md b/changes/ee/feat-10075.zh.md new file mode 100644 index 000000000..36c78acb8 --- /dev/null +++ b/changes/ee/feat-10075.zh.md @@ -0,0 +1 @@ +添加节点再平衡/节点疏散[功能](https://github.com/emqx/eip/blob/main/active/0020-node-rebalance.md)。 diff --git a/lib-ee/emqx_eviction_agent/README.md b/lib-ee/emqx_eviction_agent/README.md new file mode 100644 index 000000000..f9b8037bf --- /dev/null +++ b/lib-ee/emqx_eviction_agent/README.md @@ -0,0 +1,9 @@ +emqx_eviction_agent +===== + +An OTP library + +Build +----- + + $ rebar3 compile diff --git a/lib-ee/emqx_eviction_agent/etc/emqx_eviction_agent.conf b/lib-ee/emqx_eviction_agent/etc/emqx_eviction_agent.conf new file mode 100644 index 000000000..011b7fb0f --- /dev/null +++ b/lib-ee/emqx_eviction_agent/etc/emqx_eviction_agent.conf @@ -0,0 +1,3 @@ +##-------------------------------------------------------------------- +## EMQX Eviction Agent Plugin +##-------------------------------------------------------------------- diff --git a/lib-ee/emqx_eviction_agent/i18n/emqx_eviction_agent_api_i18n.conf b/lib-ee/emqx_eviction_agent/i18n/emqx_eviction_agent_api_i18n.conf new file mode 100644 index 000000000..8bb7282c3 --- /dev/null +++ b/lib-ee/emqx_eviction_agent/i18n/emqx_eviction_agent_api_i18n.conf @@ -0,0 +1,14 @@ +emqx_eviction_agent_api { + + node_eviction_status_get { + desc { + en: "Get the node eviction status" + zh: "获取节点驱逐状态" + } + label { + en: "Node Eviction Status" + zh: "节点驱逐状态" + } + } + +} diff --git a/lib-ee/emqx_eviction_agent/rebar.config b/lib-ee/emqx_eviction_agent/rebar.config new file mode 100644 index 000000000..b055d8f4f --- /dev/null +++ 
b/lib-ee/emqx_eviction_agent/rebar.config
@@ -0,0 +1,2 @@
+{deps, [{emqx, {path, "../../apps/emqx"}}]}.
+{project_plugins, [erlfmt]}.
diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.app.src b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.app.src
new file mode 100644
index 000000000..a360133f4
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.app.src
@@ -0,0 +1,22 @@
+{application, emqx_eviction_agent, [
+    {description, "EMQX Eviction Agent"},
+    {vsn, "5.0.0"},
+    {registered, [
+        emqx_eviction_agent_sup,
+        emqx_eviction_agent,
+        emqx_eviction_agent_conn_sup
+    ]},
+    {applications, [
+        kernel,
+        stdlib,
+        emqx_ctl
+    ]},
+    {mod, {emqx_eviction_agent_app, []}},
+    {env, []},
+    {modules, []},
+    {maintainers, ["EMQX Team <contact@emqx.io>"]},
+    {links, [
+        {"Homepage", "https://emqx.io/"},
+        {"Github", "https://github.com/emqx"}
+    ]}
+]}.
diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.appup.src b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.appup.src
new file mode 100644
index 000000000..c1b84778d
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.appup.src
@@ -0,0 +1,3 @@
+%% -*- mode: erlang -*-
+%% Unless you know what you are doing, DO NOT edit manually!!
+{VSN, [{<<".*">>, []}], [{<<".*">>, []}]}.
diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.erl
new file mode 100644
index 000000000..b8e1b5236
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.erl
@@ -0,0 +1,346 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent).
+
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("emqx/include/logger.hrl").
+-include_lib("emqx/include/types.hrl").
+-include_lib("emqx/include/emqx_hooks.hrl").
+ +-include_lib("stdlib/include/qlc.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-export([ + start_link/0, + enable/2, + disable/1, + status/0, + connection_count/0, + session_count/0, + session_count/1, + evict_connections/1, + evict_sessions/2, + evict_sessions/3, + evict_session_channel/3 +]). + +-behaviour(gen_server). + +-export([ + init/1, + handle_call/3, + handle_info/2, + handle_cast/2, + code_change/3 +]). + +-export([ + on_connect/2, + on_connack/3 +]). + +-export([ + hook/0, + unhook/0 +]). + +-export_type([server_reference/0]). + +-define(CONN_MODULES, [emqx_connection, emqx_ws_connection, emqx_eviction_agent_channel]). + +%%-------------------------------------------------------------------- +%% APIs +%%-------------------------------------------------------------------- + +-type server_reference() :: binary() | undefined. +-type status() :: {enabled, conn_stats()} | disabled. +-type conn_stats() :: #{ + connections := non_neg_integer(), + sessions := non_neg_integer() +}. +-type kind() :: atom(). + +-spec start_link() -> startlink_ret(). +start_link() -> + gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). + +-spec enable(kind(), server_reference()) -> ok_or_error(eviction_agent_busy). +enable(Kind, ServerReference) -> + gen_server:call(?MODULE, {enable, Kind, ServerReference}). + +-spec disable(kind()) -> ok. +disable(Kind) -> + gen_server:call(?MODULE, {disable, Kind}). + +-spec status() -> status(). +status() -> + case enable_status() of + {enabled, _Kind, _ServerReference} -> + {enabled, stats()}; + disabled -> + disabled + end. + +-spec evict_connections(pos_integer()) -> ok_or_error(disabled). +evict_connections(N) -> + case enable_status() of + {enabled, _Kind, ServerReference} -> + ok = do_evict_connections(N, ServerReference); + disabled -> + {error, disabled} + end. + +-spec evict_sessions(pos_integer(), node() | [node()]) -> ok_or_error(disabled). 
+evict_sessions(N, Node) when is_atom(Node) -> + evict_sessions(N, [Node]); +evict_sessions(N, Nodes) when is_list(Nodes) andalso length(Nodes) > 0 -> + evict_sessions(N, Nodes, any). + +-spec evict_sessions(pos_integer(), node() | [node()], atom()) -> ok_or_error(disabled). +evict_sessions(N, Node, ConnState) when is_atom(Node) -> + evict_sessions(N, [Node], ConnState); +evict_sessions(N, Nodes, ConnState) when + is_list(Nodes) andalso length(Nodes) > 0 +-> + case enable_status() of + {enabled, _Kind, _ServerReference} -> + ok = do_evict_sessions(N, Nodes, ConnState); + disabled -> + {error, disabled} + end. + +%%-------------------------------------------------------------------- +%% gen_server callbacks +%%-------------------------------------------------------------------- + +init([]) -> + _ = persistent_term:erase(?MODULE), + {ok, #{}}. + +%% enable +handle_call({enable, Kind, ServerReference}, _From, St) -> + Reply = + case enable_status() of + disabled -> + ok = persistent_term:put(?MODULE, {enabled, Kind, ServerReference}); + {enabled, Kind, _ServerReference} -> + ok = persistent_term:put(?MODULE, {enabled, Kind, ServerReference}); + {enabled, _OtherKind, _ServerReference} -> + {error, eviction_agent_busy} + end, + {reply, Reply, St}; +%% disable +handle_call({disable, Kind}, _From, St) -> + Reply = + case enable_status() of + disabled -> + {error, disabled}; + {enabled, Kind, _ServerReference} -> + _ = persistent_term:erase(?MODULE), + ok; + {enabled, _OtherKind, _ServerReference} -> + {error, eviction_agent_busy} + end, + {reply, Reply, St}; +handle_call(Msg, _From, St) -> + ?SLOG(warning, #{msg => "unknown_call", call => Msg, state => St}), + {reply, {error, unknown_call}, St}. + +handle_info(Msg, St) -> + ?SLOG(warning, #{msg => "unknown_msg", info => Msg, state => St}), + {noreply, St}. + +handle_cast(Msg, St) -> + ?SLOG(warning, #{msg => "unknown_cast", cast => Msg, state => St}), + {noreply, St}. + +code_change(_Vsn, State, _Extra) -> + {ok, State}. 
+ +%%-------------------------------------------------------------------- +%% Hook callbacks +%%-------------------------------------------------------------------- + +on_connect(_ConnInfo, _Props) -> + case enable_status() of + {enabled, _Kind, _ServerReference} -> + {stop, {error, ?RC_USE_ANOTHER_SERVER}}; + disabled -> + ignore + end. + +on_connack( + #{proto_name := <<"MQTT">>, proto_ver := ?MQTT_PROTO_V5}, + use_another_server, + Props +) -> + case enable_status() of + {enabled, _Kind, ServerReference} -> + {ok, Props#{'Server-Reference' => ServerReference}}; + disabled -> + {ok, Props} + end; +on_connack(_ClientInfo, _Reason, Props) -> + {ok, Props}. + +%%-------------------------------------------------------------------- +%% Hook funcs +%%-------------------------------------------------------------------- + +hook() -> + ?tp(debug, eviction_agent_hook, #{}), + ok = emqx_hooks:put('client.connack', {?MODULE, on_connack, []}, ?HP_NODE_REBALANCE), + ok = emqx_hooks:put('client.connect', {?MODULE, on_connect, []}, ?HP_NODE_REBALANCE). + +unhook() -> + ?tp(debug, eviction_agent_unhook, #{}), + ok = emqx_hooks:del('client.connect', {?MODULE, on_connect}), + ok = emqx_hooks:del('client.connack', {?MODULE, on_connack}). + +enable_status() -> + persistent_term:get(?MODULE, disabled). + +% connection management +stats() -> + #{ + connections => connection_count(), + sessions => session_count() + }. + +connection_table() -> + emqx_cm:live_connection_table(?CONN_MODULES). + +connection_count() -> + table_count(connection_table()). + +channel_with_session_table(any) -> + qlc:q([ + {ClientId, ConnInfo, ClientInfo} + || {ClientId, _, ConnInfo, ClientInfo} <- + emqx_cm:channel_with_session_table(?CONN_MODULES) + ]); +channel_with_session_table(RequiredConnState) -> + qlc:q([ + {ClientId, ConnInfo, ClientInfo} + || {ClientId, ConnState, ConnInfo, ClientInfo} <- + emqx_cm:channel_with_session_table(?CONN_MODULES), + RequiredConnState =:= ConnState + ]). 
+ +session_count() -> + session_count(any). + +session_count(ConnState) -> + table_count(channel_with_session_table(ConnState)). + +table_count(QH) -> + qlc:fold(fun(_, Acc) -> Acc + 1 end, 0, QH). + +take_connections(N) -> + ChanQH = qlc:q([ChanPid || {_ClientId, ChanPid} <- connection_table()]), + ChanPidCursor = qlc:cursor(ChanQH), + ChanPids = qlc:next_answers(ChanPidCursor, N), + ok = qlc:delete_cursor(ChanPidCursor), + ChanPids. + +take_channel_with_sessions(N, ConnState) -> + ChanPidCursor = qlc:cursor(channel_with_session_table(ConnState)), + Channels = qlc:next_answers(ChanPidCursor, N), + ok = qlc:delete_cursor(ChanPidCursor), + Channels. + +do_evict_connections(N, ServerReference) when N > 0 -> + ChanPids = take_connections(N), + ok = lists:foreach( + fun(ChanPid) -> + disconnect_channel(ChanPid, ServerReference) + end, + ChanPids + ). + +do_evict_sessions(N, Nodes, ConnState) when N > 0 -> + Channels = take_channel_with_sessions(N, ConnState), + ok = lists:foreach( + fun({ClientId, ConnInfo, ClientInfo}) -> + evict_session_channel(Nodes, ClientId, ConnInfo, ClientInfo) + end, + Channels + ). + +evict_session_channel(Nodes, ClientId, ConnInfo, ClientInfo) -> + Node = select_random(Nodes), + ?SLOG( + info, + #{ + msg => "evict_session_channel", + client_id => ClientId, + node => Node, + conn_info => ConnInfo, + client_info => ClientInfo + } + ), + case emqx_eviction_agent_proto_v1:evict_session_channel(Node, ClientId, ConnInfo, ClientInfo) of + {badrpc, Reason} -> + ?SLOG( + error, + #{ + msg => "evict_session_channel_rpc_error", + client_id => ClientId, + node => Node, + reason => Reason + } + ), + {error, Reason}; + {error, Reason} = Error -> + ?SLOG( + error, + #{ + msg => "evict_session_channel_error", + client_id => ClientId, + node => Node, + reason => Reason + } + ), + Error; + Res -> + Res + end. + +-spec evict_session_channel( + emqx_types:clientid(), + emqx_types:conninfo(), + emqx_types:clientinfo() +) -> supervisor:startchild_ret(). 
+evict_session_channel(ClientId, ConnInfo, ClientInfo) -> + ?SLOG(info, #{ + msg => "evict_session_channel", + client_id => ClientId, + conn_info => ConnInfo, + client_info => ClientInfo + }), + Result = emqx_eviction_agent_channel:start_supervised( + #{ + conninfo => ConnInfo, + clientinfo => ClientInfo + } + ), + ?SLOG( + info, + #{ + msg => "evict_session_channel_result", + client_id => ClientId, + result => Result + } + ), + Result. + +disconnect_channel(ChanPid, ServerReference) -> + ChanPid ! + {disconnect, ?RC_USE_ANOTHER_SERVER, use_another_server, #{ + 'Server-Reference' => ServerReference + }}. + +select_random(List) when length(List) > 0 -> + lists:nth(rand:uniform(length(List)), List). diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_api.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_api.erl new file mode 100644 index 000000000..d8c1d7645 --- /dev/null +++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_api.erl @@ -0,0 +1,85 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_eviction_agent_api). + +-behaviour(minirest_api). + +-include_lib("typerefl/include/types.hrl"). +-include_lib("hocon/include/hoconsc.hrl"). +-include_lib("emqx/include/logger.hrl"). + +%% Swagger specs from hocon schema +-export([ + api_spec/0, + paths/0, + schema/1, + namespace/0 +]). + +-export([ + fields/1, + roots/0 +]). + +%% API callbacks +-export([ + '/node_eviction/status'/2 +]). + +-import(hoconsc, [mk/2, ref/1, ref/2]). + +namespace() -> "node_eviction". + +api_spec() -> + emqx_dashboard_swagger:spec(?MODULE, #{check_schema => true}). + +paths() -> + [ + "/node_eviction/status" + ]. 
+
+%% Schema of the status endpoint: a single GET operation whose 200 response
+%% is described by schema_status/0.
+schema("/node_eviction/status") ->
+    #{
+        'operationId' => '/node_eviction/status',
+        get => #{
+            tags => [<<"node_eviction">>],
+            summary => <<"Get node eviction status">>,
+            description => ?DESC("node_eviction_status_get"),
+            responses => #{
+                200 => schema_status()
+            }
+        }
+    }.
+
+%% Handler for the status endpoint. Always replies 200; the body carries
+%% `status' and, when the agent is enabled, the `stats' it reported.
+'/node_eviction/status'(_Bindings, _Params) ->
+    case emqx_eviction_agent:status() of
+        disabled ->
+            {200, #{status => disabled}};
+        {enabled, Stats} ->
+            {200, #{
+                status => enabled,
+                stats => Stats
+            }}
+    end.
+
+%% The response is either the `status_enabled' or the `status_disabled' struct.
+schema_status() ->
+    mk(hoconsc:union([ref(status_enabled), ref(status_disabled)]), #{}).
+
+roots() -> [].
+
+%% Field definitions for the structs referenced by schema_status/0.
+fields(status_enabled) ->
+    [
+        {status, mk(enabled, #{default => enabled})},
+        {stats, ref(stats)}
+    ];
+fields(stats) ->
+    [
+        {connections, mk(integer(), #{})},
+        {sessions, mk(integer(), #{})}
+    ];
+fields(status_disabled) ->
+    [
+        {status, mk(disabled, #{default => disabled})}
+    ].
diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_app.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_app.erl
new file mode 100644
index 000000000..63af59b09
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_app.erl
@@ -0,0 +1,24 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_app).
+
+-behaviour(application).
+
+-emqx_plugin(?MODULE).
+
+-export([
+    start/2,
+    stop/1
+]).
+
+%% Application start: install the agent hook, start the top supervisor,
+%% then register the CLI command. Returns the supervisor pid.
+start(_Type, _Args) ->
+    ok = emqx_eviction_agent:hook(),
+    {ok, Sup} = emqx_eviction_agent_sup:start_link(),
+    ok = emqx_eviction_agent_cli:load(),
+    {ok, Sup}.
+
+%% Application stop: remove the hook and unregister the CLI command.
+stop(_State) ->
+    ok = emqx_eviction_agent:unhook(),
+    ok = emqx_eviction_agent_cli:unload().
diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl
new file mode 100644
index 000000000..a42033c0f
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl
@@ -0,0 +1,368 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+%% MQTT Channel
+-module(emqx_eviction_agent_channel).
+
+-include_lib("emqx/include/emqx.hrl").
+-include_lib("emqx/include/emqx_channel.hrl").
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("emqx/include/logger.hrl").
+-include_lib("emqx/include/types.hrl").
+
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+-logger_header("[Evicted Channel]").
+
+-export([
+    start_link/1,
+    start_supervised/1,
+    call/2,
+    call/3,
+    cast/2,
+    stop/1
+]).
+
+-export([
+    init/1,
+    handle_call/3,
+    handle_cast/2,
+    handle_info/2,
+    terminate/2,
+    code_change/3
+]).
+
+-import(
+    emqx_misc,
+    [
+        maybe_apply/2
+    ]
+).
+
+%% Options required to start a channel: the connection info and client info
+%% of the session being evicted.
+-type opts() :: #{
+    conninfo := emqx_types:conninfo(),
+    clientinfo := emqx_types:clientinfo()
+}.
+
+%%--------------------------------------------------------------------
+%% API
+%%--------------------------------------------------------------------
+
+%% Start a channel process as a temporary child of
+%% `emqx_eviction_agent_conn_sup' and return the result of
+%% supervisor:start_child/2.
+%%
+%% The child id must be a valid term that is unique per start, so it is built
+%% from the client id and a fresh unique integer.
+-spec start_supervised(opts()) -> supervisor:startchild_ret().
+start_supervised(#{clientinfo := #{clientid := ClientId}} = Opts) ->
+    RandomId = integer_to_binary(erlang:unique_integer([positive])),
+    ClientIdBin = bin_clientid(ClientId),
+    %% NOTE(review): the "-" separator is an assumption; only uniqueness of
+    %% the id matters to the supervisor. Confirm against the upstream source.
+    Id = <<ClientIdBin/binary, "-", RandomId/binary>>,
+    ChildSpec = #{
+        id => Id,
+        start => {?MODULE, start_link, [Opts]},
+        restart => temporary,
+        shutdown => 5000,
+        type => worker,
+        modules => [?MODULE]
+    },
+    supervisor:start_child(
+        emqx_eviction_agent_conn_sup,
+        ChildSpec
+    ).
+
+%% Start the channel gen_server linked to the caller; `Opts' is passed to
+%% init/1 wrapped in a single-element list.
+-spec start_link(opts()) -> startlink_ret().
+start_link(Opts) ->
+    gen_server:start_link(?MODULE, [Opts], []).
+
+%% Thin wrapper around gen_server:cast/2.
+-spec cast(pid(), term()) -> ok.
+cast(Pid, Req) ->
+    gen_server:cast(Pid, Req).
+
+%% Synchronous call with an infinite timeout.
+-spec call(pid(), term()) -> term().
+call(Pid, Req) ->
+    call(Pid, Req, infinity).
+
+%% Thin wrapper around gen_server:call/3.
+-spec call(pid(), term(), timeout()) -> term().
+call(Pid, Req, Timeout) ->
+    gen_server:call(Pid, Req, Timeout).
+
+%% Thin wrapper around gen_server:stop/1.
+-spec stop(pid()) -> ok.
+stop(Pid) ->
+    gen_server:stop(Pid).
+
+%%--------------------------------------------------------------------
+%% gen_server API
+%%--------------------------------------------------------------------
+
+%% Initialise the channel: trap exits, normalise client/conn info, open the
+%% existing session and arm the expiry timer. On success the channel is marked
+%% as disconnected in emqx_cm and the process hibernates. Any error from
+%% open_session/2 or set_expiry_timer/1 stops the process with that reason.
+init([#{conninfo := OldConnInfo, clientinfo := #{clientid := ClientId} = OldClientInfo}]) ->
+    process_flag(trap_exit, true),
+    ClientInfo = clientinfo(OldClientInfo),
+    ConnInfo = conninfo(OldConnInfo),
+    case open_session(ConnInfo, ClientInfo) of
+        {ok, Channel0} ->
+            case set_expiry_timer(Channel0) of
+                {ok, Channel1} ->
+                    ?SLOG(
+                        info,
+                        #{
+                            msg => "channel_initialized",
+                            clientid => ClientId,
+                            node => node()
+                        }
+                    ),
+                    ok = emqx_cm:mark_channel_disconnected(self()),
+                    {ok, Channel1, hibernate};
+                {error, Reason} ->
+                    {stop, Reason}
+            end;
+        {error, Reason} ->
+            {stop, Reason}
+    end.
+
+%% Synchronous requests:
+%%   kick / discard       - reply ok and stop with reason kicked / discarded
+%%   {takeover, 'begin'}  - reply with the session and set the takeover flag
+%%   {takeover, 'end'}    - hand the session over, reply with all pending
+%%                          deliveries and stop normally
+%%   list_acl_cache       - always replies with an empty list
+%%   {quota, _}           - always replies ok
+%%   anything else        - logged and answered with `ignored'
+handle_call(kick, _From, Channel) ->
+    {stop, kicked, ok, Channel};
+handle_call(discard, _From, Channel) ->
+    {stop, discarded, ok, Channel};
+handle_call({takeover, 'begin'}, _From, #{session := Session} = Channel) ->
+    {reply, Session, Channel#{takeover => true}};
+handle_call(
+    {takeover, 'end'},
+    _From,
+    #{
+        session := Session,
+        clientinfo := #{clientid := ClientId},
+        pendings := Pendings
+    } = Channel
+) ->
+    ok = emqx_session:takeover(Session),
+    %% TODO: Should not drain deliver here (side effect)
+    Delivers = emqx_misc:drain_deliver(),
+    AllPendings = lists:append(Delivers, Pendings),
+    ?tp(
+        debug,
+        emqx_channel_takeover_end,
+        #{clientid => ClientId}
+    ),
+    {stop, normal, AllPendings, Channel};
+handle_call(list_acl_cache, _From, Channel) ->
+    {reply, [], Channel};
+handle_call({quota, _Policy}, _From, Channel) ->
+    {reply, ok, Channel};
+handle_call(Req, _From, Channel) ->
+    ?SLOG(
+        error,
+        #{
+            msg => "unexpected_call",
+            req => Req
+        }
+    ),
+    {reply, ignored, Channel}.
+
+%% Asynchronous messages: a `deliver' is batched with any other deliveries
+%% drained by emqx_misc:drain_deliver/0 and handed to handle_deliver/2;
+%% `expire_session' stops the process with reason `expired'; anything else
+%% is logged and ignored.
+handle_info(Deliver = {deliver, _Topic, _Msg}, Channel) ->
+    Delivers = [Deliver | emqx_misc:drain_deliver()],
+    {noreply, handle_deliver(Delivers, Channel)};
+handle_info(expire_session, Channel) ->
+    {stop, expired, Channel};
+handle_info(Info, Channel) ->
+    ?SLOG(
+        error,
+        #{
+            msg => "unexpected_info",
+            info => Info
+        }
+    ),
+    {noreply, Channel}.
+
+%% No casts are expected; every cast is logged and ignored.
+handle_cast(Msg, Channel) ->
+    ?SLOG(error, #{msg => "unexpected_cast", cast => Msg}),
+    {noreply, Channel}.
+
+%% Cancel the expiry timer, persist the session only when the termination
+%% reason is `expired', then terminate the session.
+terminate(Reason, #{conninfo := ConnInfo, clientinfo := ClientInfo, session := Session} = Channel) ->
+    ok = cancel_expiry_timer(Channel),
+    (Reason =:= expired) andalso emqx_persistent_session:persist(ClientInfo, ConnInfo, Session),
+    emqx_session:terminate(ClientInfo, Reason, Session).
+
+%% No state migration is performed on code upgrade.
+code_change(_OldVsn, Channel, _Extra) ->
+    {ok, Channel}.
+
+%%--------------------------------------------------------------------
+%% Internal functions
+%%--------------------------------------------------------------------
+
+%% Process a batch of deliveries.
+%% While a takeover is in progress the filtered deliveries are appended to
+%% `pendings'; otherwise they are enqueued into the session, which is then
+%% persisted and stored back into the channel.
+%% TODO: sync with emqx_channel
+handle_deliver(
+    Delivers,
+    #{
+        takeover := true,
+        pendings := Pendings,
+        session := Session,
+        clientinfo := #{clientid := ClientId} = ClientInfo
+    } = Channel
+) ->
+    %% NOTE: Order is important here. While the takeover is in
+    %% progress, the session cannot enqueue messages, since it already
+    %% passed on the queue to the new connection in the session state.
+    NPendings = lists:append(
+        Pendings,
+        emqx_session:ignore_local(ClientInfo, emqx_channel:maybe_nack(Delivers), ClientId, Session)
+    ),
+    Channel#{pendings => NPendings};
+handle_deliver(
+    Delivers,
+    #{
+        takeover := false,
+        session := Session,
+        clientinfo := #{clientid := ClientId} = ClientInfo
+    } = Channel
+) ->
+    Delivers1 = emqx_channel:maybe_nack(Delivers),
+    Delivers2 = emqx_session:ignore_local(ClientInfo, Delivers1, ClientId, Session),
+    NSession = emqx_session:enqueue(ClientInfo, Delivers2, Session),
+    NChannel = persist(NSession, Channel),
+    %% We consider queued/dropped messages as delivered since they are now in the session state.
+    emqx_channel:maybe_mark_as_delivered(Session, Delivers),
+    NChannel.
+
+%% Cancel the expiry timer if one is set (i.e. `expiry_timer' holds a
+%% reference); otherwise do nothing.
+cancel_expiry_timer(#{expiry_timer := TRef}) when is_reference(TRef) ->
+    _ = erlang:cancel_timer(TRef),
+    ok;
+cancel_expiry_timer(_) ->
+    ok.
+
+%% Arm the session expiry timer from `expiry_interval' in the conninfo:
+%%   ?UINT_MAX - no timer is started
+%%   I > 0     - `expire_session' is sent to self() after timer:seconds(I)
+%%   otherwise - {error, should_be_expired}
+set_expiry_timer(#{conninfo := ConnInfo} = Channel) ->
+    case maps:get(expiry_interval, ConnInfo) of
+        ?UINT_MAX ->
+            {ok, Channel};
+        I when I > 0 ->
+            Timer = erlang:send_after(timer:seconds(I), self(), expire_session),
+            {ok, Channel#{expiry_timer => Timer}};
+        _ ->
+            {error, should_be_expired}
+    end.
+
+%% Open the client's existing session through emqx_cm (first argument
+%% `false').
+%% Returns {error, no_session} when no session is present. When one is
+%% present, the pending deliveries returned by emqx_cm plus any drained from
+%% the mailbox are enqueued into the session, the channel info is registered
+%% in emqx_cm, and {ok, Channel} is returned. Errors from emqx_cm are logged
+%% and returned unchanged.
+open_session(ConnInfo, #{clientid := ClientId} = ClientInfo) ->
+    Channel = channel(ConnInfo, ClientInfo),
+    case emqx_cm:open_session(false, ClientInfo, ConnInfo) of
+        {ok, #{present := false}} ->
+            ?SLOG(
+                info,
+                #{
+                    msg => "no_session",
+                    clientid => ClientId,
+                    node => node()
+                }
+            ),
+            {error, no_session};
+        {ok, #{session := Session, present := true, pendings := Pendings0}} ->
+            ?SLOG(
+                info,
+                #{
+                    msg => "session_opened",
+                    clientid => ClientId,
+                    node => node()
+                }
+            ),
+            Pendings1 = lists:usort(lists:append(Pendings0, emqx_misc:drain_deliver())),
+            NSession = emqx_session:enqueue(
+                ClientInfo,
+                emqx_session:ignore_local(
+                    ClientInfo,
+                    emqx_channel:maybe_nack(Pendings1),
+                    ClientId,
+                    Session
+                ),
+                Session
+            ),
+            NChannel = Channel#{session => NSession},
+            ok = emqx_cm:insert_channel_info(ClientId, info(NChannel), stats(NChannel)),
+            ?SLOG(
+                info,
+                #{
+                    msg => "channel_info_updated",
+                    clientid => ClientId,
+                    node => node()
+                }
+            ),
+            {ok, NChannel};
+        {error, Reason} = Error ->
+            ?SLOG(
+                error,
+                #{
+                    msg => "session_open_failed",
+                    clientid => ClientId,
+                    node => node(),
+                    reason => Reason
+                }
+            ),
+            Error
+    end.
+
+%% Build this channel's conninfo from the old one: keep only the listed keys,
+%% set `conn_mod' to this module and `connected' to false, and default
+%% `disconnected_at' to the current system time in milliseconds when absent.
+conninfo(OldConnInfo) ->
+    DisconnectedAt = maps:get(disconnected_at, OldConnInfo, erlang:system_time(millisecond)),
+    ConnInfo0 = maps:with(
+        [
+            socktype,
+            sockname,
+            peername,
+            peercert,
+            clientid,
+            clean_start,
+            receive_maximum,
+            expiry_interval,
+            connected_at,
+            disconnected_at,
+            keepalive
+        ],
+        OldConnInfo
+    ),
+    ConnInfo0#{
+        conn_mod => ?MODULE,
+        connected => false,
+        disconnected_at => DisconnectedAt
+    }.
+
+%% Keep only the listed keys of the old clientinfo.
+clientinfo(OldClientInfo) ->
+    maps:with(
+        [
+            zone,
+            protocol,
+            peerhost,
+            sockport,
+            clientid,
+            username,
+            is_bridge,
+            is_superuser,
+            mountpoint
+        ],
+        OldClientInfo
+    ).
+
+%% Initial channel state; the `session' key is added by open_session/2.
+channel(ConnInfo, ClientInfo) ->
+    #{
+        conninfo => ConnInfo,
+        clientinfo => ClientInfo,
+        expiry_timer => undefined,
+        takeover => false,
+        resuming => false,
+        pendings => []
+    }.
+
+%% Persist the session and store the value returned by the persistence
+%% layer back into the channel.
+persist(Session, #{clientinfo := ClientInfo, conninfo := ConnInfo} = Channel) ->
+    Channel#{session => emqx_persistent_session:persist(ClientInfo, ConnInfo, Session)}.
+
+%% Channel info map as registered in emqx_cm. Missing keys are reported as
+%% `undefined'; the connection state is always `disconnected'.
+info(Channel) ->
+    Lookup = fun(Key) -> maps:get(Key, Channel, undefined) end,
+    #{
+        conninfo => Lookup(conninfo),
+        clientinfo => Lookup(clientinfo),
+        session => maybe_apply(fun emqx_session:info/1, Lookup(session)),
+        conn_state => disconnected
+    }.
+
+%% Session stats followed by the ?CHANNEL_METRICS counters of this process.
+stats(#{session := Session}) ->
+    SessionStats = emqx_session:stats(Session),
+    ChannelCounters = emqx_pd:get_counters(?CHANNEL_METRICS),
+    lists:append(SessionStats, ChannelCounters).
+
+%% Client id as a binary; atoms are converted, binaries pass through.
+bin_clientid(ClientId) when is_binary(ClientId) ->
+    ClientId;
+bin_clientid(ClientId) when is_atom(ClientId) ->
+    atom_to_binary(ClientId).
diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl
new file mode 100644
index 000000000..3ae9365e3
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl
@@ -0,0 +1,30 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_cli).
+
+%% APIs
+-export([
+    load/0,
+    unload/0,
+    cli/1
+]).
+
+%% Register the `eviction' command with emqx_ctl.
+load() ->
+    emqx_ctl:register_command(eviction, {?MODULE, cli}, []).
+
+%% Unregister the `eviction' command.
+unload() ->
+    emqx_ctl:unregister_command(eviction).
+
+%% `status' prints whether eviction is enabled; any other arguments print
+%% the usage text.
+cli(["status"]) ->
+    Line =
+        case emqx_eviction_agent:status() of
+            disabled -> "Eviction status: disabled~n";
+            {enabled, _Stats} -> "Eviction status: enabled~n"
+        end,
+    emqx_ctl:print(Line);
+cli(_) ->
+    Usage = [{"eviction status", "Get current node eviction status"}],
+    emqx_ctl:usage(Usage).
diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl
new file mode 100644
index 000000000..195555bd3
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl
@@ -0,0 +1,21 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_conn_sup).
+
+-behaviour(supervisor).
+
+-export([start_link/0]).
+
+-export([init/1]).
+
+%% Start the supervisor, registered locally under the module name.
+start_link() ->
+    supervisor:start_link({local, ?MODULE}, ?MODULE, []).
+
+%% Starts with no children; children are added at runtime through
+%% supervisor:start_child/2.
+init([]) ->
+    SupFlags = #{strategy => one_for_one, intensity => 10, period => 3600},
+    {ok, {SupFlags, []}}.
diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl
new file mode 100644
index 000000000..8b774ef85
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl
@@ -0,0 +1,34 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_sup).
+
+-behaviour(supervisor).
+
+-export([start_link/0]).
+
+-export([init/1]).
+
+%% Start the top-level supervisor, registered locally under the module name.
+start_link() ->
+    supervisor:start_link({local, ?MODULE}, ?MODULE, []).
+
+%% Two permanent children, in start order: the agent worker, then the
+%% connection supervisor.
+init([]) ->
+    SupFlags = #{strategy => one_for_one, intensity => 10, period => 3600},
+    Agent = child_spec(worker, emqx_eviction_agent, []),
+    ConnSup = child_spec(supervisor, emqx_eviction_agent_conn_sup, []),
+    {ok, {SupFlags, [Agent, ConnSup]}}.
+
+%% Permanent child spec for `Mod', started through Mod:start_link(Args...).
+child_spec(Type, Mod, Args) ->
+    StartMFA = {Mod, start_link, Args},
+    #{
+        id => Mod,
+        start => StartMFA,
+        restart => permanent,
+        shutdown => 5000,
+        type => Type,
+        modules => [Mod]
+    }.
diff --git a/lib-ee/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v1.erl b/lib-ee/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v1.erl
new file mode 100644
index 000000000..f4c958150
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v1.erl
@@ -0,0 +1,27 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_proto_v1).
+
+-behaviour(emqx_bpapi).
+
+-export([
+    introduced_in/0,
+
+    evict_session_channel/4
+]).
+
+-include_lib("emqx/include/bpapi.hrl").
+
+%% Release in which this protocol version was introduced.
+introduced_in() ->
+    "5.0.22".
+
+%% Call emqx_eviction_agent:evict_session_channel/3 on `Node' through
+%% rpc:call/4 and return its result.
+%% NOTE(review): the callee's spec returns supervisor:startchild_ret(), while
+%% this spec names startchild_err() - confirm which one is intended.
+-spec evict_session_channel(
+    node(),
+    emqx_types:clientid(),
+    emqx_types:conninfo(),
+    emqx_types:clientinfo()
+) -> supervisor:startchild_err() | emqx_rpc:badrpc().
+evict_session_channel(Node, ClientId, ConnInfo, ClientInfo) ->
+    rpc:call(Node, emqx_eviction_agent, evict_session_channel, [ClientId, ConnInfo, ClientInfo]).
diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl
new file mode 100644
index 000000000..0574ccec3
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl
@@ -0,0 +1,403 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("emqx/include/asserts.hrl").
+
+-import(
+    emqx_eviction_agent_test_helpers,
+    [emqtt_connect/0, emqtt_connect/1, emqtt_connect/2]
+).
+
+%% Assert that the value of `Code' matches the regular expression `Printed'.
+-define(assertPrinted(Printed, Code),
+    ?assertMatch(
+        {match, _},
+        re:run(Code, Printed)
+    )
+).
+
+all() ->
+    emqx_common_test_helpers:all(?MODULE).
+
+init_per_suite(Config) ->
+    emqx_common_test_helpers:start_apps([emqx_eviction_agent]),
+    Config.
+
+end_per_suite(_Config) ->
+    emqx_common_test_helpers:stop_apps([emqx_eviction_agent]).
+
+%% Every test case starts with the agent disabled for `test_eviction' and a
+%% fresh snabbkaffe trace.
+init_per_testcase(Case, Config) ->
+    _ = emqx_eviction_agent:disable(test_eviction),
+    ok = snabbkaffe:start_trace(),
+    start_slave(Case, Config).
+
+%% Only t_explicit_session_takeover starts a cluster; its nodes are stored
+%% under `evacuate_nodes' in the config.
+start_slave(t_explicit_session_takeover, Config) ->
+    ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster(
+        [{evacuate_test1, 2883}, {evacuate_test2, 3883}],
+        [emqx_eviction_agent]
+    ),
+    [{evacuate_nodes, ClusterNodes} | Config];
+start_slave(_Case, Config) ->
+    Config.
+
+end_per_testcase(TestCase, Config) ->
+    emqx_eviction_agent:disable(test_eviction),
+    ok = snabbkaffe:stop(),
+    stop_slave(TestCase, Config).
+
+%% Stop the cluster started by start_slave/2, if any.
+stop_slave(t_explicit_session_takeover, Config) ->
+    emqx_eviction_agent_test_helpers:stop_cluster(
+        ?config(evacuate_nodes, Config),
+        [emqx_eviction_agent]
+    );
+stop_slave(_Case, _Config) ->
+    ok.
+
+%%--------------------------------------------------------------------
+%% Tests
+%%--------------------------------------------------------------------
+
+%% Enable/disable semantics: enabling or disabling under a different kind
+%% (`bar') is rejected with eviction_agent_busy, re-enabling under the same
+%% kind succeeds, new connections are refused with use_another_server while
+%% enabled, and disabling twice yields {error, disabled}.
+t_enable_disable(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    ?assertMatch(
+        disabled,
+        emqx_eviction_agent:status()
+    ),
+
+    {ok, C0} = emqtt_connect(),
+    ok = emqtt:disconnect(C0),
+
+    ok = emqx_eviction_agent:enable(test_eviction, undefined),
+
+    ?assertMatch(
+        {error, eviction_agent_busy},
+        emqx_eviction_agent:enable(bar, undefined)
+    ),
+
+    ?assertMatch(
+        ok,
+        emqx_eviction_agent:enable(test_eviction, <<"srv">>)
+    ),
+
+    ?assertMatch(
+        {enabled, #{}},
+        emqx_eviction_agent:status()
+    ),
+
+    ?assertMatch(
+        {error, {use_another_server, #{}}},
+        emqtt_connect()
+    ),
+
+    ?assertMatch(
+        {error, eviction_agent_busy},
+        emqx_eviction_agent:disable(bar)
+    ),
+
+    ?assertMatch(
+        ok,
+        emqx_eviction_agent:disable(test_eviction)
+    ),
+
+    ?assertMatch(
+        {error, disabled},
+        emqx_eviction_agent:disable(test_eviction)
+    ),
+
+    ?assertMatch(
+        disabled,
+        emqx_eviction_agent:status()
+    ),
+
+    {ok, C1} = emqtt_connect(),
+    ok = emqtt:disconnect(C1).
+
+%% evict_connections/1 is refused while the agent is disabled; once enabled,
+%% evicting the single connection brings the reported connection count from
+%% 1 down to 0.
+t_evict_connections_status(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    {ok, _C} = emqtt_connect(),
+
+    {error, disabled} = emqx_eviction_agent:evict_connections(1),
+
+    ok = emqx_eviction_agent:enable(test_eviction, undefined),
+
+    ?assertMatch(
+        {enabled, #{connections := 1, sessions := _}},
+        emqx_eviction_agent:status()
+    ),
+
+    ok = emqx_eviction_agent:evict_connections(1),
+
+    %% give the evicted channel time to go away before re-reading the stats
+    ct:sleep(100),
+
+    ?assertMatch(
+        {enabled, #{connections := 0, sessions := _}},
+        emqx_eviction_agent:status()
+    ),
+
+    ok = emqx_eviction_agent:disable(test_eviction).
+
+%% Two-node scenario: a client with a persistent session connects to Node1
+%% and is evicted; its session is then evicted first to Node1 itself and
+%% then to Node2. Messages published in between must be received once the
+%% client reconnects to Node1.
+t_explicit_session_takeover(Config) ->
+    _ = erlang:process_flag(trap_exit, true),
+    ok = restart_emqx(),
+
+    [{Node1, Port1}, {Node2, _Port2}] = ?config(evacuate_nodes, Config),
+
+    {ok, C0} = emqtt_connect([
+        {clientid, <<"client_with_session">>},
+        {clean_start, false},
+        {port, Port1}
+    ]),
+    {ok, _, _} = emqtt:subscribe(C0, <<"t1">>),
+
+    ok = rpc:call(Node1, emqx_eviction_agent, enable, [test_eviction, undefined]),
+
+    ?assertEqual(
+        1,
+        rpc:call(Node1, emqx_eviction_agent, connection_count, [])
+    ),
+
+    [ChanPid] = rpc:call(Node1, emqx_cm, lookup_channels, [<<"client_with_session">>]),
+
+    ?assertWaitEvent(
+        begin
+            ok = rpc:call(Node1, emqx_eviction_agent, evict_connections, [1]),
+            receive
+                {'EXIT', C0, {disconnected, ?RC_USE_ANOTHER_SERVER, _}} -> ok
+            after 1000 ->
+                ?assert(false, "Connection not evicted")
+            end
+        end,
+        #{?snk_kind := emqx_cm_connected_client_count_dec, chan_pid := ChanPid},
+        2000
+    ),
+
+    ?assertEqual(
+        0,
+        rpc:call(Node1, emqx_eviction_agent, connection_count, [])
+    ),
+
+    ?assertEqual(
+        1,
+        rpc:call(Node1, emqx_eviction_agent, session_count, [])
+    ),
+
+    %% First, evacuate to the same node
+
+    ?assertWaitEvent(
+        rpc:call(Node1, emqx_eviction_agent, evict_sessions, [1, Node1]),
+        #{?snk_kind := emqx_channel_takeover_end, clientid := <<"client_with_session">>},
+        1000
+    ),
+
+    ok = rpc:call(Node1, emqx_eviction_agent, disable, [test_eviction]),
+
+    {ok, C1} = emqtt_connect([{port, Port1}]),
+    emqtt:publish(C1, <<"t1">>, <<"MessageToEvictedSession1">>),
+    ok = emqtt:disconnect(C1),
+
+    ok = rpc:call(Node1, emqx_eviction_agent, enable, [test_eviction, undefined]),
+
+    %% Evacuate to another node
+
+    ?assertWaitEvent(
+        rpc:call(Node1, emqx_eviction_agent, evict_sessions, [1, Node2]),
+        #{?snk_kind := emqx_channel_takeover_end, clientid := <<"client_with_session">>},
+        1000
+    ),
+
+    ?assertEqual(
+        0,
+        rpc:call(Node1, emqx_eviction_agent, session_count, [])
+    ),
+
+    ?assertEqual(
+        1,
+        rpc:call(Node2, emqx_eviction_agent, session_count, [])
+    ),
+
+    ok = rpc:call(Node1, emqx_eviction_agent, disable, [test_eviction]),
+
+    %% Session is on Node2, but we connect to Node1
+    {ok, C2} = emqtt_connect([{port, Port1}]),
+    emqtt:publish(C2, <<"t1">>, <<"MessageToEvictedSession2">>),
+    ok = emqtt:disconnect(C2),
+
+    ct:sleep(100),
+
+    %% Session is on Node2, but we connect the subscribed client to Node1
+    %% It should take over the session for the third time and receive
+    %% previously published messages
+    {ok, C3} = emqtt_connect([
+        {clientid, <<"client_with_session">>},
+        {clean_start, false},
+        {port, Port1}
+    ]),
+
+    ok = assert_receive_publish(
+        [
+            #{payload => <<"MessageToEvictedSession1">>, topic => <<"t1">>},
+            #{payload => <<"MessageToEvictedSession2">>, topic => <<"t1">>}
+        ]
+    ),
+    ok = emqtt:disconnect(C3).
+
+%% After the agent child is terminated and restarted under its supervisor,
+%% the status is `disabled' again.
+t_disable_on_restart(_Config) ->
+    ok = emqx_eviction_agent:enable(test_eviction, undefined),
+
+    ok = supervisor:terminate_child(emqx_eviction_agent_sup, emqx_eviction_agent),
+    {ok, _} = supervisor:restart_child(emqx_eviction_agent_sup, emqx_eviction_agent),
+
+    ?assertEqual(
+        disabled,
+        emqx_eviction_agent:status()
+    ).
+
+%% A session evicted to the local node must still show up in the management
+%% API query and in the `clients list' / `clients show' CLI output, and
+%% kicking it must bring the session count down to 0.
+t_session_serialization(_Config) ->
+    _ = erlang:process_flag(trap_exit, true),
+    ok = restart_emqx(),
+
+    {ok, C0} = emqtt_connect(<<"client_with_session">>, false),
+    {ok, _, _} = emqtt:subscribe(C0, <<"t1">>),
+    ok = emqtt:disconnect(C0),
+
+    ok = emqx_eviction_agent:enable(test_eviction, undefined),
+
+    ?assertEqual(
+        1,
+        emqx_eviction_agent:session_count()
+    ),
+
+    %% Evacuate to the same node
+
+    ?assertWaitEvent(
+        emqx_eviction_agent:evict_sessions(1, node()),
+        #{?snk_kind := emqx_channel_takeover_end, clientid := <<"client_with_session">>},
+        1000
+    ),
+
+    ok = emqx_eviction_agent:disable(test_eviction),
+
+    ?assertEqual(
+        1,
+        emqx_eviction_agent:session_count()
+    ),
+
+    ?assertMatch(
+        #{data := [#{clientid := <<"client_with_session">>}]},
+        emqx_mgmt_api:cluster_query(
+            emqx_channel_info,
+            #{},
+            [],
+            fun emqx_mgmt_api_clients:qs2ms/2,
+            fun emqx_mgmt_api_clients:format_channel_info/2
+        )
+    ),
+
+    mock_print(),
+
+    ?assertPrinted(
+        "client_with_session",
+        emqx_mgmt_cli:clients(["list"])
+    ),
+
+    ?assertPrinted(
+        "client_with_session",
+        emqx_mgmt_cli:clients(["show", "client_with_session"])
+    ),
+
+    ?assertWaitEvent(
+        emqx_cm:kick_session(<<"client_with_session">>),
+        #{?snk_kind := emqx_cm_clean_down, client_id := <<"client_with_session">>},
+        1000
+    ),
+
+    ?assertEqual(
+        0,
+        emqx_eviction_agent:session_count()
+    ).
+
+%% Sending a channel the same disconnect message that disconnect_channel
+%% sends must publish the client's will message to a subscriber.
+t_will_msg(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    WillMsg = <<"will_msg">>,
+    WillTopic = <<"will_topic">>,
+    ClientId = <<"client_with_will">>,
+
+    _ = emqtt_connect([
+        {clean_start, false},
+        {clientid, ClientId},
+        {will_payload, WillMsg},
+        {will_topic, WillTopic}
+    ]),
+
+    {ok, C} = emqtt_connect(),
+    {ok, _, _} = emqtt:subscribe(C, WillTopic),
+
+    [ChanPid] = emqx_cm:lookup_channels(ClientId),
+
+    ChanPid !
+        {disconnect, ?RC_USE_ANOTHER_SERVER, use_another_server, #{
+            'Server-Reference' => <<>>
+        }},
+
+    receive
+        {publish, #{
+            payload := WillMsg,
+            topic := WillTopic
+        }} ->
+            ok
+    after 1000 ->
+        ct:fail("Will message not received")
+    end,
+
+    ok = emqtt:disconnect(C).
+
+%%--------------------------------------------------------------------
+%% Helpers
+%%--------------------------------------------------------------------
+
+% sn_connect_and_subscribe(ClientId, Topic) ->
+%     emqx_eviction_agent_test_helpers:sn_connect_and_subscribe(ClientId, Topic).
+
+%% Expect the listed publishes to arrive in order, waiting at most one
+%% second for each of them.
+assert_receive_publish([]) ->
+    ok;
+assert_receive_publish([#{payload := Msg, topic := Topic} | Rest]) ->
+    receive
+        {publish, #{
+            payload := Msg,
+            topic := Topic
+        }} ->
+            assert_receive_publish(Rest)
+    after 1000 ->
+        ?assert(false, "Message `" ++ binary_to_list(Msg) ++ "` is lost")
+    end.
+
+%% Connect with the default client, publish one message and disconnect.
+connect_and_publish(Topic, Message) ->
+    {ok, C} = emqtt_connect(),
+    emqtt:publish(C, Topic, Message),
+    ok = emqtt:disconnect(C).
+
+%% Restart the emqx and emqx_eviction_agent applications; the results of the
+%% individual stop/start calls are ignored.
+restart_emqx() ->
+    _ = application:stop(emqx),
+    _ = application:start(emqx),
+    _ = application:stop(emqx_eviction_agent),
+    _ = application:start(emqx_eviction_agent),
+    ok.
+
+%% Mock emqx_ctl so that print/usage return the formatted text instead of
+%% printing it, which lets ?assertPrinted inspect the output.
+mock_print() ->
+    catch meck:unload(emqx_ctl),
+    meck:new(emqx_ctl, [non_strict, passthrough]),
+    meck:expect(emqx_ctl, print, fun(Arg) -> emqx_ctl:format(Arg, []) end),
+    meck:expect(emqx_ctl, print, fun(Msg, Arg) -> emqx_ctl:format(Msg, Arg) end),
+    meck:expect(emqx_ctl, usage, fun(Usages) -> emqx_ctl:format_usage(Usages) end),
+    meck:expect(emqx_ctl, usage, fun(Cmd, Descr) -> emqx_ctl:format_usage(Cmd, Descr) end).
diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl
new file mode 100644
index 000000000..3fe15e53a
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl
@@ -0,0 +1,69 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_api_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+
+-import(
+    emqx_mgmt_api_test_util,
+    [
+        request_api/2,
+        uri/1
+    ]
+).
+
+all() ->
+    emqx_common_test_helpers:all(?MODULE).
+
+init_per_suite(Config) ->
+    emqx_mgmt_api_test_util:init_suite([emqx_eviction_agent]),
+    Config.
+
+end_per_suite(Config) ->
+    emqx_mgmt_api_test_util:end_suite([emqx_eviction_agent]),
+    Config.
+
+%%--------------------------------------------------------------------
+%% Tests
+%%--------------------------------------------------------------------
+
+%% The status endpoint reports `disabled' before enabling, `enabled' with a
+%% stats object while enabled, and `disabled' again after disabling.
+t_status(_Config) ->
+    ?assertMatch(
+        {ok, #{<<"status">> := <<"disabled">>}},
+        api_get(["node_eviction", "status"])
+    ),
+
+    ok = emqx_eviction_agent:enable(apitest, undefined),
+
+    ?assertMatch(
+        {ok, #{
+            <<"status">> := <<"enabled">>,
+            <<"stats">> := #{}
+        }},
+        api_get(["node_eviction", "status"])
+    ),
+
+    ok = emqx_eviction_agent:disable(apitest),
+
+    ?assertMatch(
+        {ok, #{<<"status">> := <<"disabled">>}},
+        api_get(["node_eviction", "status"])
+    ).
+
+%%--------------------------------------------------------------------
+%% Helpers
+%%--------------------------------------------------------------------
+
+%% GET the given API path and decode the JSON response body into maps;
+%% request errors are returned unchanged.
+api_get(Path) ->
+    case request_api(get, uri(Path)) of
+        {ok, ResponseBody} ->
+            {ok, jiffy:decode(list_to_binary(ResponseBody), [return_maps])};
+        {error, _} = Error ->
+            Error
+    end.
diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl
new file mode 100644
index 000000000..3b7ef6672
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl
@@ -0,0 +1,251 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_channel_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+-include_lib("emqx/include/emqx_mqtt.hrl").
+
+-define(CLIENT_ID, <<"client_with_session">>).
+
+-import(
+    emqx_eviction_agent_test_helpers,
+    [emqtt_connect/0, emqtt_connect/2]
+).
+
+all() ->
+    emqx_common_test_helpers:all(?MODULE).
+
+init_per_suite(Config) ->
+    emqx_common_test_helpers:start_apps([emqx_conf, emqx_eviction_agent]),
+    {ok, _} = emqx:update_config([rpc, port_discovery], manual),
+    Config.
+
+end_per_suite(_Config) ->
+    emqx_common_test_helpers:stop_apps([emqx_eviction_agent, emqx_conf]).
+
+%% t_persistence runs with the persistent session store enabled; all other
+%% test cases use the config untouched.
+init_per_testcase(t_persistence, Config) ->
+    emqx_config:put([persistent_session_store, enabled], true),
+    {ok, _} = emqx_persistent_session_sup:start_link(),
+    emqx_persistent_session:init_db_backend(),
+    ?assert(emqx_persistent_session:is_store_enabled()),
+    Config;
+init_per_testcase(_TestCase, Config) ->
+    Config.
+
+%% Turn the persistent session store back off after t_persistence.
+end_per_testcase(t_persistence, Config) ->
+    emqx_config:put([persistent_session_store, enabled], false),
+    emqx_persistent_session:init_db_backend(),
+    ?assertNot(emqx_persistent_session:is_store_enabled()),
+    Config;
+end_per_testcase(_TestCase, _Config) ->
+    ok.
+
+%%--------------------------------------------------------------------
+%% Tests
+%%--------------------------------------------------------------------
+
+%% Starting a channel for a client that has no session fails with no_session.
+t_start_no_session(_Config) ->
+    Opts = #{
+        clientinfo => #{
+            clientid => ?CLIENT_ID,
+            zone => internal
+        },
+        conninfo => #{
+            clientid => ?CLIENT_ID,
+            receive_maximum => 32,
+            expiry_interval => 10000
+        }
+    },
+    ?assertMatch(
+        {error, {no_session, _}},
+        emqx_eviction_agent_channel:start_supervised(Opts)
+    ).
+
+%% An expiry interval of 0 makes the channel refuse to start with
+%% should_be_expired.
+t_start_no_expire(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+
+    Opts = #{
+        clientinfo => #{
+            clientid => ?CLIENT_ID,
+            zone => internal
+        },
+        conninfo => #{
+            clientid => ?CLIENT_ID,
+            receive_maximum => 32,
+            expiry_interval => 0
+        }
+    },
+    ?assertMatch(
+        {error, {should_be_expired, _}},
+        emqx_eviction_agent_channel:start_supervised(Opts)
+    ).
+
+%% An expiry interval of ?UINT_MAX is accepted and the channel starts.
+t_start_infinite_expire(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+
+    Opts = #{
+        clientinfo => #{
+            clientid => ?CLIENT_ID,
+            zone => internal
+        },
+        conninfo => #{
+            clientid => ?CLIENT_ID,
+            receive_maximum => 32,
+            expiry_interval => ?UINT_MAX
+        }
+    },
+    ?assertMatch(
+        {ok, _},
+        emqx_eviction_agent_channel:start_supervised(Opts)
+    ).
+
+%% The `kick' call is answered with ok.
+t_kick(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+    Opts = evict_session_opts(?CLIENT_ID),
+
+    {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
+
+    ?assertEqual(
+        ok,
+        emqx_eviction_agent_channel:call(Pid, kick)
+    ).
+
+%% The `discard' call is answered with ok.
+t_discard(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+    Opts = evict_session_opts(?CLIENT_ID),
+
+    {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
+
+    ?assertEqual(
+        ok,
+        emqx_eviction_agent_channel:call(Pid, discard)
+    ).
+
+%% stop/1 returns ok for a running channel.
+t_stop(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+    Opts = evict_session_opts(?CLIENT_ID),
+
+    {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
+
+    ?assertEqual(
+        ok,
+        emqx_eviction_agent_channel:stop(Pid)
+    ).
+
+%% Unknown casts and messages are tolerated: the channel still answers the
+%% calls that follow. list_acl_cache and quota get fixed replies, and an
+%% unknown call is answered with `ignored'.
+t_ignored_calls(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+    Opts = evict_session_opts(?CLIENT_ID),
+
+    {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
+
+    ok = emqx_eviction_agent_channel:cast(Pid, unknown),
+    Pid ! unknown,
+
+    ?assertEqual(
+        [],
+        emqx_eviction_agent_channel:call(Pid, list_acl_cache)
+    ),
+
+    ?assertEqual(
+        ok,
+        emqx_eviction_agent_channel:call(Pid, {quota, quota})
+    ),
+
+    ?assertEqual(
+        ignored,
+        emqx_eviction_agent_channel:call(Pid, unknown)
+    ).
+
+%% With an expiry interval of 1 the channel process is gone after 1.5 s.
+t_expire(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+    #{conninfo := ConnInfo} = Opts0 = evict_session_opts(?CLIENT_ID),
+    Opts1 = Opts0#{conninfo => ConnInfo#{expiry_interval => 1}},
+
+    {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts1),
+
+    ct:sleep(1500),
+
+    ?assertNot(is_process_alive(Pid)).
+
+%% Once the eviction channel has been started for the client, the connected
+%% client count drops from 1 to 0.
+t_get_connected_client_count(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+
+    ?assertEqual(
+        1,
+        emqx_cm:get_connected_client_count()
+    ),
+
+    Opts = evict_session_opts(?CLIENT_ID),
+
+    {ok, _} = emqx_eviction_agent_channel:start_supervised(Opts),
+
+    ?assertEqual(
+        0,
+        emqx_cm:get_connected_client_count()
+    ).
+
+%% End-to-end check that a message published while only the eviction
+%% channel holds the session is delivered after the client reconnects:
+%%   1. connect and subscribe to the topic;
+%%   2. start the eviction channel for the same client id;
+%%   3. publish a QoS 1 message from another client;
+%%   4. kick the eviction channel;
+%%   5. reconnect with clean_start = false and expect the message.
+t_persistence(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    Topic = <<"t1">>,
+    Message = <<"message_to_persist">>,
+
+    {ok, C0} = emqtt_connect(?CLIENT_ID, false),
+    {ok, _, _} = emqtt:subscribe(C0, Topic, 0),
+
+    Opts = evict_session_opts(?CLIENT_ID),
+    {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
+
+    {ok, C1} = emqtt_connect(),
+    {ok, _} = emqtt:publish(C1, Topic, Message, 1),
+    ok = emqtt:disconnect(C1),
+
+    %% Kill channel so that the session is only persisted
+    ok = emqx_eviction_agent_channel:call(Pid, kick),
+
+    %% Should restore the session from persistent storage and receive messages
+    {ok, C2} = emqtt_connect(?CLIENT_ID, false),
+
+    receive
+        {publish, #{
+            payload := Message,
+            topic := Topic
+        }} ->
+            ok
+    after 1000 ->
+        ct:fail("message not received")
+    end,
+
+    ok = emqtt:disconnect(C2).
+
+%%--------------------------------------------------------------------
+%% Helpers
+%%--------------------------------------------------------------------
+
+%% Builds start options for the eviction channel from the live channel info
+%% of `ClientId`, keeping only the `conninfo` and `clientinfo` keys.
+evict_session_opts(ClientId) ->
+    maps:with(
+        [conninfo, clientinfo],
+        emqx_cm:get_chan_info(ClientId)
+    ).
diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl
new file mode 100644
index 000000000..4cfb2fff5
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl
@@ -0,0 +1,39 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_cli_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+
+%% Test cases are discovered by the shared helper from this module's exports.
+all() ->
+    emqx_common_test_helpers:all(?MODULE).
+
+%% Starts the eviction agent application for the whole suite.
+init_per_suite(Config) ->
+    emqx_common_test_helpers:start_apps([emqx_eviction_agent]),
+    Config.
+
+%% Disables the eviction agent kind `foo` (enabled by t_status; the result is
+%% ignored) and stops the application.
+end_per_suite(Config) ->
+    _ = emqx_eviction_agent:disable(foo),
+    emqx_common_test_helpers:stop_apps([emqx_eviction_agent]),
+    Config.
+
+%%--------------------------------------------------------------------
+%% Tests
+%%--------------------------------------------------------------------
+
+%% Smoke test for the CLI: every invocation must return `ok`, both for an
+%% unknown command and for `status` before and after the agent is enabled.
+t_status(_Config) ->
+    %% usage
+    ok = emqx_eviction_agent_cli:cli(["foobar"]),
+
+    %% status
+    ok = emqx_eviction_agent_cli:cli(["status"]),
+
+    ok = emqx_eviction_agent:enable(foo, undefined),
+
+    %% status
+    ok = emqx_eviction_agent_cli:cli(["status"]).
diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl
new file mode 100644
index 000000000..8f88ebf97
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl
@@ -0,0 +1,141 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_test_helpers).
+
+-export([
+    emqtt_connect/0,
+    emqtt_connect/1,
+    emqtt_connect/2,
+    emqtt_connect_many/2,
+    stop_many/1,
+
+    emqtt_try_connect/1,
+
+    start_cluster/2,
+    start_cluster/3,
+    stop_cluster/2,
+
+    case_specific_node_name/2,
+    case_specific_node_name/3,
+    concat_atoms/1
+]).
+
+%% Connects with the default client id <<"client1">> and clean_start = true.
+emqtt_connect() ->
+    emqtt_connect(<<"client1">>, true).
+
+%% Connects with the given client id and clean_start flag.
+emqtt_connect(ClientId, CleanStart) ->
+    emqtt_connect([{clientid, ClientId}, {clean_start, CleanStart}]).
+
+%% Starts a linked emqtt client with `Opts` plus MQTT v5 and a
+%% Session-Expiry-Interval of 600, then connects it.
+%% Returns `{ok, ClientPid}` or the `{error, _}` returned by emqtt:connect/1.
+emqtt_connect(Opts) ->
+    {ok, C} = emqtt:start_link(
+        Opts ++
+            [
+                {proto_ver, v5},
+                {properties, #{'Session-Expiry-Interval' => 600}}
+            ]
+    ),
+    case emqtt:connect(C) of
+        {ok, _} -> {ok, C};
+        {error, _} = Error -> Error
+    end.
+
+%% Connects `Count` clients named <<"client-1">> .. <<"client-Count">> to the
+%% listener on `Port` (clean_start = false) and returns their pids.
+%% Crashes with badmatch if any connection attempt fails.
+emqtt_connect_many(Port, Count) ->
+    lists:map(
+        fun(N) ->
+            NBin = integer_to_binary(N),
+            ClientId = <<"client-", NBin/binary>>,
+            {ok, C} = emqtt_connect([{clientid, ClientId}, {clean_start, false}, {port, Port}]),
+            C
+        end,
+        lists:seq(1, Count)
+    ).
+
+%% Best-effort disconnect of all given clients; failures (e.g. a client that
+%% is already dead) are deliberately ignored.
+stop_many(Clients) ->
+    lists:foreach(
+        fun(C) ->
+            catch emqtt:disconnect(C)
+        end,
+        Clients
+    ),
+    ct:sleep(100).
+
+%% Checks whether a connection with `Opts` can be established.
+%% Returns `ok` (after disconnecting again) or the connect error.
+emqtt_try_connect(Opts) ->
+    case emqtt_connect(Opts) of
+        {ok, C} ->
+            emqtt:disconnect(C),
+            ok;
+        {error, _} = Error ->
+            Error
+    end.
+
+%% Same as start_cluster/3 without extra application environment.
+start_cluster(NamesWithPorts, Apps) ->
+    start_cluster(NamesWithPorts, Apps, []).
+
+%% Starts one core node per `{ShortName, TcpPort}` entry with `Apps` loaded
+%% and forwards snabbkaffe traces from every node.
+%% `Env` is a list of extra `{App, Key, Value}` environment entries.
+%% Returns `[{Node, TcpPort}]`.
+start_cluster(NamesWithPorts, Apps, Env) ->
+    Specs = lists:map(
+        fun({ShortName, Port}) ->
+            {core, ShortName, #{listener_ports => [{tcp, Port}]}}
+        end,
+        NamesWithPorts
+    ),
+    Opts0 = [
+        %% Exactly one `env` entry: with two entries under the same key only
+        %% one of them can take effect when the options are read as a
+        %% proplist or converted to a map, silently dropping either the
+        %% boot_modules default or the caller's Env. The caller's entries
+        %% come last in the merged list.
+        {env, [{emqx, boot_modules, [broker, listeners]}] ++ Env},
+        {apps, Apps},
+        {conf,
+            [{[listeners, Proto, default, enabled], false} || Proto <- [ssl, ws, wss]] ++
+                [{[rpc, mode], async}]}
+    ],
+    Cluster = emqx_common_test_helpers:emqx_cluster(
+        Specs,
+        Opts0
+    ),
+    NodesWithPorts = [
+        {
+            emqx_common_test_helpers:start_slave(Name, Opts),
+            proplists:get_value(Name, NamesWithPorts)
+        }
+     || {Name, Opts} <- Cluster
+    ],
+    ok = lists:foreach(
+        fun({Node, _Port}) ->
+            snabbkaffe:forward_trace(Node)
+        end,
+        NodesWithPorts
+    ),
+    NodesWithPorts.
+
+%% Stops `Apps` on every node of the cluster, then the common test apps, and
+%% finally the node itself.
+stop_cluster(NodesWithPorts, Apps) ->
+    lists:foreach(
+        fun({Node, _Port}) ->
+            lists:foreach(
+                fun(App) ->
+                    rpc:call(Node, application, stop, [App])
+                end,
+                Apps
+            ),
+            %% This sleep is just to make logs cleaner
+            ct:sleep(100),
+            _ = rpc:call(Node, emqx_common_test_helpers, stop_apps, []),
+            emqx_common_test_helpers:stop_slave(Node)
+        end,
+        NodesWithPorts
+    ).
+
+%% Builds the atom 'Module__Case', intended as a node short name unique per
+%% test case.
+case_specific_node_name(Module, Case) ->
+    concat_atoms([Module, '__', Case]).
+
+%% Builds the atom 'Module__Case__Node'.
+case_specific_node_name(Module, Case, Node) ->
+    concat_atoms([Module, '__', Case, '__', Node]).
+
+%% Concatenates the textual representation of all atoms in `Atoms` into a
+%% single new atom, e.g. [a, '__', b] -> 'a__b'.
+concat_atoms(Atoms) ->
+    binary_to_atom(
+        iolist_to_binary(
+            lists:map(
+                fun atom_to_binary/1,
+                Atoms
+            )
+        )
+    ).
diff --git a/lib-ee/emqx_node_rebalance/README.md b/lib-ee/emqx_node_rebalance/README.md
new file mode 100644
index 000000000..2e56f62cd
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/README.md
@@ -0,0 +1,9 @@
+emqx_node_rebalance
+=====
+
+An OTP library
+
+Build
+-----
+
+    $ rebar3 compile
diff --git a/lib-ee/emqx_node_rebalance/etc/emqx_node_rebalance.conf b/lib-ee/emqx_node_rebalance/etc/emqx_node_rebalance.conf
new file mode 100644
index 000000000..8ace22435
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/etc/emqx_node_rebalance.conf
@@ -0,0 +1,3 @@
+##--------------------------------------------------------------------
+## EMQX Node Rebalance Plugin
+##--------------------------------------------------------------------
diff --git a/lib-ee/emqx_node_rebalance/i18n/emqx_node_rebalance_api_i18n.conf b/lib-ee/emqx_node_rebalance/i18n/emqx_node_rebalance_api_i18n.conf
new file mode 100644
index 000000000..f5f161a92
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/i18n/emqx_node_rebalance_api_i18n.conf
@@ -0,0 +1,490 @@
+emqx_node_rebalance_api {
+
+    ## API Request Fields
+
+    load_rebalance_status {
+        desc {
+            en: "Get rebalance status of the current node"
+            zh: "获取当前节点的rebalance状态"
+        }
+        label {
+            en: "Get rebalance status"
+            zh: "获取rebalance状态"
+        }
+    }
+
+    load_rebalance_global_status {
+        desc {
+            en: "Get status of all rebalance/evacuation processes across the cluster"
+            zh: "获取集群中所有rebalance/evacuation进程的状态"
+        }
+        label {
+            en: "Get global rebalance status"
+            zh: "获取全局rebalance状态"
+        }
+    }
+
+    load_rebalance_availability_check {
+        desc {
+            en: "Check if the node is being evacuated or rebalanced"
+            zh: "检查节点是否正在被evacuate或rebalance"
+        }
+        label {
+            en: "Availability check"
+            zh: "可用性检查"
+        }
+    }
+
+    load_rebalance_start {
+        desc {
+            en: "Start rebalance process"
+            zh: "启动rebalance进程"
+        }
+        label {
+            en: "Start rebalance"
+            zh: "启动rebalance"
+        }
+    
} + + load_rebalance_stop { + desc { + en: "Stop rebalance process" + zh: "停止rebalance进程" + } + label { + en: "Stop rebalance" + zh: "停止rebalance" + } + } + + load_rebalance_evacuation_start { + desc { + en: "Start evacuation process" + zh: "启动evacuation进程" + } + label { + en: "Start evacuation" + zh: "启动evacuation" + } + } + + load_rebalance_evacuation_stop { + desc { + en: "Stop evacuation process" + zh: "停止evacuation进程" + } + label { + en: "Stop evacuation" + zh: "停止evacuation" + } + } + + param_node { + desc { + en: "Node name" + zh: "节点名称" + } + label { + en: "Node name" + zh: "节点名称" + } + } + + wait_health_check { + desc { + en: "Time to wait before starting the rebalance process, in seconds" + zh: "启动rebalance进程前等待的时间,单位为秒" + } + label { + en: "Wait health check" + zh: "等待健康检查" + } + } + + conn_evict_rate { + desc { + en: "The rate of evicting connections, in connections per second" + zh: "逐出连接的速率,以每秒连接数表示" + } + label { + en: "Connection eviction rate" + zh: "连接驱逐率" + } + } + + sess_evict_rate { + desc { + en: "The rate of evicting sessions, in sessions per second" + zh: "逐出会话的速率,以每秒会话为单位" + } + label { + en: "Session eviction rate" + zh: "会话驱逐率" + } + } + + abs_conn_threshold { + desc { + en: "Maximum desired difference between the number of connections on the node and the average number of connections on the recipient nodes" + zh: "节点上的连接数与接收节点上的平均连接数之间的最大期望差值" + } + label { + en: "Absolute connection threshold" + zh: "绝对连接阈值" + } + } + + rel_conn_threshold { + desc { + en: "Maximum desired fraction between the number of connections on the node and the average number of connections on the recipient nodes" + zh: "节点上的连接数与接收节点上的平均连接数之间的最大期望分数" + } + label { + en: "Relative connection threshold" + zh: "相对连接阈值" + } + } + + abs_sess_threshold { + desc { + en: "Maximum desired difference between the number of sessions on the node and the average number of sessions on the recipient nodes" + zh: "节点上的会话数与接收节点上的平均会话数之间的最大期望差异" + } + label { + en: "Absolute session 
threshold" + zh: "绝对会话阈值" + } + } + + rel_sess_threshold { + desc { + en: "Maximum desired fraction between the number of sessions on the node and the average number of sessions on the recipient nodes" + zh: "节点上的会话数与接收节点上的平均会话数之间的最大期望分数" + } + label { + en: "Relative session threshold" + zh: "相对会话阈值" + } + } + + wait_takeover { + desc { + en: "Time to wait before starting session evacuation process, in seconds" + zh: "开始会话疏散过程之前等待的时间,以秒为单位" + } + label { + en: "Wait takeover" + zh: "等待接管" + } + } + + redirect_to { + desc { + en: "Server reference to redirect clients to (MQTTv5 Server redirection)" + zh: "将客户端重定向到的服务器参考(MQTTv5 服务器重定向)" + } + label { + en: "Redirect to" + zh: "重定向至" + } + } + + migrate_to { + desc { + en: "Nodes to migrate sessions to" + zh: "将会话迁移到的节点" + } + label { + en: "Migrate to" + zh: "迁移到" + } + } + + rebalance_nodes { + desc { + en: "Nodes to participate in rebalance" + zh: "参与rebalance的节点" + } + label { + en: "Rebalance nodes" + zh: "重新平衡节点" + } + } + + ## API Response Fields + + local_status_enabled { + desc { + en: "Whether the node is being evacuated" + zh: "节点是否正在撤离" + } + label { + en: "Local evacuation status" + zh: "当地避难状况" + } + } + + local_status_process { + desc { + en: "The process that is being performed on the node: evacuation or rebalance" + zh: "正在节点上执行的过程:疏散或重新平衡" + } + label { + en: "Node process" + zh: "节点进程" + } + } + + local_status_state { + desc { + en: "The state of the process that is being performed on the node" + zh: "正在节点上执行的进程的状态" + } + label { + en: "Rebalance/evacuation current state" + zh: "重新平衡/疏散当前状态" + } + } + + local_status_coordinator_node { + desc { + en: "The node that is coordinating rebalance process" + zh: "协调再平衡过程的节点" + } + label { + en: "Coordinator node" + zh: "协调节点" + } + } + + local_status_connection_eviction_rate { + desc { + en: "The rate of evicting connections, in connections per second" + zh: "逐出连接的速率,以每秒连接数表示" + } + label { + en: "Connection eviction rate" + zh: "连接驱逐率" + } + } + + 
local_status_session_eviction_rate { + desc { + en: "The rate of evicting sessions, in sessions per second" + zh: "逐出会话的速率,以每秒会话为单位" + } + label { + en: "Session eviction rate" + zh: "会话驱逐率" + } + } + + local_status_connection_goal { + desc { + en: "The number of connections that the node should have after the rebalance/evacuation process" + zh: "节点在重新平衡/疏散过程后应该拥有的连接数" + } + label { + en: "Connection goal" + zh: "连接目标" + } + } + + local_status_session_goal { + desc { + en: "The number of sessions that the node should have after the evacuation process" + zh: "疏散过程后节点应有的会话数" + } + label { + en: "Session goal" + zh: "会话目标" + } + } + + local_status_disconnected_session_goal { + desc { + en: "The number of disconnected sessions that the node should have after the rebalance process" + zh: "重新平衡过程后节点应具有的断开连接的会话数" + } + label { + en: "Disconnected session goal" + zh: "断开连接的会话目标" + } + } + + local_status_session_recipients { + desc { + en: "List of nodes to which sessions are being evacuated" + zh: "会话被疏散到的节点列表" + } + label { + en: "Session recipients" + zh: "会话收件人" + } + } + + local_status_recipients { + desc { + en: "List of nodes to which connections/sessions are being evacuated during rebalance" + zh: "在重新平衡期间连接/会话被疏散到的节点列表" + } + label { + en: "Recipients" + zh: "收件人" + } + } + + local_status_stats { + desc { + en: "Statistics of the evacuation/rebalance process" + zh: "疏散/再平衡过程的统计" + } + label { + en: "Statistics" + zh: "统计数据" + } + } + + status_stats_initial_connected { + desc { + en: "The number of connections on the node before the evacuation/rebalance process" + zh: "疏散/重新平衡过程之前节点上的连接数" + } + label { + en: "Initial connected" + zh: "初始连接" + } + } + + status_stats_current_connected { + desc { + en: "Current number of connections on the node" + zh: "节点上的当前连接数" + } + label { + en: "Current connections" + zh: "当前连接" + } + } + + status_stats_initial_sessions { + desc { + en: "The number of sessions on the node before the evacuation/rebalance process" + zh: 
"疏散/重新平衡过程之前节点上的会话数" + } + label { + en: "Initial sessions" + zh: "初始会话" + } + } + + status_stats_current_sessions { + desc { + en: "Current number of sessions on the node" + zh: "节点上的当前会话数" + } + label { + en: "Current sessions" + zh: "当前会话" + } + } + + status_stats_current_disconnected_sessions { + desc { + en: "Current number of disconnected sessions on the node" + zh: "节点上当前断开连接的会话数" + } + label { + en: "Current disconnected sessions" + zh: "当前断开连接的会话" + } + } + + coordinator_status_donors { + desc { + en: "List of nodes from which connections/sessions are being evacuated" + zh: "正在疏散连接/会话的节点列表" + } + label { + en: "Donors" + zh: "捐助者" + } + } + + coordinator_status_donor_conn_avg { + desc { + en: "Average number of connections per donor node" + zh: "每个供体节点的平均连接数" + } + label { + en: "Donor connections average" + zh: "捐助者连接平均值" + } + } + + coordinator_status_donor_sess_avg { + desc { + en: "Average number of sessions per donor node" + zh: "每个供体节点的平均会话数" + } + label { + en: "Donor sessions average" + zh: "平均捐助会议" + } + } + + coordinator_status_node { + desc { + en: "The node that is coordinating the evacuation/rebalance process" + zh: "协调疏散/再平衡过程的节点" + } + label { + en: "Coordinator node" + zh: "协调节点" + } + } + + evacuation_status_node { + desc { + en: "The node that is being evacuated" + zh: "正在撤离的节点" + } + label { + en: "Evacuated node" + zh: "疏散节点" + } + } + + global_status_evacuations { + desc { + en: "List of nodes that are being evacuated" + zh: "正在撤离的节点列表" + } + label { + en: "Evacuations" + zh: "疏散" + } + } + + global_status_rebalances { + desc { + en: "List of nodes that coordinate a rebalance" + zh: "协调再平衡的节点列表" + } + label { + en: "Rebalances" + zh: "再平衡" + } + } + + empty_response { + desc { + en: "The response is empty" + zh: "响应为空" + } + label { + en: "Empty response" + zh: "空响应" + } + } +} diff --git a/lib-ee/emqx_node_rebalance/include/emqx_node_rebalance.hrl b/lib-ee/emqx_node_rebalance/include/emqx_node_rebalance.hrl new file mode 100644 index 
000000000..ccc671e81 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/include/emqx_node_rebalance.hrl @@ -0,0 +1,33 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-define(DEFAULT_CONN_EVICT_RATE, 500). +-define(DEFAULT_SESS_EVICT_RATE, 500). + +%% sec +-define(DEFAULT_WAIT_HEALTH_CHECK, 60). +%% sec +-define(DEFAULT_WAIT_TAKEOVER, 60). + +-define(DEFAULT_ABS_CONN_THRESHOLD, 1000). +-define(DEFAULT_ABS_SESS_THRESHOLD, 1000). + +-define(DEFAULT_REL_CONN_THRESHOLD, 1.1). +-define(DEFAULT_REL_SESS_THRESHOLD, 1.1). + +-define(EVICT_INTERVAL, 1000). + +-define(EVACUATION_FILENAME, <<".evacuation">>). diff --git a/lib-ee/emqx_node_rebalance/rebar.config b/lib-ee/emqx_node_rebalance/rebar.config new file mode 100644 index 000000000..b055d8f4f --- /dev/null +++ b/lib-ee/emqx_node_rebalance/rebar.config @@ -0,0 +1,2 @@ +{deps, [{emqx, {path, "../../apps/emqx"}}]}. +{project_plugins, [erlfmt]}. 
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src new file mode 100644 index 000000000..9673e4fda --- /dev/null +++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src @@ -0,0 +1,22 @@ +{application, emqx_node_rebalance, [ + {description, "EMQX Node Rebalance"}, + {vsn, "5.0.0"}, + {registered, [ + emqx_node_rebalance_sup, + emqx_node_rebalance, + emqx_node_rebalance_agent, + emqx_node_rebalance_evacuation + ]}, + {applications, [ + kernel, + stdlib + ]}, + {mod, {emqx_node_rebalance_app, []}}, + {env, []}, + {modules, []}, + {maintainers, ["EMQX Team "]}, + {links, [ + {"Homepage", "https://emqx.io/"}, + {"Github", "https://github.com/emqx"} + ]} +]}. diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.appup.src b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.appup.src new file mode 100644 index 000000000..c1b84778d --- /dev/null +++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.appup.src @@ -0,0 +1,3 @@ +%% -*- mode: erlang -*- +%% Unless you know what you are doing, DO NOT edit manually!! +{VSN, [{<<".*">>, []}], [{<<".*">>, []}]}. diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.erl new file mode 100644 index 000000000..1f2adc565 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.erl @@ -0,0 +1,438 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance). + +-include("emqx_node_rebalance.hrl"). + +-include_lib("emqx/include/logger.hrl"). +-include_lib("emqx/include/types.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-export([ + start/1, + status/0, + status/1, + stop/0 +]). + +-export([start_link/0]). + +-behaviour(gen_statem). 
+
+-export([
+    init/1,
+    callback_mode/0,
+    handle_event/4,
+    code_change/4
+]).
+
+-export([
+    is_node_available/0,
+    available_nodes/1,
+    connection_count/0,
+    session_count/0,
+    disconnected_session_count/0
+]).
+
+-export_type([
+    start_opts/0,
+    start_error/0
+]).
+
+%%--------------------------------------------------------------------
+%% APIs
+%%--------------------------------------------------------------------
+
+%% Options accepted by start/1. Every key is optional; missing keys are
+%% filled in from default_opts/0.
+-type start_opts() :: #{
+    conn_evict_rate => pos_integer(),
+    sess_evict_rate => pos_integer(),
+    wait_health_check => pos_integer(),
+    wait_takeover => pos_integer(),
+    abs_conn_threshold => pos_integer(),
+    rel_conn_threshold => number(),
+    abs_sess_threshold => pos_integer(),
+    rel_sess_threshold => number(),
+    nodes => [node()]
+}.
+-type start_error() :: already_started | [{node(), term()}].
+
+%% Asks the local coordinator to start a rebalance. Caller-supplied options
+%% override the defaults.
+%% NOTE(review): the coordinator can also reply `{error, nothing_to_balance}`
+%% (see handle_event/4), which is not part of start_error() - confirm the type.
+-spec start(start_opts()) -> ok_or_error(start_error()).
+start(StartOpts) ->
+    Opts = maps:merge(default_opts(), StartOpts),
+    gen_statem:call(?MODULE, {start, Opts}).
+
+%% Stops the running rebalance; `{error, not_started}` if none is running.
+-spec stop() -> ok_or_error(not_started).
+stop() ->
+    gen_statem:call(?MODULE, stop).
+
+%% Status of the coordinator registered locally under ?MODULE.
+-spec status() -> disabled | {enabled, map()}.
+status() ->
+    gen_statem:call(?MODULE, status).
+
+%% Status of the coordinator process `Pid`.
+-spec status(pid()) -> disabled | {enabled, map()}.
+status(Pid) ->
+    gen_statem:call(Pid, status).
+
+%% Starts the coordinator state machine, registered locally as ?MODULE.
+-spec start_link() -> startlink_ret().
+start_link() ->
+    gen_statem:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+%% Queries `Nodes` through the proto module and keeps only the atom results.
+%% is_node_available/0 returns node() on success, so presumably any failed or
+%% unavailable node yields a non-atom result and is filtered out here -
+%% verify against emqx_node_rebalance_proto_v1.
+-spec available_nodes(list(node())) -> list(node()).
+available_nodes(Nodes) when is_list(Nodes) ->
+    {Available, _} = emqx_node_rebalance_proto_v1:available_nodes(Nodes),
+    lists:filter(fun is_atom/1, Available).
+
+%%--------------------------------------------------------------------
+%% gen_statem callbacks
+%%--------------------------------------------------------------------
+
+%% All events of all states are handled by handle_event/4.
+callback_mode() -> handle_event_function.
+
+%% states: disabled, wait_health_check, evicting_conns, wait_takeover, evicting_sessions
+%%
+%% Normal flow:
+%%   disabled -(start)-> wait_health_check -(timeout)-> evicting_conns
+%%   -(balanced)-> wait_takeover -(timeout)-> evicting_sessions
+%%   -(balanced)-> disabled
+%% A `stop` call returns to `disabled` from any other state.
+
+%% The coordinator starts in the `disabled` state with empty data.
+init([]) ->
+    ?tp(debug, emqx_node_rebalance_started, #{}),
+    {ok, disabled, #{}}.
+
+%% start
+%% Only accepted in the `disabled` state. On success the health-check wait
+%% (given in seconds) is armed as a state timeout.
+handle_event(
+    {call, From},
+    {start, #{wait_health_check := WaitHealthCheck} = Opts},
+    disabled,
+    #{} = Data
+) ->
+    case enable_rebalance(Data#{opts => Opts}) of
+        {ok, NewData} ->
+            ?SLOG(warning, #{msg => "node_rebalance_enabled", opts => Opts}),
+            {next_state, wait_health_check, NewData, [
+                {state_timeout, seconds(WaitHealthCheck), evict_conns},
+                {reply, From, ok}
+            ]};
+        {error, Reason} ->
+            ?SLOG(warning, #{
+                msg => "node_rebalance_enable_failed",
+                reason => Reason
+            }),
+            {keep_state_and_data, [{reply, From, {error, Reason}}]}
+    end;
+%% Any start request not matched above is rejected as `already_started`.
+handle_event({call, From}, {start, _Opts}, _State, #{}) ->
+    {keep_state_and_data, [{reply, From, {error, already_started}}]};
+%% stop
+handle_event({call, From}, stop, disabled, #{}) ->
+    {keep_state_and_data, [{reply, From, {error, not_started}}]};
+handle_event({call, From}, stop, _State, Data) ->
+    ok = disable_rebalance(Data),
+    ?SLOG(warning, #{msg => "node_rebalance_stopped"}),
+    {next_state, disabled, deinit(Data), [{reply, From, ok}]};
+%% status
+handle_event({call, From}, status, disabled, #{}) ->
+    {keep_state_and_data, [{reply, From, disabled}]};
+handle_event({call, From}, status, State, Data) ->
+    %% The reply is the state data extended with the current state name and
+    %% this node as coordinator.
+    Stats = get_stats(State, Data),
+    {keep_state_and_data, [
+        {reply, From,
+            {enabled, Stats#{
+                state => State,
+                coordinator_node => node()
+            }}}
+    ]};
+%% conn eviction
+handle_event(
+    state_timeout,
+    evict_conns,
+    wait_health_check,
+    Data
+) ->
+    ?SLOG(warning, #{msg => "node_rebalance_wait_health_check_over"}),
+    %% A zero timeout triggers the first eviction round immediately.
+    {next_state, evicting_conns, Data, [{state_timeout, 0, evict_conns}]};
+handle_event(
+    state_timeout,
+    evict_conns,
+    evicting_conns,
+    #{
+        opts := #{
+            wait_takeover := WaitTakeover,
+            evict_interval := EvictInterval
+        }
+    } = Data
+) ->
+    %% One eviction round per `evict_interval` until evict_conns/1 reports
+    %% that the thresholds are met.
+    case evict_conns(Data) of
+        ok ->
+            ?SLOG(warning, #{msg => "node_rebalance_evict_conns_over"}),
+            {next_state, wait_takeover, Data, [
+                {state_timeout, seconds(WaitTakeover), evict_sessions}
+            ]};
+        {continue, NewData} ->
+            {keep_state, NewData, [{state_timeout, EvictInterval, evict_conns}]}
+    end;
+%% session eviction
+handle_event(
+    state_timeout,
+    evict_sessions,
+    wait_takeover,
+    Data
+) ->
+    ?SLOG(warning, #{msg => "node_rebalance_wait_takeover_over"}),
+    {next_state, evicting_sessions, Data, [{state_timeout, 0, evict_sessions}]};
+handle_event(
+    state_timeout,
+    evict_sessions,
+    evicting_sessions,
+    #{opts := #{evict_interval := EvictInterval}} = Data
+) ->
+    case evict_sessions(Data) of
+        ok ->
+            %% Sessions are balanced too: switch the agents off and return
+            %% to `disabled`.
+            ?tp(debug, emqx_node_rebalance_evict_sess_over, #{}),
+            ?SLOG(warning, #{msg => "node_rebalance_evict_sessions_over"}),
+            ok = disable_rebalance(Data),
+            ?SLOG(warning, #{msg => "node_rebalance_finished_successfully"}),
+            {next_state, disabled, deinit(Data)};
+        {continue, NewData} ->
+            {keep_state, NewData, [{state_timeout, EvictInterval, evict_sessions}]}
+    end;
+%% Catch-all clauses: unknown calls get the reply `ignored`, unknown info
+%% messages and casts are only logged.
+handle_event({call, From}, Msg, _State, _Data) ->
+    ?SLOG(warning, #{msg => "node_rebalance_unknown_call", call => Msg}),
+    {keep_state_and_data, [{reply, From, ignored}]};
+handle_event(info, Msg, _State, _Data) ->
+    ?SLOG(warning, #{msg => "node_rebalance_unknown_info", info => Msg}),
+    keep_state_and_data;
+handle_event(cast, Msg, _State, _Data) ->
+    ?SLOG(warning, #{msg => "node_rebalance_unknown_cast", cast => Msg}),
+    keep_state_and_data.
+
+%% No state or data migration on code upgrade.
+code_change(_Vsn, State, Data, _Extra) ->
+    {ok, State, Data}.
+
+%%--------------------------------------------------------------------
+%% internal funs
+%%--------------------------------------------------------------------
+
+%% Prepares a rebalance run.
+%% Collects connection and session counts from all participating nodes,
+%% splits the nodes into donors (connection count >= average) and recipients
+%% (below average) and, if the thresholds are exceeded, enables the rebalance
+%% agent on every donor node.
+%% Returns `{ok, NewData}` with donors, recipients and the initial counts, or
+%% `{error, nothing_to_balance}`.
+%% Raises `{bad_nodes, _}` (from multicall/3) when a node cannot be reached
+%% or answers with an error.
+enable_rebalance(#{opts := #{nodes := []}}) ->
+    %% Without nodes there are no counts to average; avg/1 is not defined for
+    %% an empty list and would crash the coordinator with function_clause.
+    {error, nothing_to_balance};
+enable_rebalance(#{opts := Opts} = Data) ->
+    Nodes = maps:get(nodes, Opts),
+    ConnCounts = multicall(Nodes, connection_counts, []),
+    SessCounts = multicall(Nodes, session_counts, []),
+    {_, Counts} = lists:unzip(ConnCounts),
+    Avg = avg(Counts),
+    {DonorCounts, RecipientCounts} = lists:partition(
+        fun({_Node, Count}) ->
+            Count >= Avg
+        end,
+        ConnCounts
+    ),
+    ?SLOG(warning, #{
+        msg => "node_rebalance_enabling",
+        conn_counts => ConnCounts,
+        donor_counts => DonorCounts,
+        recipient_counts => RecipientCounts
+    }),
+    {DonorNodes, _} = lists:unzip(DonorCounts),
+    {RecipientNodes, _} = lists:unzip(RecipientCounts),
+    case need_rebalance(DonorNodes, RecipientNodes, ConnCounts, SessCounts, Opts) of
+        false ->
+            {error, nothing_to_balance};
+        true ->
+            %% The agents receive the coordinator pid.
+            _ = multicall(DonorNodes, enable_rebalance_agent, [self()]),
+            {ok, Data#{
+                donors => DonorNodes,
+                recipients => RecipientNodes,
+                initial_conn_counts => maps:from_list(ConnCounts),
+                initial_sess_counts => maps:from_list(SessCounts)
+            }}
+    end.
+
+%% Disables the rebalance agent on every donor node.
+%% Raises `{bad_nodes, _}` (from multicall/3) if any node fails.
+disable_rebalance(#{donors := DonorNodes}) ->
+    _ = multicall(DonorNodes, disable_rebalance_agent, [self()]),
+    ok.
+
+%% Runs one connection-eviction round.
+%% Fetches the current connection counts of donors and recipients and
+%% compares their averages against the connection thresholds.
+%% Returns `ok` when the donors are within the thresholds, otherwise asks
+%% every donor that is above the recipient average to evict
+%% `conn_evict_rate` connections and returns `{continue, NewData}` with the
+%% refreshed statistics.
+evict_conns(#{donors := Donors, recipients := Recipients, opts := Opts} = Data) ->
+    DonorStats = multicall(Donors, connection_counts, []),
+    RecipientStats = multicall(Recipients, connection_counts, []),
+    CountOf = fun({_Node, Count}) -> Count end,
+    DonorAvg = avg(lists:map(CountOf, DonorStats)),
+    RecipientAvg = avg(lists:map(CountOf, RecipientStats)),
+    case within_thresholds(DonorAvg, RecipientAvg, thresholds(conn, Opts)) of
+        true ->
+            ok;
+        false ->
+            Rate = maps:get(conn_evict_rate, Opts),
+            Overloaded = nodes_to_evict(RecipientAvg, DonorStats),
+            ?SLOG(warning, #{
+                msg => "node_rebalance_evict_conns",
+                nodes => Overloaded,
+                counts => Rate
+            }),
+            _ = multicall(Overloaded, evict_connections, [Rate]),
+            {continue, Data#{
+                donor_conn_avg => DonorAvg,
+                recipient_conn_avg => RecipientAvg,
+                donor_conn_counts => maps:from_list(DonorStats),
+                recipient_conn_counts => maps:from_list(RecipientStats)
+            }}
+    end.
+
+%% Runs one session-eviction round.
+%% Fetches the current disconnected-session counts of donors and recipients
+%% and compares their averages against the session thresholds.
+%% Returns `ok` when the donors are within the thresholds, otherwise asks
+%% every donor that is above the recipient average to evict
+%% `sess_evict_rate` disconnected sessions towards the recipient nodes and
+%% returns `{continue, NewData}` with the refreshed statistics.
+evict_sessions(#{donors := Donors, recipients := Recipients, opts := Opts} = Data) ->
+    DonorStats = multicall(Donors, disconnected_session_counts, []),
+    RecipientStats = multicall(Recipients, disconnected_session_counts, []),
+    CountOf = fun({_Node, Count}) -> Count end,
+    DonorAvg = avg(lists:map(CountOf, DonorStats)),
+    RecipientAvg = avg(lists:map(CountOf, RecipientStats)),
+    case within_thresholds(DonorAvg, RecipientAvg, thresholds(sess, Opts)) of
+        true ->
+            ok;
+        false ->
+            Rate = maps:get(sess_evict_rate, Opts),
+            Overloaded = nodes_to_evict(RecipientAvg, DonorStats),
+            ?SLOG(warning, #{
+                msg => "node_rebalance_evict_sessions",
+                nodes => Overloaded,
+                counts => Rate
+            }),
+            _ = multicall(
+                Overloaded,
+                evict_sessions,
+                [Rate, Recipients, disconnected]
+            ),
+            {continue, Data#{
+                donor_sess_avg => DonorAvg,
+                recipient_sess_avg => RecipientAvg,
+                donor_sess_counts => maps:from_list(DonorStats),
+                recipient_sess_counts => maps:from_list(RecipientStats)
+            }}
+    end.
+
+%% Decides whether a rebalance is needed at all: never when there are no
+%% donors or no recipients, otherwise when either the connection or the
+%% session average of the donors is outside the configured thresholds.
+%% Emits the `emqx_node_rebalance_need_rebalance` trace point with the result.
+need_rebalance([], _Recipients, _ConnCounts, _SessCounts, _Opts) ->
+    false;
+need_rebalance(_Donors, [], _ConnCounts, _SessCounts, _Opts) ->
+    false;
+need_rebalance(Donors, Recipients, ConnCounts, SessCounts, Opts) ->
+    DonorConnAvg = avg_for_nodes(Donors, ConnCounts),
+    RecipientConnAvg = avg_for_nodes(Recipients, ConnCounts),
+    DonorSessAvg = avg_for_nodes(Donors, SessCounts),
+    RecipientSessAvg = avg_for_nodes(Recipients, SessCounts),
+    Result =
+        case within_thresholds(DonorConnAvg, RecipientConnAvg, thresholds(conn, Opts)) of
+            false ->
+                %% Connections alone already justify a rebalance.
+                true;
+            true ->
+                not within_thresholds(DonorSessAvg, RecipientSessAvg, thresholds(sess, Opts))
+        end,
+    ?tp(
+        debug,
+        emqx_node_rebalance_need_rebalance,
+        #{
+            donors => Donors,
+            recipients => Recipients,
+            conn_counts => ConnCounts,
+            sess_counts => SessCounts,
+            opts => Opts,
+            result => Result
+        }
+    ),
+    Result.
+
+%% Average of the counts that belong to `Nodes`, taken from a
+%% `[{Node, Count}]` list.
+avg_for_nodes(Nodes, Counts) ->
+    CountByNode = maps:from_list(Counts),
+    Selected = maps:with(Nodes, CountByNode),
+    avg(maps:values(Selected)).
+
+%% True when `Value` exceeds `GoalValue` by no more than the absolute
+%% threshold, or by no more than the relative factor.
+within_thresholds(Value, GoalValue, {AbsThres, RelThres}) ->
+    case Value =< GoalValue + AbsThres of
+        true -> true;
+        false -> Value =< GoalValue * RelThres
+    end.
+
+%% Picks the `{Absolute, Relative}` threshold pair for connections (`conn`)
+%% or sessions (`sess`) from the options.
+thresholds(conn, #{abs_conn_threshold := AbsThres, rel_conn_threshold := RelThres}) ->
+    {AbsThres, RelThres};
+thresholds(sess, #{abs_sess_threshold := AbsThres, rel_sess_threshold := RelThres}) ->
+    {AbsThres, RelThres}.
+
+%% Nodes whose count is strictly above `Goal`, in input order.
+nodes_to_evict(Goal, NodeCounts) ->
+    lists:filtermap(
+        fun({Node, Count}) ->
+            Count > Goal andalso {true, Node}
+        end,
+        NodeCounts
+    ).
+
+%% Statistics reported by the status call: nothing while disabled, the whole
+%% state data otherwise.
+get_stats(disabled, _Data) ->
+    #{};
+get_stats(_State, Data) ->
+    Data.
+
+%% Arithmetic mean of a non-empty list; intentionally undefined
+%% (function_clause) for an empty list.
+avg(Numbers) when length(Numbers) >= 1 ->
+    Total = lists:sum(Numbers),
+    Total / length(Numbers).
+
+%% Calls `emqx_node_rebalance_proto_v1:F(Nodes, A...)` and returns
+%% `[{Node, Result}]` with the `ok` wrappers removed.
+%% Raises `{bad_nodes, _}` if any node is reported as bad by the proto call
+%% or answers with something other than `ok` / `{ok, _}`.
+multicall(Nodes, F, A) ->
+    case erlang:apply(emqx_node_rebalance_proto_v1, F, [Nodes | A]) of
+        {Results, []} ->
+            unwrap_results(lists:zip(Nodes, Results));
+        {_, [_BadNode | _] = BadNodes} ->
+            error({bad_nodes, BadNodes})
+    end.
+
+%% Strips the `ok` wrappers from per-node results, failing if any is not ok.
+unwrap_results(NodeResults) ->
+    case lists:partition(fun is_ok/1, NodeResults) of
+        {OkResults, []} ->
+            [{Node, ok_result(Result)} || {Node, Result} <- OkResults];
+        {_, BadResults} ->
+            error({bad_nodes, BadResults})
+    end.
+
+%% True for `{Node, ok}` and `{Node, {ok, _}}`.
+is_ok(NodeResult) ->
+    case NodeResult of
+        {_Node, {ok, _}} -> true;
+        {_Node, ok} -> true;
+        _ -> false
+    end.
+
+%% Removes the `ok` wrapper of a successful result.
+ok_result(ok) -> ok;
+ok_result({ok, Result}) -> Result.
+
+%% Local counters, wrapped in `{ok, _}`, taken from the eviction agent.
+connection_count() ->
+    Count = emqx_eviction_agent:connection_count(),
+    {ok, Count}.
+
+session_count() ->
+    Count = emqx_eviction_agent:session_count(),
+    {ok, Count}.
+
+disconnected_session_count() ->
+    Count = emqx_eviction_agent:session_count(disconnected),
+    {ok, Count}.
+
+%% Defaults for every option of start/1; by default all running nodes
+%% take part in the rebalance.
+default_opts() ->
+    #{
+        nodes => all_nodes(),
+        evict_interval => ?EVICT_INTERVAL,
+
+        wait_health_check => ?DEFAULT_WAIT_HEALTH_CHECK,
+        wait_takeover => ?DEFAULT_WAIT_TAKEOVER,
+
+        conn_evict_rate => ?DEFAULT_CONN_EVICT_RATE,
+        abs_conn_threshold => ?DEFAULT_ABS_CONN_THRESHOLD,
+        rel_conn_threshold => ?DEFAULT_REL_CONN_THRESHOLD,
+
+        sess_evict_rate => ?DEFAULT_SESS_EVICT_RATE,
+        abs_sess_threshold => ?DEFAULT_ABS_SESS_THRESHOLD,
+        rel_sess_threshold => ?DEFAULT_REL_SESS_THRESHOLD
+    }.
+
+%% Drops the per-run statistics and options from the state data.
+deinit(Data) ->
+    PerRunKeys = [
+        recipient_conn_avg,
+        recipient_sess_avg,
+        donor_conn_avg,
+        donor_sess_avg,
+        recipient_conn_counts,
+        recipient_sess_counts,
+        donor_conn_counts,
+        donor_sess_counts,
+        initial_conn_counts,
+        initial_sess_counts,
+        opts
+    ],
+    lists:foldl(fun maps:remove/2, Data, PerRunKeys).
+
+%% Returns node() when the rebalance agent is running here and the eviction
+%% agent is disabled; crashes with badmatch otherwise.
+is_node_available() ->
+    AgentPid = whereis(emqx_node_rebalance_agent),
+    true = is_pid(AgentPid),
+    disabled = emqx_eviction_agent:status(),
+    node().
+
+%% Nodes currently reported as running by mria.
+all_nodes() ->
+    mria_mnesia:running_nodes().
+
+%% Converts seconds to a whole number of milliseconds.
+seconds(Sec) ->
+    Millis = timer:seconds(Sec),
+    round(Millis).
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl
new file mode 100644
index 000000000..47708d00e
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl
@@ -0,0 +1,131 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_agent).
+
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("emqx/include/logger.hrl").
+-include_lib("emqx/include/types.hrl").
+
+-include_lib("stdlib/include/qlc.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+-export([
+    start_link/0,
+    enable/1,
+    disable/1,
+    status/0
+]).
+
+-export([
+    init/1,
+    handle_call/3,
+    handle_info/2,
+    handle_cast/2,
+    code_change/3
+]).
+
+%% Kind under which this agent enables/disables the eviction agent.
+-define(ENABLE_KIND, emqx_node_rebalance).
+
+%%--------------------------------------------------------------------
+%% APIs
+%%--------------------------------------------------------------------
+
+-type status() :: {enabled, pid()} | disabled.
+
+%% Starts the agent, registered locally as ?MODULE.
+-spec start_link() -> startlink_ret().
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+%% Enables the local eviction agent on behalf of the coordinator process.
+-spec enable(pid()) -> ok_or_error(already_enabled | eviction_agent_busy).
+enable(CoordinatorPid) ->
+    gen_server:call(?MODULE, {enable, CoordinatorPid}).
+
+%% Disables the local eviction agent; only the coordinator that enabled it
+%% is accepted.
+-spec disable(pid()) -> ok_or_error(already_disabled | invalid_coordinator).
+disable(CoordinatorPid) ->
+    gen_server:call(?MODULE, {disable, CoordinatorPid}).
+
+%% `{enabled, CoordinatorPid}` while enabled, `disabled` otherwise.
+-spec status() -> status().
+status() ->
+    gen_server:call(?MODULE, status).
+
+%%--------------------------------------------------------------------
+%% gen_server callbacks
+%%--------------------------------------------------------------------
+
+%% The state is an empty map while disabled.
+init([]) ->
+    {ok, #{}}.
+
+%% enable: refuse when a coordinator is already recorded in the state.
+handle_call({enable, _CoordinatorPid}, _From, #{coordinator_pid := _} = St) ->
+    {reply, {error, already_enabled}, St};
+%% enable: link to the coordinator and to the eviction agent first, then
+%% try to enable the eviction agent; undo both links if it is busy.
+handle_call({enable, CoordinatorPid}, _From, St) ->
+    true = link(CoordinatorPid),
+    AgentPid = whereis(emqx_eviction_agent),
+    true = link(AgentPid),
+    case emqx_eviction_agent:enable(?ENABLE_KIND, undefined) of
+        ok ->
+            NewSt = #{
+                coordinator_pid => CoordinatorPid,
+                eviction_agent_pid => AgentPid
+            },
+            {reply, ok, NewSt};
+        {error, eviction_agent_busy} ->
+            true = unlink(AgentPid),
+            true = unlink(CoordinatorPid),
+            {reply, {error, eviction_agent_busy}, St}
+    end;
+%% disable: only the coordinator that enabled the agent may disable it.
+handle_call({disable, CoordinatorPid}, _From, St) ->
+    case St of
+        #{coordinator_pid := CoordinatorPid, eviction_agent_pid := AgentPid} ->
+            _ = emqx_eviction_agent:disable(?ENABLE_KIND),
+            true = unlink(AgentPid),
+            true = unlink(CoordinatorPid),
+            {reply, ok, maps:without([coordinator_pid, eviction_agent_pid], St)};
+        #{coordinator_pid := _OtherPid} ->
+            {reply, {error, invalid_coordinator}, St};
+        #{} ->
+            {reply, {error, already_disabled}, St}
+    end;
+%% status: enabled iff a coordinator pid is recorded.
+handle_call(status, _From, #{coordinator_pid := Pid} = St) ->
+    {reply, {enabled, Pid}, St};
+handle_call(status, _From, St) ->
+    {reply, disabled, St};
+%% Any other call is logged and answered with `ignored'.
+handle_call(Call, _From, State) ->
+    ?SLOG(warning, #{
+        msg => "unknown_call",
+        call => Call,
+        state => State
+    }),
+    {reply, ignored, State}.
+
+%% Unexpected messages are only logged.
+handle_info(Info, State) ->
+    ?SLOG(warning, #{
+        msg => "unknown_info",
+        info => Info,
+        state => State
+    }),
+    {noreply, State}.
+
+%% Unexpected casts are only logged.
+handle_cast(Cast, State) ->
+    ?SLOG(warning, #{
+        msg => "unknown_cast",
+        cast => Cast,
+        state => State
+    }),
+    {noreply, State}.
+
+code_change(_OldVsn, St, _Extra) ->
+    {ok, St}.
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_api.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_api.erl
new file mode 100644
index 000000000..fa322d146
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_api.erl
@@ -0,0 +1,738 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+%% HTTP API (minirest_api behaviour) for the /load_rebalance/... endpoints:
+%% rebalance/evacuation status, start and stop.
+-module(emqx_node_rebalance_api).
+
+-behaviour(minirest_api).
+
+-include_lib("typerefl/include/types.hrl").
+-include_lib("hocon/include/hoconsc.hrl").
+-include_lib("emqx/include/logger.hrl").
+
+%% Swagger specs from hocon schema
+-export([
+    api_spec/0,
+    paths/0,
+    schema/1,
+    namespace/0
+]).
+
+-export([
+    fields/1,
+    roots/0
+]).
+
+%% API callbacks
+%% (one handler per path; the function name is the path's operationId)
+-export([
+    '/load_rebalance/status'/2,
+    '/load_rebalance/global_status'/2,
+    '/load_rebalance/availability_check'/2,
+    '/load_rebalance/:node/start'/2,
+    '/load_rebalance/:node/stop'/2,
+    '/load_rebalance/:node/evacuation/start'/2,
+    '/load_rebalance/:node/evacuation/stop'/2
+]).
+
+%% Schema examples
+-export([
+    rebalance_example/0,
+    rebalance_evacuation_example/0,
+    translate/2
+]).
+
+-import(hoconsc, [mk/2, ref/1, ref/2]).
+-import(emqx_dashboard_swagger, [error_codes/2]).
+
+%% Error codes returned in the `code' field of error responses.
+-define(BAD_REQUEST, 'BAD_REQUEST').
+-define(NODE_UNAVAILABLE, 'NODE_UNAVAILABLE').
+-define(NODE_EVACUATING, 'NODE_EVACUATING').
+-define(RPC_ERROR, 'RPC_ERROR').
+
+%%--------------------------------------------------------------------
+%% API Spec
+%%--------------------------------------------------------------------
+
+namespace() -> "load_rebalance".
+
+%% Build the API spec from this module's schema, with schema checking on.
+api_spec() ->
+    emqx_dashboard_swagger:spec(?MODULE, #{check_schema => true}).
+
+%% All paths served by this module; each one has a schema/1 clause below.
+paths() ->
+    [
+        "/load_rebalance/status",
+        "/load_rebalance/global_status",
+        "/load_rebalance/availability_check",
+        "/load_rebalance/:node/start",
+        "/load_rebalance/:node/stop",
+        "/load_rebalance/:node/evacuation/start",
+        "/load_rebalance/:node/evacuation/stop"
+    ].
+
+%% Per-path schema. 'operationId' names the handler function in this module.
+%% GET: status of the local node.
+schema("/load_rebalance/status") ->
+    #{
+        'operationId' => '/load_rebalance/status',
+        get => #{
+            tags => [<<"load_rebalance">>],
+            summary => <<"Get rebalance status">>,
+            description => ?DESC("load_rebalance_status"),
+            responses => #{
+                200 => local_status_response_schema()
+            }
+        }
+    };
+%% GET: status of all evacuations/rebalances.
+%% NOTE(review): the 200 response is declared as a generic map
+%% (response_schema/0) although fields(global_status) exists — confirm.
+schema("/load_rebalance/global_status") ->
+    #{
+        'operationId' => '/load_rebalance/global_status',
+        get => #{
+            tags => [<<"load_rebalance">>],
+            summary => <<"Get global rebalance status">>,
+            description => ?DESC("load_rebalance_global_status"),
+            responses => #{
+                200 => response_schema()
+            }
+        }
+    };
+%% GET: 200 when the node is available, 503 while it is evacuating.
+schema("/load_rebalance/availability_check") ->
+    #{
+        'operationId' => '/load_rebalance/availability_check',
+        get => #{
+            tags => [<<"load_rebalance">>],
+            summary => <<"Node rebalance availability check">>,
+            description => ?DESC("load_rebalance_availability_check"),
+            responses => #{
+                200 => response_schema(),
+                503 => error_codes([?NODE_EVACUATING], <<"Node Evacuating">>)
+            }
+        }
+    };
+%% POST: start a rebalance coordinated by :node.
+schema("/load_rebalance/:node/start") ->
+    #{
+        'operationId' => '/load_rebalance/:node/start',
+        post => #{
+            tags => [<<"load_rebalance">>],
+            summary => <<"Start rebalancing with the node as coordinator">>,
+            description => ?DESC("load_rebalance_start"),
+            parameters => [param_node()],
+            'requestBody' =>
+                emqx_dashboard_swagger:schema_with_examples(
+                    ref(rebalance_start),
+                    rebalance_example()
+                ),
+            responses => #{
+                200 => response_schema(),
+                400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>)
+            }
+        }
+    };
+%% POST: stop the rebalance coordinated by :node.
+schema("/load_rebalance/:node/stop") ->
+    #{
+        'operationId' => '/load_rebalance/:node/stop',
+        post => #{
+            tags => [<<"load_rebalance">>],
+            summary => <<"Stop rebalancing coordinated by the node">>,
+            description => ?DESC("load_rebalance_stop"),
+            parameters => [param_node()],
+            responses => #{
+                200 => response_schema(),
+                400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>)
+            }
+        }
+    };
+%% POST: start evacuation of :node.
+schema("/load_rebalance/:node/evacuation/start") ->
+    #{
+        'operationId' => '/load_rebalance/:node/evacuation/start',
+        post => #{
+            tags => [<<"load_rebalance">>],
+            summary => <<"Start evacuation on a node">>,
+            description => ?DESC("load_rebalance_evacuation_start"),
+            parameters => [param_node()],
+            'requestBody' =>
+                emqx_dashboard_swagger:schema_with_examples(
+                    ref(rebalance_evacuation_start),
+                    rebalance_evacuation_example()
+                ),
+            responses => #{
+                200 => response_schema(),
+                400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>)
+            }
+        }
+    };
+%% POST: stop evacuation of :node.
+schema("/load_rebalance/:node/evacuation/stop") ->
+    #{
+        'operationId' => '/load_rebalance/:node/evacuation/stop',
+        post => #{
+            tags => [<<"load_rebalance">>],
+            summary => <<"Stop evacuation on a node">>,
+            description => ?DESC("load_rebalance_evacuation_stop"),
+            parameters => [param_node()],
+            responses => #{
+                200 => response_schema(),
+                400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>)
+            }
+        }
+    }.
+
+%%--------------------------------------------------------------------
+%% Handlers
+%%--------------------------------------------------------------------
+
+%% Local status: `disabled', or the stats map of the running process
+%% extended with `process' and `status' by format_status/2.
+'/load_rebalance/status'(get, #{}) ->
+    case emqx_node_rebalance_status:local_status() of
+        disabled ->
+            {200, #{status => disabled}};
+        {rebalance, Stats} ->
+            {200, format_status(rebalance, Stats)};
+        {evacuation, Stats} ->
+            {200, format_status(evacuation, Stats)}
+    end.
+
+%% Global status: both {Node, Info} lists are turned into lists of maps
+%% that carry the node under the `node' key.
+'/load_rebalance/global_status'(get, #{}) ->
+    #{
+        evacuations := Evacuations,
+        rebalances := Rebalances
+    } = emqx_node_rebalance_status:global_status(),
+    {200, #{
+        evacuations => format_as_map_list(Evacuations),
+        rebalances => format_as_map_list(Rebalances)
+    }}.
+
+%% 200 while the eviction agent is disabled, 503 while it is enabled.
+'/load_rebalance/availability_check'(get, #{}) ->
+    Status = emqx_eviction_agent:status(),
+    case Status of
+        disabled ->
+            {200, #{}};
+        {enabled, _Stats} ->
+            error_response(503, ?NODE_EVACUATING, <<"Node Evacuating">>)
+    end.
+
+%% Translate and validate the request body, then start a rebalance on Node.
+'/load_rebalance/:node/start'(post, #{bindings := #{node := NodeBin}, body := Body}) ->
+    StartOn = fun(Node) ->
+        Translated = translate(rebalance_start, Body),
+        Start = fun(Checked) ->
+            Reply = emqx_node_rebalance_api_proto_v1:node_rebalance_start(Node, Checked),
+            wrap_rpc(Node, Reply)
+        end,
+        with_nodes_at_key(nodes, Translated, Start)
+    end,
+    with_node(NodeBin, StartOn).
+
+%% Stop the rebalance coordinated by Node.
+'/load_rebalance/:node/stop'(post, #{bindings := #{node := NodeBin}}) ->
+    StopOn = fun(Node) ->
+        Reply = emqx_node_rebalance_api_proto_v1:node_rebalance_stop(Node),
+        wrap_rpc(Node, Reply)
+    end,
+    with_node(NodeBin, StopOn).
+
+%% Translate and validate the request body, then start evacuation of Node.
+'/load_rebalance/:node/evacuation/start'(post, #{
+    bindings := #{node := NodeBin}, body := Body
+}) ->
+    StartOn = fun(Node) ->
+        Translated = translate(rebalance_evacuation_start, Body),
+        Start = fun(Checked) ->
+            Reply = emqx_node_rebalance_api_proto_v1:node_rebalance_evacuation_start(
+                Node, Checked
+            ),
+            wrap_rpc(Node, Reply)
+        end,
+        with_nodes_at_key(migrate_to, Translated, Start)
+    end,
+    with_node(NodeBin, StartOn).
+
+%% Stop evacuation of Node.
+'/load_rebalance/:node/evacuation/stop'(post, #{bindings := #{node := NodeBin}}) ->
+    StopOn = fun(Node) ->
+        Reply = emqx_node_rebalance_api_proto_v1:node_rebalance_evacuation_stop(Node),
+        wrap_rpc(Node, Reply)
+    end,
+    with_node(NodeBin, StopOn).
+
+%%--------------------------------------------------------------------
+%% Helpers
+%%--------------------------------------------------------------------
+
+%% Map an RPC result to an HTTP response:
+%% ok -> 200, {error, _} -> 400 BAD_REQUEST, {badrpc, _} -> 503 RPC_ERROR.
+wrap_rpc(Node, RPCResult) ->
+    case RPCResult of
+        ok ->
+            {200, #{}};
+        {error, Reason} ->
+            Message = io_lib:format("error on node ~p: ~p", [Node, Reason]),
+            error_response(400, ?BAD_REQUEST, Message);
+        {badrpc, Reason} ->
+            Message = io_lib:format("RPC error on node ~p: ~p", [Node, Reason]),
+            error_response(503, ?RPC_ERROR, Message)
+    end.
+
+%% Add the process kind and `status => enabled' to the stats map.
+format_status(Process, Stats) ->
+    Extra = #{process => Process, status => enabled},
+    maps:merge(Stats, Extra).
+
+%% When Key is present in Params, parse its list of node-name binaries
+%% into atoms and check availability. Returns {ok, Params} with the atoms
+%% stored under Key, {error, {invalid, Bins}} for names that are not
+%% existing atoms, or {error, {unavailable, Nodes}}. Params without Key
+%% pass through untouched.
+validate_nodes(Key, Params) when is_map_key(Key, Params) ->
+    {Known, Unknown} = classify_nodes(maps:get(Key, Params), [], []),
+    case Unknown of
+        [] ->
+            case emqx_node_rebalance_evacuation:available_nodes(Known) of
+                Known -> {ok, Params#{Key => Known}};
+                Available -> {error, {unavailable, Known -- Available}}
+            end;
+        _ ->
+            {error, {invalid, Unknown}}
+    end;
+validate_nodes(_Key, Params) ->
+    {ok, Params}.
+
+%% Split binaries into parsed node atoms and unparsable binaries.
+%% Both results are accumulated by prepending, i.e. in reverse input order.
+classify_nodes([], Known, Unknown) ->
+    {Known, Unknown};
+classify_nodes([BinNode | Rest], Known, Unknown) ->
+    case parse_node(BinNode) of
+        {ok, Node} -> classify_nodes(Rest, [Node | Known], Unknown);
+        {error, _} -> classify_nodes(Rest, Known, [BinNode | Unknown])
+    end.
+
+%% Run Fun with the parsed node, or answer 400 for an unknown node name.
+with_node(BinNode, Fun) ->
+    Parsed = parse_node(BinNode),
+    case Parsed of
+        {error, _} ->
+            error_response(400, ?BAD_REQUEST, [<<"Invalid node: ">>, BinNode]);
+        {ok, Node} ->
+            Fun(Node)
+    end.
+
+%% Validate the node list stored under Key and run Fun with the updated
+%% params, or answer 400 describing the offending nodes.
+with_nodes_at_key(Key, Params, Fun) ->
+    case validate_nodes(Key, Params) of
+        {ok, Checked} ->
+            Fun(Checked);
+        {error, {unavailable, Nodes}} ->
+            Message = io_lib:format("Nodes unavailable: ~p", [Nodes]),
+            error_response(400, ?NODE_UNAVAILABLE, Message);
+        {error, {invalid, Nodes}} ->
+            Message = io_lib:format("Invalid nodes: ~p", [Nodes]),
+            error_response(400, ?BAD_REQUEST, Message)
+    end.
+
+%% Convert a binary to an atom only if that atom already exists.
+parse_node(Bin) when is_binary(Bin) ->
+    try binary_to_existing_atom(Bin) of
+        Node -> {ok, Node}
+    catch
+        error:badarg ->
+            {error, {unknown, Bin}}
+    end.
+
+%% Turn [{Node, InfoMap}] into a list of maps carrying `node => Node'.
+format_as_map_list(List) ->
+    AttachNode = fun({Node, Info}) -> Info#{node => Node} end,
+    lists:map(AttachNode, List).
+
+%% Build an {HttpCode, Body} error reply with binary code and message.
+error_response(HttpCode, Code, Message) ->
+    Body = #{
+        code => atom_to_binary(Code),
+        message => iolist_to_binary(Message)
+    },
+    {HttpCode, Body}.
+
+%% Drop the {Key, _} entries whose key is listed in Keys.
+without(Keys, Props) ->
+    Keep = fun({Key, _}) -> not lists:member(Key, Keys) end,
+    lists:filter(Keep, Props).
+
+%%------------------------------------------------------------------------------
+%% Schema
+%%------------------------------------------------------------------------------
+
+%% Check Conf against the fields(Ref) schema of this module with
+%% hocon_tconf:check_plain/4 and return the checked value (atom keys).
+translate(Ref, Conf) ->
+    Options = #{atom_key => true},
+    #{Ref := TranslatedConf} = hocon_tconf:check_plain(
+        ?MODULE, #{atom_to_binary(Ref) => Conf}, Options, [Ref]
+    ),
+    TranslatedConf.
+
+%% The required `node' path parameter shared by the :node endpoints.
+param_node() ->
+    {
+        node,
+        mk(binary(), #{
+            in => path,
+            desc => ?DESC(param_node),
+            required => true
+        })
+    }.
+
+%% Request body of POST /load_rebalance/:node/start; every field is optional.
+fields(rebalance_start) ->
+    [
+        {"wait_health_check",
+            mk(
+                emqx_schema:duration_s(),
+                #{
+                    desc => ?DESC(wait_health_check),
+                    required => false
+                }
+            )},
+        {"conn_evict_rate",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(conn_evict_rate),
+                    required => false
+                }
+            )},
+        {"sess_evict_rate",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(sess_evict_rate),
+                    required => false
+                }
+            )},
+        {"abs_conn_threshold",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(abs_conn_threshold),
+                    required => false
+                }
+            )},
+        {"rel_conn_threshold",
+            mk(
+                number(),
+                #{
+                    desc => ?DESC(rel_conn_threshold),
+                    required => false,
+                    %% relative thresholds must be strictly greater than 1.0
+                    validator => [fun(Value) -> Value > 1.0 end]
+                }
+            )},
+        {"abs_sess_threshold",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(abs_sess_threshold),
+                    required => false
+                }
+            )},
+        {"rel_sess_threshold",
+            mk(
+                number(),
+                #{
+                    desc => ?DESC(rel_sess_threshold),
+                    required => false,
+                    validator => [fun(Value) -> Value > 1.0 end]
+                }
+            )},
+        {"wait_takeover",
+            mk(
+                emqx_schema:duration_s(),
+                #{
+                    desc => ?DESC(wait_takeover),
+                    required => false
+                }
+            )},
+        {"nodes",
+            mk(
+                list(binary()),
+                #{
+                    desc => ?DESC(rebalance_nodes),
+                    required => false,
+                    %% when given, the list must not be empty
+                    validator => [fun(Values) -> length(Values) > 0 end]
+                }
+            )}
+    ];
+%% Request body of POST /load_rebalance/:node/evacuation/start.
+%% NOTE(review): wait_takeover is pos_integer() here but
+%% emqx_schema:duration_s() in rebalance_start — confirm this is intended.
+fields(rebalance_evacuation_start) ->
+    [
+        {"conn_evict_rate",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(conn_evict_rate),
+                    required => false
+                }
+            )},
+        {"sess_evict_rate",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(sess_evict_rate),
+                    required => false
+                }
+            )},
+        {"redirect_to",
+            mk(
+                binary(),
+                #{
+                    desc => ?DESC(redirect_to),
+                    required => false
+                }
+            )},
+        {"wait_takeover",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(wait_takeover),
+                    required => false
+                }
+            )},
+        {"migrate_to",
+            mk(
+                list(binary()),
+                #{
+                    desc => ?DESC(migrate_to),
+                    required => false,
+                    validator => [fun(Values) -> length(Values) > 0 end]
+                }
+            )}
+    ];
+%% Local status response while nothing is running.
+%% NOTE(review): the description key is local_status_enabled, the same as
+%% in the enabled variant — confirm this is not a copy-paste slip.
+fields(local_status_disabled) ->
+    [
+        {"status",
+            mk(
+                disabled,
+                #{
+                    desc => ?DESC(local_status_enabled),
+                    required => true
+                }
+            )}
+    ];
+%% Local status response while a rebalance or evacuation is running.
+fields(local_status_enabled) ->
+    [
+        {"status",
+            mk(
+                enabled,
+                #{
+                    desc => ?DESC(local_status_enabled),
+                    required => true
+                }
+            )},
+        {"process",
+            mk(
+                hoconsc:union([rebalance, evacuation]),
+                #{
+                    desc => ?DESC(local_status_process),
+                    required => true
+                }
+            )},
+        {"state",
+            mk(
+                atom(),
+                #{
+                    desc => ?DESC(local_status_state),
+                    required => true
+                }
+            )},
+        {"coordinator_node",
+            mk(
+                binary(),
+                #{
+                    desc => ?DESC(local_status_coordinator_node),
+                    required => false
+                }
+            )},
+        {"connection_eviction_rate",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(local_status_connection_eviction_rate),
+                    required => false
+                }
+            )},
+        {"session_eviction_rate",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(local_status_session_eviction_rate),
+                    required => false
+                }
+            )},
+        {"connection_goal",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(local_status_connection_goal),
+                    required => false
+                }
+            )},
+        {"session_goal",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(local_status_session_goal),
+                    required => false
+                }
+            )},
+        {"disconnected_session_goal",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(local_status_disconnected_session_goal),
+                    required => false
+                }
+            )},
+        {"session_recipients",
+            mk(
+                list(binary()),
+                #{
+                    desc => ?DESC(local_status_session_recipients),
+                    required => false
+                }
+            )},
+        {"recipients",
+            mk(
+                list(binary()),
+                #{
+                    desc => ?DESC(local_status_recipients),
+                    required => false
+                }
+            )},
+        {"stats",
+            mk(
+                ref(status_stats),
+                #{
+                    desc => ?DESC(local_status_stats),
+                    required => false
+                }
+            )}
+    ];
+%% Connection/session counters nested under "stats" in the local status.
+fields(status_stats) ->
+    [
+        {"initial_connected",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(status_stats_initial_connected),
+                    required => true
+                }
+            )},
+        {"current_connected",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(status_stats_current_connected),
+                    required => true
+                }
+            )},
+        {"initial_sessions",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(status_stats_initial_sessions),
+                    required => true
+                }
+            )},
+        {"current_sessions",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(status_stats_current_sessions),
+                    required => true
+                }
+            )},
+        {"current_disconnected_sessions",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(status_stats_current_disconnected_sessions),
+                    required => false
+                }
+            )}
+    ];
+%% Coordinator entry of the global status: the enabled local status minus
+%% the listed fields, plus donor information and the node name.
+fields(global_coordinator_status) ->
+    without(
+        ["status", "process", "session_goal", "session_recipients", "stats"],
+        fields(local_status_enabled)
+    ) ++
+        [
+            {"donors",
+                mk(
+                    list(binary()),
+                    #{
+                        desc => ?DESC(coordinator_status_donors),
+                        required => false
+                    }
+                )},
+            {"donor_conn_avg",
+                mk(
+                    non_neg_integer(),
+                    #{
+                        desc => ?DESC(coordinator_status_donor_conn_avg),
+                        required => false
+                    }
+                )},
+            {"donor_sess_avg",
+                mk(
+                    non_neg_integer(),
+                    #{
+                        desc => ?DESC(coordinator_status_donor_sess_avg),
+                        required => false
+                    }
+                )},
+            {"node",
+                mk(
+                    binary(),
+                    #{
+                        desc => ?DESC(coordinator_status_node),
+                        required => true
+                    }
+                )}
+        ];
+%% Evacuation entry of the global status: the enabled local status minus
+%% "status" and "process", plus the node name.
+fields(global_evacuation_status) ->
+    without(["status", "process"], fields(local_status_enabled)) ++
+        [
+            {"node",
+                mk(
+                    binary(),
+                    #{
+                        desc => ?DESC(evacuation_status_node),
+                        required => true
+                    }
+                )}
+        ];
+%% Global status: lists of evacuation and coordinator entries.
+fields(global_status) ->
+    [
+        {"evacuations",
+            mk(
+                hoconsc:array(ref(global_evacuation_status)),
+                #{
+                    desc => ?DESC(global_status_evacuations),
+                    required => false
+                }
+            )},
+        {"rebalances",
+            mk(
+                hoconsc:array(ref(global_coordinator_status)),
+                #{
+                    desc => ?DESC(global_status_rebalances),
+                    required => false
+                }
+            )}
+    ].
+
+%% Example request body for the rebalance start endpoint.
+rebalance_example() ->
+    #{
+        wait_health_check => 10,
+        conn_evict_rate => 10,
+        sess_evict_rate => 20,
+        abs_conn_threshold => 10,
+        rel_conn_threshold => 1.5,
+        abs_sess_threshold => 10,
+        rel_sess_threshold => 1.5,
+        wait_takeover => 10,
+        nodes => [<<"othernode@127.0.0.1">>]
+    }.
+
+%% Example request body for the evacuation start endpoint.
+rebalance_evacuation_example() ->
+    #{
+        conn_evict_rate => 100,
+        sess_evict_rate => 100,
+        redirect_to => <<"othernode:1883">>,
+        wait_takeover => 10,
+        migrate_to => [<<"othernode@127.0.0.1">>]
+    }.
+
+%% The local status response is either the disabled or the enabled variant.
+local_status_response_schema() ->
+    hoconsc:union([ref(local_status_disabled), ref(local_status_enabled)]).
+
+%% Generic map response used by the other endpoints.
+response_schema() ->
+    mk(
+        map(),
+        #{
+            desc => ?DESC(empty_response)
+        }
+    ).
+
+roots() -> [].
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_app.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_app.erl
new file mode 100644
index 000000000..3cd59e0f4
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_app.erl
@@ -0,0 +1,22 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+%% Application callback module: starts the supervisor and registers the
+%% `rebalance' CLI command.
+-module(emqx_node_rebalance_app).
+
+-behaviour(application).
+
+-emqx_plugin(?MODULE).
+
+-export([
+    start/2,
+    stop/1
+]).
+
+%% Start the top supervisor, then load the CLI command.
+start(_Type, _Args) ->
+    {ok, Sup} = emqx_node_rebalance_sup:start_link(),
+    ok = emqx_node_rebalance_cli:load(),
+    {ok, Sup}.
+
+%% Unload the CLI command on application stop.
+stop(_State) ->
+    emqx_node_rebalance_cli:unload().
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl
new file mode 100644
index 000000000..a2706f13b
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl
@@ -0,0 +1,305 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_cli).
+
+%% APIs
+-export([
+    load/0,
+    unload/0,
+    cli/1
+]).
+
+%% Register the `rebalance' command with emqx_ctl.
+load() ->
+    emqx_ctl:register_command(rebalance, {?MODULE, cli}, []).
+
+%% Unregister the `rebalance' command.
+unload() ->
+    emqx_ctl:unregister_command(rebalance).
+
+%% Entry point of `emqx_ctl rebalance ...'.
+%% `start' and `stop' return true on success and false otherwise.
+cli(["start" | StartArgs]) ->
+    case start_args(StartArgs) of
+        {evacuation, Opts} ->
+            case emqx_node_rebalance_evacuation:status() of
+                disabled ->
+                    %% start/1 may still fail (e.g. eviction_agent_busy when
+                    %% the eviction agent is held by another process, or
+                    %% already_started on a race): report it instead of
+                    %% crashing the command on a badmatch.
+                    case emqx_node_rebalance_evacuation:start(Opts) of
+                        ok ->
+                            emqx_ctl:print("Rebalance(evacuation) started~n"),
+                            true;
+                        {error, Reason} ->
+                            emqx_ctl:print(
+                                "Rebalance(evacuation) start error: ~p~n", [Reason]
+                            ),
+                            false
+                    end;
+                {enabled, _} ->
+                    emqx_ctl:print("Rebalance is already enabled~n"),
+                    false
+            end;
+        {rebalance, Opts} ->
+            case emqx_node_rebalance:start(Opts) of
+                ok ->
+                    emqx_ctl:print("Rebalance started~n"),
+                    true;
+                {error, Reason} ->
+                    emqx_ctl:print("Rebalance start error: ~p~n", [Reason]),
+                    false
+            end;
+        {error, Error} ->
+            emqx_ctl:print("Rebalance start error: ~s~n", [Error]),
+            false
+    end;
+cli(["node-status", NodeStr]) ->
+    case emqx_misc:safe_to_existing_atom(NodeStr, utf8) of
+        {ok, Node} ->
+            node_status(emqx_node_rebalance_status:local_status(Node));
+        {error, _} ->
+            emqx_ctl:print("Node status error: invalid node~n"),
+            false
+    end;
+cli(["node-status"]) ->
+    node_status(emqx_node_rebalance_status:local_status());
+cli(["status"]) ->
+    #{
+        evacuations := Evacuations,
+        rebalances := Rebalances
+    } = emqx_node_rebalance_status:global_status(),
+    lists:foreach(
+        fun({Node, Status}) ->
+            emqx_ctl:print(
+                "--------------------------------------------------------------------~n"
+            ),
+            emqx_ctl:print(
+                "Node ~p: evacuation~n~s",
+                [Node, emqx_node_rebalance_status:format_local_status(Status)]
+            )
+        end,
+        Evacuations
+    ),
+    lists:foreach(
+        fun({Node, Status}) ->
+            emqx_ctl:print(
+                "--------------------------------------------------------------------~n"
+            ),
+            emqx_ctl:print(
+                "Node ~p: rebalance coordinator~n~s",
+                [Node, emqx_node_rebalance_status:format_coordinator_status(Status)]
+            )
+        end,
+        Rebalances
+    );
+cli(["stop"]) ->
+    %% Evacuation takes precedence; otherwise try to stop a rebalance.
+    case emqx_node_rebalance_evacuation:status() of
+        {enabled, _} ->
+            %% The process may have stopped between status/0 and stop/0.
+            case emqx_node_rebalance_evacuation:stop() of
+                ok ->
+                    emqx_ctl:print("Rebalance(evacuation) stopped~n"),
+                    true;
+                {error, Reason} ->
+                    emqx_ctl:print("Rebalance(evacuation) stop error: ~p~n", [Reason]),
+                    false
+            end;
+        disabled ->
+            case emqx_node_rebalance:status() of
+                {enabled, _} ->
+                    case emqx_node_rebalance:stop() of
+                        ok ->
+                            emqx_ctl:print("Rebalance stopped~n"),
+                            true;
+                        {error, Reason} ->
+                            emqx_ctl:print("Rebalance stop error: ~p~n", [Reason]),
+                            false
+                    end;
+                disabled ->
+                    emqx_ctl:print("Rebalance is already disabled~n"),
+                    false
+            end
+    end;
+cli(_) ->
+    emqx_ctl:usage(
+        [
+            {
+                "rebalance start --evacuation \\\n"
+                " [--redirect-to \"Host1:Port1 Host2:Port2 ...\"] \\\n"
+                " [--conn-evict-rate CountPerSec] \\\n"
+                " [--migrate-to \"node1@host1 node2@host2 ...\"] \\\n"
+                " [--wait-takeover Secs] \\\n"
+                " [--sess-evict-rate CountPerSec]",
+                "Start current node evacuation with optional server redirect to the specified servers"
+            },
+
+            %% --conn-evict-rate used to be listed twice here; it is one option.
+            {
+                "rebalance start \\\n"
+                " [--nodes \"node1@host1 node2@host2\"] \\\n"
+                " [--wait-health-check Secs] \\\n"
+                " [--conn-evict-rate ConnPerSec] \\\n"
+                " [--abs-conn-threshold Count] \\\n"
+                " [--rel-conn-threshold Fraction] \\\n"
+                " [--wait-takeover Secs] \\\n"
+                " [--sess-evict-rate CountPerSec] \\\n"
+                " [--abs-sess-threshold Count] \\\n"
+                " [--rel-sess-threshold Fraction]",
+                "Start rebalance on the specified nodes using the current node as the coordinator"
+            },
+
+            {"rebalance node-status", "Get current node rebalance status"},
+
+            {"rebalance node-status \"node1@host1\"", "Get remote node rebalance status"},
+
+            {"rebalance status",
+                "Get statuses of all current rebalance/evacuation processes across the cluster"},
+
+            {"rebalance stop", "Stop node rebalance"}
+        ]
+    ).
+
+%% Print the status of one node as returned by
+%% emqx_node_rebalance_status:local_status/0,1.
+node_status({Process, Status}) when Process =:= evacuation; Process =:= rebalance ->
+    emqx_ctl:print(
+        "Rebalance type: ~p~n~s~n",
+        [Process, emqx_node_rebalance_status:format_local_status(Status)]
+    );
+node_status(disabled) ->
+    emqx_ctl:print("Rebalance disabled~n");
+node_status(Other) ->
+    emqx_ctl:print("Error detecting rebalance status: ~p~n", [Other]).
+
+%% Parse `start' arguments into {evacuation, Opts}, {rebalance, Opts} or
+%% {error, Message}. The presence of --evacuation selects the mode.
+start_args(Args) ->
+    case collect_args(Args, #{}) of
+        {ok, #{"--evacuation" := true} = Collected} ->
+            Checked = validate_evacuation(maps:to_list(Collected), #{}),
+            tag_validated(evacuation, Checked);
+        {ok, #{} = Collected} ->
+            Checked = validate_rebalance(maps:to_list(Collected), #{}),
+            tag_validated(rebalance, Checked);
+        {error, _} = Error ->
+            Error
+    end.
+
+%% Tag successfully validated options with the mode; pass errors through.
+tag_validated(Mode, {ok, Validated}) -> {Mode, Validated};
+tag_validated(_Mode, {error, _} = Error) -> Error.
+
+%% Collect raw command line arguments into a map keyed by the flag string.
+%% --evacuation is a bare switch; every other known flag consumes the next
+%% argument as its value. Anything else yields an "unknown arguments" error
+%% listing the remaining arguments.
+collect_args([], Map) ->
+    {ok, Map};
+collect_args(["--evacuation" | Args], Map) ->
+    collect_args(Args, Map#{"--evacuation" => true});
+collect_args([Flag, Value | Args] = Remaining, Map) ->
+    case flag_takes_value(Flag) of
+        true ->
+            collect_args(Args, Map#{Flag => Value});
+        false ->
+            {error, io_lib:format("unknown arguments: ~p", [Remaining])}
+    end;
+collect_args(Args, _Map) ->
+    {error, io_lib:format("unknown arguments: ~p", [Args])}.
+
+%% Flags that are followed by a value.
+flag_takes_value(Flag) ->
+    lists:member(Flag, [
+        %% evacuation
+        "--redirect-to",
+        "--migrate-to",
+        %% rebalance
+        "--nodes",
+        "--wait-health-check",
+        "--abs-conn-threshold",
+        "--rel-conn-threshold",
+        "--abs-sess-threshold",
+        "--rel-sess-threshold",
+        %% common
+        "--conn-evict-rate",
+        "--wait-takeover",
+        "--sess-evict-rate"
+    ]).
+
+%% Validate collected evacuation arguments one by one, building the option
+%% map for emqx_node_rebalance_evacuation:start/1.
+validate_evacuation([], Map) ->
+    {ok, Map};
+validate_evacuation([{"--evacuation", _} | Rest], Map) ->
+    validate_evacuation(Rest, Map);
+validate_evacuation([{"--redirect-to", ServerReference} | Rest], Map) ->
+    validate_evacuation(Rest, Map#{server_reference => list_to_binary(ServerReference)});
+validate_evacuation([{"--migrate-to", MigrateTo} | Rest], Map) ->
+    case strings_to_atoms(string:tokens(MigrateTo, ", ")) of
+        {Nodes, []} ->
+            case emqx_node_rebalance_evacuation:available_nodes(Nodes) of
+                [] ->
+                    {error, "invalid --migrate-to, no nodes"};
+                Nodes ->
+                    validate_evacuation(Rest, Map#{migrate_to => Nodes});
+                Available ->
+                    {error,
+                        io_lib:format(
+                            "invalid --migrate-to, unavailable nodes: ~p",
+                            [Nodes -- Available]
+                        )}
+            end;
+        {_, Invalid} ->
+            {error, io_lib:format("invalid --migrate-to, invalid nodes: ~p", [Invalid])}
+    end;
+validate_evacuation([{Option, _} | _] = Opts, Map) ->
+    case evacuation_pos_int_opt(Option) of
+        {ok, Name} ->
+            validate_pos_int(Name, Opts, Map, fun validate_evacuation/2);
+        error ->
+            {error, io_lib:format("unknown evacuation arguments: ~p", [Opts])}
+    end;
+validate_evacuation(Rest, _Map) ->
+    {error, io_lib:format("unknown evacuation arguments: ~p", [Rest])}.
+
+%% Evacuation flags whose value must be a positive integer, with the
+%% option key they are stored under.
+evacuation_pos_int_opt("--conn-evict-rate") -> {ok, conn_evict_rate};
+evacuation_pos_int_opt("--sess-evict-rate") -> {ok, sess_evict_rate};
+evacuation_pos_int_opt("--wait-takeover") -> {ok, wait_takeover};
+evacuation_pos_int_opt(_) -> error.
+
+%% Validate collected rebalance arguments one by one, building the option
+%% map for emqx_node_rebalance:start/1.
+validate_rebalance([], Map) ->
+    {ok, Map};
+validate_rebalance([{"--nodes", NodeStr} | Rest], Map) ->
+    case strings_to_atoms(string:tokens(NodeStr, ", ")) of
+        {Nodes, []} ->
+            case emqx_node_rebalance:available_nodes(Nodes) of
+                [] ->
+                    {error, "invalid --nodes, no nodes"};
+                Nodes ->
+                    validate_rebalance(Rest, Map#{nodes => Nodes});
+                Available ->
+                    {error,
+                        io_lib:format(
+                            "invalid --nodes, unavailable nodes: ~p",
+                            [Nodes -- Available]
+                        )}
+            end;
+        {_, Invalid} ->
+            {error, io_lib:format("invalid --nodes, invalid nodes: ~p", [Invalid])}
+    end;
+validate_rebalance([{Option, _} | _] = Opts, Map) ->
+    case rebalance_scalar_opt(Option) of
+        {pos_int, Name} ->
+            validate_pos_int(Name, Opts, Map, fun validate_rebalance/2);
+        {fraction, Name} ->
+            validate_fraction(Name, Opts, Map, fun validate_rebalance/2);
+        unknown ->
+            {error, io_lib:format("unknown rebalance arguments: ~p", [Opts])}
+    end;
+validate_rebalance(Rest, _Map) ->
+    {error, io_lib:format("unknown rebalance arguments: ~p", [Rest])}.
+
+%% Scalar rebalance flags: the validator kind and the option key.
+rebalance_scalar_opt("--wait-health-check") -> {pos_int, wait_health_check};
+rebalance_scalar_opt("--conn-evict-rate") -> {pos_int, conn_evict_rate};
+rebalance_scalar_opt("--sess-evict-rate") -> {pos_int, sess_evict_rate};
+rebalance_scalar_opt("--abs-conn-threshold") -> {pos_int, abs_conn_threshold};
+rebalance_scalar_opt("--rel-conn-threshold") -> {fraction, rel_conn_threshold};
+rebalance_scalar_opt("--abs-sess-threshold") -> {pos_int, abs_sess_threshold};
+rebalance_scalar_opt("--rel-sess-threshold") -> {fraction, rel_sess_threshold};
+rebalance_scalar_opt("--wait-takeover") -> {pos_int, wait_takeover};
+rebalance_scalar_opt(_) -> unknown.
+
+%% Validate a relative threshold: a number strictly greater than 1.0.
+%% Both "1.5" and "2" are accepted (string:to_float/1 alone rejects
+%% integer notation); the value is stored as a float under Name and
+%% validation continues with Next(Rest, Map).
+validate_fraction(Name, [{OptionName, Value} | Rest], Map, Next) ->
+    case parse_fraction(Value) of
+        {ok, Num} when Num > 1.0 ->
+            Next(Rest, Map#{Name => Num});
+        _ ->
+            {error, "invalid " ++ OptionName ++ " value"}
+    end.
+
+%% Parse a complete string as a float, falling back to integer notation.
+parse_fraction(Value) ->
+    case string:to_float(Value) of
+        {Float, ""} when is_float(Float) ->
+            {ok, Float};
+        _ ->
+            case string:to_integer(Value) of
+                {Int, ""} when is_integer(Int) -> {ok, float(Int)};
+                _ -> error
+            end
+    end.
+
+%% Validate a strictly positive integer option, store it under Name and
+%% continue with Next(Rest, Map).
+validate_pos_int(Name, [{OptionName, Value} | Rest], Map, Next) ->
+    case string:to_integer(Value) of
+        {Int, ""} when Int > 0 ->
+            Next(Rest, Map#{Name => Int});
+        _ ->
+            {error, "invalid " ++ OptionName ++ " value"}
+    end.
+
+%% Convert strings to already existing atoms.
+%% Returns {Atoms, InvalidStrings}, both in input order.
+strings_to_atoms(Strings) ->
+    strings_to_atoms(Strings, [], []).
+
+strings_to_atoms([], Atoms, Invalid) ->
+    {lists:reverse(Atoms), lists:reverse(Invalid)};
+strings_to_atoms([Str | Rest], Atoms, Invalid) ->
+    case emqx_misc:safe_to_existing_atom(Str, utf8) of
+        {ok, Atom} ->
+            strings_to_atoms(Rest, [Atom | Atoms], Invalid);
+        {error, _} ->
+            strings_to_atoms(Rest, Atoms, [Str | Invalid])
+    end.
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl
new file mode 100644
index 000000000..4de362ca9
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl
@@ -0,0 +1,308 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_evacuation).
+
+-include("emqx_node_rebalance.hrl").
+
+-include_lib("emqx/include/logger.hrl").
+-include_lib("emqx/include/types.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+-export([
+    start/1,
+    status/0,
+    stop/0
+]).
+
+-export([start_link/0]).
+
+-behaviour(gen_statem).
+
+-export([
+    init/1,
+    callback_mode/0,
+    handle_event/4,
+    code_change/4
+]).
+
+-export([
+    is_node_available/0,
+    available_nodes/1
+]).
+
+-export_type([
+    start_opts/0,
+    start_error/0
+]).
+
+-ifdef(TEST).
+-export([migrate_to/1]).
+-endif.
+
+%%--------------------------------------------------------------------
+%% APIs
+%%--------------------------------------------------------------------
+
+%% Retry delay (used as a state_timeout value) while there are no nodes to
+%% evacuate sessions to.
+-define(EVICT_INTERVAL_NO_NODES, 30000).
+
+-type migrate_to() :: [node()] | undefined.
+
+-type start_opts() :: #{
+    server_reference => emqx_eviction_agent:server_reference(),
+    conn_evict_rate => pos_integer(),
+    sess_evict_rate => pos_integer(),
+    wait_takeover => pos_integer(),
+    migrate_to => migrate_to()
+}.
+-type start_error() :: already_started | eviction_agent_busy.
+-type stats() :: #{
+    initial_conns := non_neg_integer(),
+    initial_sessions := non_neg_integer(),
+    current_conns := non_neg_integer(),
+    current_sessions := non_neg_integer(),
+    conn_evict_rate := pos_integer(),
+    sess_evict_rate := pos_integer(),
+    server_reference := emqx_eviction_agent:server_reference(),
+    migrate_to := migrate_to()
+}.
+-type status() :: {enabled, stats()} | disabled.
+
+%% Start evacuation of the local node. StartOpts override default_opts();
+%% the merged options are sent to the state machine synchronously.
+-spec start(start_opts()) -> ok_or_error(start_error()).
+start(StartOpts) ->
+    Opts = maps:merge(default_opts(), StartOpts),
+    gen_statem:call(?MODULE, {start, Opts}).
+
+%% Stop a running evacuation; {error, not_started} when none is running.
+-spec stop() -> ok_or_error(not_started).
+stop() ->
+    gen_statem:call(?MODULE, stop).
+
+%% Return `disabled' or `{enabled, Stats}' for the local node.
+-spec status() -> status().
+status() ->
+    gen_statem:call(?MODULE, status).
+
+%% Start the state machine, registered locally under the module name.
+-spec start_link() -> startlink_ret().
+start_link() ->
+    gen_statem:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+%% Query the given nodes through the proto module and keep only the atom
+%% entries of the result; non-atom entries are dropped.
+%% NOTE(review): presumably each node answers with its own name when it is
+%% available (see is_node_available/0) — confirm against the proto module.
+-spec available_nodes(list(node())) -> list(node()).
+available_nodes(Nodes) when is_list(Nodes) ->
+    {Available, _} = emqx_node_rebalance_evacuation_proto_v1:available_nodes(Nodes),
+    lists:filter(fun is_atom/1, Available).
+
+%%--------------------------------------------------------------------
+%% gen_statem callbacks
+%%--------------------------------------------------------------------
+
+%% All events are handled by handle_event/4.
+callback_mode() -> handle_event_function.
+
+%% states: disabled, evicting_conns, waiting_takeover, evicting_sessions, prohibiting
+
+%% Restores a persisted evacuation, if any. When persisted options are
+%% found and the eviction agent can be enabled, the machine resumes in
+%% `evicting_conns'; otherwise it starts in `disabled'.
+init([]) ->
+    case emqx_node_rebalance_evacuation_persist:read(default_opts()) of
+        {ok, #{server_reference := ServerReference} = Opts} ->
+            ?SLOG(warning, #{msg => "restoring_evacuation_state", opts => Opts}),
+            case emqx_eviction_agent:enable(?MODULE, ServerReference) of
+                ok ->
+                    Data = init_data(#{}, Opts),
+                    ok = warn_enabled(),
+                    {ok, evicting_conns, Data, [{state_timeout, 0, evict_conns}]};
+                {error, eviction_agent_busy} ->
+                    %% The agent is held by someone else: drop the persisted state.
+                    emqx_node_rebalance_evacuation_persist:clear(),
+                    {ok, disabled, #{}}
+            end;
+        none ->
+            {ok, disabled, #{}}
+    end.
+
+%% start
+handle_event(
+    {call, From},
+    {start, #{server_reference := ServerReference} = Opts},
+    disabled,
+    #{} = Data
+) ->
+    case emqx_eviction_agent:enable(?MODULE, ServerReference) of
+        ok ->
+            NewData = init_data(Data, Opts),
+            ok = emqx_node_rebalance_evacuation_persist:save(Opts),
+            ?SLOG(warning, #{
+                msg => "node_evacuation_started",
+                opts => Opts
+            }),
+            {next_state, evicting_conns, NewData, [
+                {state_timeout, 0, evict_conns},
+                {reply, From, ok}
+            ]};
+        {error, eviction_agent_busy} ->
+            {keep_state_and_data, [{reply, From, {error, eviction_agent_busy}}]}
+    end;
+handle_event({call, From}, {start, _Opts}, _State, #{}) ->
+    {keep_state_and_data, [{reply, From, {error, already_started}}]};
+%% stop
+handle_event({call, From}, stop, disabled, #{}) ->
+    {keep_state_and_data, [{reply, From, {error, not_started}}]};
+handle_event({call, From}, stop, _State, Data) ->
+    ok = emqx_node_rebalance_evacuation_persist:clear(),
+    _ = emqx_eviction_agent:disable(?MODULE),
+    ?SLOG(warning, #{msg => "node_evacuation_stopped"}),
+    {next_state, disabled, deinit(Data), [{reply, From, ok}]};
+%% status
+handle_event({call, From}, status, disabled, #{}) ->
+    {keep_state_and_data, [{reply, From, disabled}]};
+handle_event({call, From}, status, State, #{migrate_to := MigrateTo} = Data) ->
+    Stats = maps:with(
+        [
+            initial_conns,
+            current_conns,
+            initial_sessions,
+            current_sessions,
+            server_reference,
+            conn_evict_rate,
+            sess_evict_rate
+        ],
+        Data
+    ),
+    {keep_state_and_data, [
+        {reply, From, {enabled, Stats#{state => State, migrate_to => migrate_to(MigrateTo)}}}
+    ]};
+%% conn eviction
+handle_event(
+    state_timeout,
+    evict_conns,
+    evicting_conns,
+    #{
+        conn_evict_rate := ConnEvictRate,
+        wait_takeover := WaitTakeover
+    } = Data
+) ->
+    case emqx_eviction_agent:status() of
+        {enabled, #{connections := Conns}} when Conns > 0 ->
+            ok = emqx_eviction_agent:evict_connections(ConnEvictRate),
+            ?tp(debug, node_evacuation_evict_conn, #{conn_evict_rate => ConnEvictRate}),
+            ?SLOG(
+                warning,
+                #{
+                    msg => "node_evacuation_evict_conns",
+                    count => Conns,
+                    conn_evict_rate => ConnEvictRate
+                }
+            ),
+            NewData = Data#{current_conns => Conns},
+            {keep_state, NewData, [{state_timeout, ?EVICT_INTERVAL, evict_conns}]};
+        {enabled, #{connections := 0}} ->
+            NewData = Data#{current_conns => 0},
+            ?SLOG(warning, #{msg => "node_evacuation_evict_conns_done"}),
+            {next_state, waiting_takeover, NewData, [
+                {state_timeout, timer:seconds(WaitTakeover), evict_sessions}
+            ]}
+    end;
+handle_event(
+    state_timeout,
+    evict_sessions,
+    waiting_takeover,
+    Data
+) ->
+    ?SLOG(warning, #{msg => "node_evacuation_waiting_takeover_done"}),
+    {next_state, evicting_sessions, Data, [{state_timeout, 0, evict_sessions}]};
+%% session eviction
+handle_event(
+    state_timeout,
+    evict_sessions,
+    evicting_sessions,
+    #{
+        sess_evict_rate := SessEvictRate,
+        migrate_to := MigrateTo
+    } = Data
+) ->
+    case emqx_eviction_agent:status() of
+        {enabled, #{sessions := SessCount}} when SessCount > 0 ->
+            %% Record the count just read from the agent so that both the
+            %% log entries below and later `status' replies are up to date.
+            NewData = Data#{current_sessions => SessCount},
+            case migrate_to(MigrateTo) of
+                [] ->
+                    ?SLOG(warning, #{
+                        msg => "no_nodes_to_evacuate_sessions", session_count => SessCount
+                    }),
+                    {keep_state, NewData, [
+                        {state_timeout, ?EVICT_INTERVAL_NO_NODES, evict_sessions}
+                    ]};
+                Nodes ->
+                    ok = emqx_eviction_agent:evict_sessions(SessEvictRate, Nodes),
+                    ?SLOG(
+                        warning,
+                        #{
+                            msg => "node_evacuation_evict_sessions",
+                            session_count => SessCount,
+                            session_evict_rate => SessEvictRate,
+                            target_nodes => Nodes
+                        }
+                    ),
+                    {keep_state, NewData, [{state_timeout, ?EVICT_INTERVAL, evict_sessions}]}
+            end;
+        {enabled, #{sessions := 0}} ->
+            ?tp(debug, node_evacuation_evict_sess_over, #{}),
+            ?SLOG(warning, #{msg => "node_evacuation_evict_sessions_over"}),
+            NewData = Data#{current_sessions => 0},
+            {next_state, prohibiting, NewData}
+    end;
+%% Fallbacks: unexpected calls are answered with `ignored', unexpected
+%% info messages and casts are only logged.
+handle_event({call, From}, Msg, State, Data) ->
+    ?SLOG(warning, #{msg => "unknown_call", call => Msg, state => State, data => Data}),
+    {keep_state_and_data, [{reply, From, ignored}]};
+handle_event(info, Msg, State, Data) ->
+    ?SLOG(warning, #{msg => "unknown_info", info => Msg, state => State, data => Data}),
+    keep_state_and_data;
+handle_event(cast, Msg, State, Data) ->
+    ?SLOG(warning, #{msg => "unknown_cast", cast => Msg, state => State, data => Data}),
+    keep_state_and_data.
+
+code_change(_Vsn, State, Data, _Extra) ->
+    {ok, State, Data}.
+
+%%--------------------------------------------------------------------
+%% internal funs
+%%--------------------------------------------------------------------
+
+%% Option values used when the caller (or the persisted file) omits a key.
+default_opts() ->
+    #{
+        server_reference => undefined,
+        conn_evict_rate => ?DEFAULT_CONN_EVICT_RATE,
+        sess_evict_rate => ?DEFAULT_SESS_EVICT_RATE,
+        wait_takeover => ?DEFAULT_WAIT_TAKEOVER,
+        migrate_to => undefined
+    }.
+
+%% Merges the start options into the state data and records the initial
+%% and current connection/session counters reported by the eviction agent.
+%% The agent must already be enabled when this is called.
+init_data(Data0, Opts) ->
+    Data1 = maps:merge(Data0, Opts),
+    {enabled, #{connections := ConnCount, sessions := SessCount}} = emqx_eviction_agent:status(),
+    Data1#{
+        initial_conns => ConnCount,
+        current_conns => ConnCount,
+        initial_sessions => SessCount,
+        current_sessions => SessCount
+    }.
+
+%% Removes the counters and all start options from the state data.
+deinit(Data) ->
+    Keys =
+        [initial_conns, current_conns, initial_sessions, current_sessions] ++
+            maps:keys(default_opts()),
+    maps:without(Keys, Data).
+
+%% Logs a warning and prints a notice to standard error that evacuation
+%% is enabled on this node.
+warn_enabled() ->
+    ?SLOG(warning, #{msg => "node_evacuation_enabled"}),
+    io:format(
+        standard_error, "Node evacuation is enabled. The node will not receive connections.~n", []
+    ).
+
+%% Resolves the list of nodes sessions may be moved to. `undefined' means
+%% "every other running node"; in both cases only nodes that report
+%% themselves as available are returned.
+migrate_to(undefined) ->
+    migrate_to(all_nodes());
+migrate_to(Nodes) when is_list(Nodes) ->
+    available_nodes(Nodes).
+
+%% RPC target used by available_nodes/1. Returns node() when the local
+%% eviction agent is disabled; otherwise the match fails and the call
+%% crashes, so the caller receives a non-atom result for this node.
+is_node_available() ->
+    disabled = emqx_eviction_agent:status(),
+    node().
+
+%% All running nodes except the local one.
+all_nodes() ->
+    mria_mnesia:running_nodes() -- [node()].
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl
new file mode 100644
index 000000000..3fc9faeea
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl
@@ -0,0 +1,120 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_evacuation_persist).
+
+-export([
+    save/1,
+    clear/0,
+    read/1
+]).
+
+-ifdef(TEST).
+-export([evacuation_filepath/0]).
+-endif.
+
+-include("emqx_node_rebalance.hrl").
+-include_lib("emqx/include/types.hrl").
+
+%%--------------------------------------------------------------------
+%% APIs
+%%--------------------------------------------------------------------
+
+%% do not persist `migrate_to`:
+%% * after restart there is nothing to migrate
+%% * this value may be invalid after node was offline
+-type persisted_start_opts() :: #{
+    server_reference => emqx_eviction_agent:server_reference(),
+    conn_evict_rate => pos_integer(),
+    sess_evict_rate => pos_integer(),
+    wait_takeover => pos_integer()
+}.
+%% Full set of evacuation start options, as accepted and returned by read/1.
+-type start_opts() :: #{
+    server_reference => emqx_eviction_agent:server_reference(),
+    conn_evict_rate => pos_integer(),
+    sess_evict_rate => pos_integer(),
+    wait_takeover => pos_integer(),
+    migrate_to => emqx_node_rebalance_evacuation:migrate_to()
+}.
+
+%% Writes the persistable subset of the evacuation options (see
+%% persist_keys/0) as pretty-printed JSON to the evacuation file, creating
+%% the parent directory if needed. Options that do not satisfy the guard
+%% raise a function_clause error.
+-spec save(persisted_start_opts()) -> ok_or_error(term()).
+save(
+    #{
+        server_reference := ServerReference,
+        conn_evict_rate := ConnEvictRate,
+        sess_evict_rate := SessEvictRate,
+        wait_takeover := WaitTakeover
+    } = Data
+) when
+    (is_binary(ServerReference) orelse ServerReference =:= undefined) andalso
+        is_integer(ConnEvictRate) andalso ConnEvictRate > 0 andalso
+        is_integer(SessEvictRate) andalso SessEvictRate > 0 andalso
+        is_integer(WaitTakeover) andalso WaitTakeover >= 0
+->
+    Filepath = evacuation_filepath(),
+    case filelib:ensure_dir(Filepath) of
+        ok ->
+            JsonData = emqx_json:encode(
+                prepare_for_encode(maps:with(persist_keys(), Data)),
+                [pretty]
+            ),
+            file:write_file(Filepath, JsonData);
+        {error, _} = Error ->
+            Error
+    end.
+
+%% Removes the persisted evacuation state. A missing file counts as
+%% success, so callers matching on `ok' do not crash when there is nothing
+%% to remove. Any other file error is returned unchanged.
+-spec clear() -> ok_or_error(term()).
+clear() ->
+    case file:delete(evacuation_filepath()) of
+        ok ->
+            ok;
+        {error, enoent} ->
+            ok;
+        {error, _} = Error ->
+            Error
+    end.
+
+%% Reads the persisted evacuation options.
+%% Returns `none' when the file cannot be read. When the file exists but
+%% does not contain a JSON object, `DefaultOpts' is returned as is;
+%% otherwise values missing from the file are taken from `DefaultOpts'.
+-spec read(start_opts()) -> {ok, start_opts()} | none.
+read(DefaultOpts) ->
+    case file:read_file(evacuation_filepath()) of
+        {ok, Data} ->
+            case emqx_json:safe_decode(Data, [return_maps]) of
+                {ok, Map} when is_map(Map) ->
+                    {ok, map_to_opts(DefaultOpts, Map)};
+                _NotAMap ->
+                    {ok, DefaultOpts}
+            end;
+        {error, _} ->
+            none
+    end.
+
+%%--------------------------------------------------------------------
+%% Internal funcs
+%%--------------------------------------------------------------------
+
+%% Option keys that are written to the evacuation file.
+persist_keys() ->
+    [
+        server_reference,
+        conn_evict_rate,
+        sess_evict_rate,
+        wait_takeover
+    ].
+
+%% An undefined server reference is stored as JSON null.
+prepare_for_encode(#{server_reference := undefined} = Data) ->
+    Data#{server_reference => null};
+prepare_for_encode(Data) ->
+    Data.
+
+%% Inverse of prepare_for_encode/1.
+format_after_decode(#{server_reference := null} = Data) ->
+    Data#{server_reference => undefined};
+format_after_decode(Data) ->
+    Data.
+
+%% Builds an options map with the keys of `DefaultOpts', taking each value
+%% from the decoded JSON `Map' (binary keys) when present.
+map_to_opts(DefaultOpts, Map) ->
+    format_after_decode(
+        map_to_opts(
+            maps:to_list(DefaultOpts), Map, #{}
+        )
+    ).
+
+%% Folds over the `{Key, Default}' pairs, looking every key up in the
+%% decoded JSON map under its binary name.
+map_to_opts(Defaults, Map, Opts0) ->
+    lists:foldl(
+        fun({Key, DefaultVal}, Acc) ->
+            Acc#{Key => maps:get(atom_to_binary(Key), Map, DefaultVal)}
+        end,
+        Opts0,
+        Defaults
+    ).
+
+%% Location of the evacuation state file inside the EMQX data directory.
+evacuation_filepath() ->
+    filename:join([emqx:data_dir(), ?EVACUATION_FILENAME]).
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_status.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_status.erl
new file mode 100644
index 000000000..63675a3da
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_status.erl
@@ -0,0 +1,238 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_status).
+
+-export([
+    local_status/0,
+    local_status/1,
+    global_status/0,
+    format_local_status/1,
+    format_coordinator_status/1
+]).
+
+%% For RPC
+-export([
+    evacuation_status/0,
+    rebalance_status/0
+]).
+
+%%--------------------------------------------------------------------
+%% APIs
+%%--------------------------------------------------------------------
+
+%% Status of the local node: an evacuation takes precedence; otherwise the
+%% rebalance this node takes part in (if any) is reported.
+-spec local_status() -> disabled | {evacuation, map()} | {rebalance, map()}.
+local_status() ->
+    case emqx_node_rebalance_evacuation:status() of
+        {enabled, EvacuationStatus} ->
+            {evacuation, evacuation(EvacuationStatus)};
+        disabled ->
+            local_rebalance_status(emqx_node_rebalance_agent:status())
+    end.
+
+%% Resolves the rebalance status through the coordinator the local agent
+%% is attached to.
+local_rebalance_status({enabled, CoordinatorPid}) ->
+    case emqx_node_rebalance:status(CoordinatorPid) of
+        {enabled, RebalanceStatus} ->
+            local_rebalance(RebalanceStatus, node());
+        disabled ->
+            disabled
+    end;
+local_rebalance_status(disabled) ->
+    disabled.
+
+%% Same as local_status/0, evaluated on `Node'.
+-spec local_status(node()) -> disabled | {evacuation, map()} | {rebalance, map()}.
+local_status(Node) ->
+    emqx_node_rebalance_status_proto_v1:local_status(Node).
+
+%% Renders a local status map as human readable text.
+-spec format_local_status(map()) -> iodata().
+format_local_status(Status) ->
+    format_status(Status, local_status_field_format_order()).
+
+%% Collects the rebalances and evacuations that are enabled on any running
+%% node. Replies that do not have the `{Node, {enabled, Status}}' shape are
+%% skipped.
+-spec global_status() -> #{rebalances := [{node(), map()}], evacuations := [{node(), map()}]}.
+global_status() ->
+    RunningNodes = mria_mnesia:running_nodes(),
+    {RebalanceReplies, _} = emqx_node_rebalance_status_proto_v1:rebalance_status(RunningNodes),
+    Rebalances = [
+        {Node, coordinator_rebalance(RebalanceStatus)}
+     || {Node, {enabled, RebalanceStatus}} <- RebalanceReplies
+    ],
+    {EvacuationReplies, _} = emqx_node_rebalance_status_proto_v1:evacuation_status(RunningNodes),
+    Evacuations = [
+        {Node, evacuation(EvacuationStatus)}
+     || {Node, {enabled, EvacuationStatus}} <- EvacuationReplies
+    ],
+    #{rebalances => Rebalances, evacuations => Evacuations}.
+
+%% Renders a coordinator status map as human readable text.
+-spec format_coordinator_status(map()) -> iodata().
+format_coordinator_status(Status) ->
+    format_status(Status, coordinator_status_field_format_order()).
+
+%%--------------------------------------------------------------------
+%% Internal functions
+%%--------------------------------------------------------------------
+
+%% Translates the evacuation state machine status into the reporting
+%% format. Both goals are 0 for an evacuation.
+evacuation(Status) ->
+    State = maps:get(state, Status),
+    ConnEvictRate = maps:get(conn_evict_rate, Status),
+    SessEvictRate = maps:get(sess_evict_rate, Status),
+    Recipients = maps:get(migrate_to, Status),
+    Stats = #{
+        initial_connected => maps:get(initial_conns, Status),
+        current_connected => maps:get(current_conns, Status),
+        initial_sessions => maps:get(initial_sessions, Status),
+        current_sessions => maps:get(current_sessions, Status)
+    },
+    #{
+        state => State,
+        connection_eviction_rate => ConnEvictRate,
+        session_eviction_rate => SessEvictRate,
+        connection_goal => 0,
+        session_goal => 0,
+        session_recipients => Recipients,
+        stats => Stats
+    }.
+
+%% Only donor nodes report a local rebalance status.
+local_rebalance(#{donors := Donors} = Stats, Node) ->
+    case lists:member(Node, Donors) of
+        false -> disabled;
+        true -> {rebalance, donor_rebalance(Stats, Node)}
+    end.
+
+%% Reporting view of a rebalance as seen from donor node `Node': the
+%% coordinator's settings plus this node's initial and current channel
+%% counters. Goal fields are present only when the coordinator status
+%% contains the corresponding averages.
+donor_rebalance(Status, Node) ->
+    Opts = maps:get(opts, Status),
+    InitialConnCounts = maps:get(initial_conn_counts, Status),
+    InitialSessCounts = maps:get(initial_sess_counts, Status),
+    InitialConnected = maps:get(Node, InitialConnCounts),
+    InitialSessions = maps:get(Node, InitialSessCounts),
+    CurrentConnected = emqx_eviction_agent:connection_count(),
+    CurrentSessions = emqx_eviction_agent:session_count(),
+    CurrentDisconnected = emqx_eviction_agent:session_count(disconnected),
+    Required = #{
+        state => maps:get(state, Status),
+        coordinator_node => maps:get(coordinator_node, Status),
+        connection_eviction_rate => maps:get(conn_evict_rate, Opts),
+        session_eviction_rate => maps:get(sess_evict_rate, Opts),
+        recipients => maps:get(recipients, Status),
+        stats => #{
+            initial_connected => InitialConnected,
+            initial_sessions => InitialSessions,
+            current_connected => CurrentConnected,
+            current_sessions => CurrentSessions,
+            current_disconnected_sessions => CurrentDisconnected
+        }
+    },
+    Optional = present_fields(
+        [
+            {recipient_conn_avg, connection_goal},
+            {recipient_sess_avg, disconnected_session_goal}
+        ],
+        Status
+    ),
+    maps:merge(Required, Optional).
+
+%% Reporting view of a rebalance as seen from the coordinator. Goal and
+%% donor average fields are present only when the status contains them.
+coordinator_rebalance(Status) ->
+    Opts = maps:get(opts, Status),
+    Required = #{
+        state => maps:get(state, Status),
+        coordinator_node => maps:get(coordinator_node, Status),
+        connection_eviction_rate => maps:get(conn_evict_rate, Opts),
+        session_eviction_rate => maps:get(sess_evict_rate, Opts),
+        recipients => maps:get(recipients, Status),
+        donors => maps:get(donors, Status)
+    },
+    Optional = present_fields(
+        [
+            {recipient_conn_avg, connection_goal},
+            {recipient_sess_avg, disconnected_session_goal},
+            {donor_conn_avg, donor_conn_avg},
+            {donor_sess_avg, donor_sess_avg}
+        ],
+        Status
+    ),
+    maps:merge(Required, Optional).
+
+%% Copies the values of the source keys that exist in `Status' into a new
+%% map under their target names.
+present_fields(Renames, Status) ->
+    maps:from_list([
+        {To, maps:get(From, Status)}
+     || {From, To} <- Renames, maps:is_key(From, Status)
+    ]).
+
+%% Order in which the fields of a local status are printed.
+local_status_field_format_order() ->
+    [
+        state,
+        coordinator_node,
+        connection_eviction_rate,
+        session_eviction_rate,
+        connection_goal,
+        session_goal,
+        disconnected_session_goal,
+        session_recipients,
+        recipients,
+        stats
+    ].
+
+%% Order in which the fields of a coordinator status are printed.
+coordinator_status_field_format_order() ->
+    [
+        state,
+        coordinator_node,
+        donors,
+        recipients,
+        connection_eviction_rate,
+        session_eviction_rate,
+        connection_goal,
+        disconnected_session_goal,
+        donor_conn_avg,
+        donor_sess_avg
+    ].
+
+%% Formats the fields of `Status' listed in `FieldOrder', in that order.
+%% Fields missing from `Status' are skipped.
+format_status(Status, FieldOrder) ->
+    [
+        format_local_status_field({Name, maps:get(Name, Status)})
+     || Name <- FieldOrder, maps:is_key(Name, Status)
+    ].
+
+%% Formats one `{Field, Value}' pair as a line of text; `stats' expands to
+%% a multi-line block.
+format_local_status_field({stats, Stats}) ->
+    format_local_stats(Stats);
+format_local_status_field({Field, Value}) ->
+    io_lib:format(status_field_format(Field), [Value]).
+
+%% Format string for every single-line status field.
+status_field_format(state) ->
+    "Rebalance state: ~p~n";
+status_field_format(coordinator_node) ->
+    "Coordinator node: ~p~n";
+status_field_format(connection_eviction_rate) ->
+    "Connection eviction rate: ~p connections/second~n";
+status_field_format(session_eviction_rate) ->
+    "Session eviction rate: ~p sessions/second~n";
+status_field_format(connection_goal) ->
+    "Connection goal: ~p~n";
+status_field_format(session_goal) ->
+    "Session goal: ~p~n";
+status_field_format(disconnected_session_goal) ->
+    "Disconnected session goal: ~p~n";
+status_field_format(session_recipients) ->
+    "Session recipient nodes: ~p~n";
+status_field_format(recipients) ->
+    "Recipient nodes: ~p~n";
+status_field_format(donors) ->
+    "Donor nodes: ~p~n";
+status_field_format(donor_conn_avg) ->
+    "Current average donor node connection count: ~p~n";
+status_field_format(donor_sess_avg) ->
+    "Current average donor node disconnected session count: ~p~n".
+
+%% Header line followed by one line per statistic.
+format_local_stats(Stats) ->
+    [
+        "Channel statistics:\n"
+        | [io_lib:format(" ~p: ~p~n", [Name, Value]) || {Name, Value} <- maps:to_list(Stats)]
+    ].
+
+%% RPC target: evacuation status tagged with the local node name.
+evacuation_status() ->
+    {node(), emqx_node_rebalance_evacuation:status()}.
+
+%% RPC target: rebalance status tagged with the local node name.
+rebalance_status() ->
+    {node(), emqx_node_rebalance:status()}.
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl
new file mode 100644
index 000000000..cfaccc4c2
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl
@@ -0,0 +1,35 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_sup).
+
+-behaviour(supervisor).
+
+-export([start_link/0]).
+
+-export([init/1]).
+
+start_link() ->
+    supervisor:start_link({local, ?MODULE}, ?MODULE, []).
+
+%% Supervises the evacuation state machine, the rebalance agent and the
+%% rebalance coordinator as permanent workers.
+init([]) ->
+    SupFlags = #{strategy => one_for_one, intensity => 10, period => 3600},
+    Workers = [
+        emqx_node_rebalance_evacuation,
+        emqx_node_rebalance_agent,
+        emqx_node_rebalance
+    ],
+    ChildSpecs = [child_spec(Mod, []) || Mod <- Workers],
+    {ok, {SupFlags, ChildSpecs}}.
+
+%% Child specification of a permanent worker started via Mod:start_link.
+child_spec(Mod, Args) ->
+    #{
+        id => Mod,
+        start => {Mod, start_link, Args},
+        restart => permanent,
+        shutdown => 5000,
+        type => worker,
+        modules => [Mod]
+    }.
diff --git a/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v1.erl b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v1.erl
new file mode 100644
index 000000000..131973932
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v1.erl
@@ -0,0 +1,43 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+%% RPC wrappers for starting and stopping evacuation and rebalance on a
+%% given node.
+-module(emqx_node_rebalance_api_proto_v1).
+
+-behaviour(emqx_bpapi).
+
+-export([
+    introduced_in/0,
+
+    node_rebalance_evacuation_start/2,
+    node_rebalance_evacuation_stop/1,
+
+    node_rebalance_start/2,
+    node_rebalance_stop/1
+]).
+
+-include_lib("emqx/include/bpapi.hrl").
+-include_lib("emqx/include/types.hrl").
+
+introduced_in() ->
+    "5.0.22".
+
+%% Calls emqx_node_rebalance_evacuation:start/1 on `Node'.
+-spec node_rebalance_evacuation_start(node(), emqx_node_rebalance_evacuation:start_opts()) ->
+    emqx_rpc:badrpc() | ok_or_error(emqx_node_rebalance_evacuation:start_error()).
+node_rebalance_evacuation_start(Node, #{} = Opts) ->
+    rpc:call(Node, emqx_node_rebalance_evacuation, start, [Opts]).
+
+%% Calls emqx_node_rebalance_evacuation:stop/0 on `Node'.
+-spec node_rebalance_evacuation_stop(node()) ->
+    emqx_rpc:badrpc() | ok_or_error(not_started).
+node_rebalance_evacuation_stop(Node) ->
+    rpc:call(Node, emqx_node_rebalance_evacuation, stop, []).
+
+%% Calls emqx_node_rebalance:start/1 on `Node'.
+-spec node_rebalance_start(node(), emqx_node_rebalance:start_opts()) ->
+    emqx_rpc:badrpc() | ok_or_error(emqx_node_rebalance:start_error()).
+node_rebalance_start(Node, Opts) ->
+    rpc:call(Node, emqx_node_rebalance, start, [Opts]).
+
+%% Calls emqx_node_rebalance:stop/0 on `Node'.
+-spec node_rebalance_stop(node()) ->
+    emqx_rpc:badrpc() | ok_or_error(not_started).
+node_rebalance_stop(Node) ->
+    rpc:call(Node, emqx_node_rebalance, stop, []).
diff --git a/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_evacuation_proto_v1.erl b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_evacuation_proto_v1.erl
new file mode 100644
index 000000000..f5a6e1077
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_evacuation_proto_v1.erl
@@ -0,0 +1,22 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+%% RPC wrapper used by the evacuation state machine to find nodes that
+%% can receive sessions.
+-module(emqx_node_rebalance_evacuation_proto_v1).
+
+-behaviour(emqx_bpapi).
+
+-export([
+    introduced_in/0,
+
+    available_nodes/1
+]).
+
+-include_lib("emqx/include/bpapi.hrl").
+
+introduced_in() ->
+    "5.0.22".
+
+%% Calls emqx_node_rebalance_evacuation:is_node_available/0 on every node
+%% in `Nodes'.
+-spec available_nodes([node()]) -> emqx_rpc:multicall_result(node()).
+available_nodes(Nodes) ->
+    rpc:multicall(Nodes, emqx_node_rebalance_evacuation, is_node_available, []).
diff --git a/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v1.erl b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v1.erl
new file mode 100644
index 000000000..98625d4fd
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v1.erl
@@ -0,0 +1,62 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+%% Multi-node RPC wrappers for the rebalance process.
+-module(emqx_node_rebalance_proto_v1).
+
+-behaviour(emqx_bpapi).
+
+-export([
+    introduced_in/0,
+
+    available_nodes/1,
+    evict_connections/2,
+    evict_sessions/4,
+    connection_counts/1,
+    session_counts/1,
+    enable_rebalance_agent/2,
+    disable_rebalance_agent/2,
+    disconnected_session_counts/1
+]).
+
+-include_lib("emqx/include/bpapi.hrl").
+-include_lib("emqx/include/types.hrl").
+
+introduced_in() ->
+    "5.0.22".
+
+-spec available_nodes([node()]) -> emqx_rpc:multicall_result(node()).
+%% Calls emqx_node_rebalance:is_node_available/0 on every node in `Nodes'.
+available_nodes(Nodes) ->
+    rpc:multicall(Nodes, emqx_node_rebalance, is_node_available, []).
+
+%% Calls emqx_eviction_agent:evict_connections/1 with `Count' on every
+%% node in `Nodes'.
+-spec evict_connections([node()], non_neg_integer()) ->
+    emqx_rpc:multicall_result(ok_or_error(disabled)).
+evict_connections(Nodes, Count) ->
+    rpc:multicall(Nodes, emqx_eviction_agent, evict_connections, [Count]).
+
+%% Calls emqx_eviction_agent:evict_sessions/3 on every node in `Nodes',
+%% passing `Count', `RecipientNodes' and `ConnState' through unchanged.
+-spec evict_sessions([node()], non_neg_integer(), [node()], emqx_channel:conn_state()) ->
+    emqx_rpc:multicall_result(ok_or_error(disabled)).
+evict_sessions(Nodes, Count, RecipientNodes, ConnState) ->
+    rpc:multicall(Nodes, emqx_eviction_agent, evict_sessions, [Count, RecipientNodes, ConnState]).
+
+%% Calls emqx_node_rebalance:connection_count/0 on every node in `Nodes'.
+-spec connection_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}).
+connection_counts(Nodes) ->
+    rpc:multicall(Nodes, emqx_node_rebalance, connection_count, []).
+
+%% Calls emqx_node_rebalance:session_count/0 on every node in `Nodes'.
+-spec session_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}).
+session_counts(Nodes) ->
+    rpc:multicall(Nodes, emqx_node_rebalance, session_count, []).
+
+%% Calls emqx_node_rebalance_agent:enable/1 with `OwnerPid' on every node
+%% in `Nodes'.
+-spec enable_rebalance_agent([node()], pid()) ->
+    emqx_rpc:multicall_result(ok_or_error(already_enabled | eviction_agent_busy)).
+enable_rebalance_agent(Nodes, OwnerPid) ->
+    rpc:multicall(Nodes, emqx_node_rebalance_agent, enable, [OwnerPid]).
+
+%% Calls emqx_node_rebalance_agent:disable/1 with `OwnerPid' on every node
+%% in `Nodes'.
+-spec disable_rebalance_agent([node()], pid()) ->
+    emqx_rpc:multicall_result(ok_or_error(already_disabled | invalid_coordinator)).
+disable_rebalance_agent(Nodes, OwnerPid) ->
+    rpc:multicall(Nodes, emqx_node_rebalance_agent, disable, [OwnerPid]).
+
+%% Calls emqx_node_rebalance:disconnected_session_count/0 on every node in
+%% `Nodes'.
+-spec disconnected_session_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}).
+disconnected_session_counts(Nodes) ->
+    rpc:multicall(Nodes, emqx_node_rebalance, disconnected_session_count, []).
diff --git a/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v1.erl b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v1.erl
new file mode 100644
index 000000000..e3e4a423c
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v1.erl
@@ -0,0 +1,36 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+%% RPC wrappers for querying rebalance/evacuation status on other nodes.
+-module(emqx_node_rebalance_status_proto_v1).
+
+-behaviour(emqx_bpapi).
+
+-export([
+    introduced_in/0,
+
+    local_status/1,
+    rebalance_status/1,
+    evacuation_status/1
+]).
+
+-include_lib("emqx/include/bpapi.hrl").
+-include_lib("emqx/include/types.hrl").
+
+introduced_in() ->
+    "5.0.22".
+
+%% Calls emqx_node_rebalance_status:local_status/0 on `Node'.
+-spec local_status(node()) ->
+    emqx_rpc:badrpc() | disabled | {evacuation, map()} | {rebalance, map()}.
+local_status(Node) ->
+    rpc:call(Node, emqx_node_rebalance_status, local_status, []).
+
+%% Calls emqx_node_rebalance_status:rebalance_status/0 on every node in
+%% `Nodes'.
+%% NOTE(review): the caller in emqx_node_rebalance_status matches replies
+%% against `{Node, {enabled, Status}}', so the second tuple element looks
+%% like `disabled | {enabled, map()}' rather than a bare map() - confirm
+%% and adjust this spec if so.
+-spec rebalance_status([node()]) ->
+    emqx_rpc:multicall_result({node(), map()}).
+rebalance_status(Nodes) ->
+    rpc:multicall(Nodes, emqx_node_rebalance_status, rebalance_status, []).
+
+%% Calls emqx_node_rebalance_status:evacuation_status/0 on every node in
+%% `Nodes'.
+%% NOTE(review): same remark as for rebalance_status/1 regarding the
+%% element type in the spec - confirm.
+-spec evacuation_status([node()]) ->
+    emqx_rpc:multicall_result({node(), map()}).
+evacuation_status(Nodes) ->
+    rpc:multicall(Nodes, emqx_node_rebalance_status, evacuation_status, []).
diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl
new file mode 100644
index 000000000..a818145a2
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl
@@ -0,0 +1,229 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("emqx/include/emqx.hrl").
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("emqx/include/asserts.hrl").
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+-import(
+    emqx_eviction_agent_test_helpers,
+    [emqtt_connect_many/1, emqtt_connect_many/2, stop_many/1, case_specific_node_name/3]
+).
+
+%% Applications started on every cluster node used by the test cases.
+-define(START_APPS, [emqx_eviction_agent, emqx_node_rebalance]).
+
+all() ->
+    emqx_common_test_helpers:all(?MODULE).
+
+init_per_suite(Config) ->
+    ok = emqx_common_test_helpers:start_apps([]),
+    Config.
+
+end_per_suite(_Config) ->
+    ok = emqx_common_test_helpers:stop_apps([]),
+    ok.
+
+%% Every test case gets its own two-node cluster (a donor and a recipient)
+%% and a fresh snabbkaffe trace. The cluster description is stored in the
+%% config under `cluster_nodes'.
+init_per_testcase(Case, Config) ->
+    ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster(
+        [
+            {case_specific_node_name(?MODULE, Case, '_donor'), 2883},
+            {case_specific_node_name(?MODULE, Case, '_recipient'), 3883}
+        ],
+        ?START_APPS
+    ),
+    ok = snabbkaffe:start_trace(),
+    [{cluster_nodes, ClusterNodes} | Config].
+
+%% Stops the trace and the per-case cluster.
+end_per_testcase(_Case, Config) ->
+    ok = snabbkaffe:stop(),
+    ok = emqx_eviction_agent_test_helpers:stop_cluster(
+        ?config(cluster_nodes, Config),
+        ?START_APPS
+    ).
+
+%%--------------------------------------------------------------------
+%% Tests
+%%--------------------------------------------------------------------
+
+%% Connects 500 clients to the donor node, starts a rebalance and, once
+%% the session eviction phase is reported as over, checks that donor and
+%% recipient counts differ by no more than the absolute thresholds (50).
+t_rebalance(Config) ->
+    process_flag(trap_exit, true),
+
+    [{DonorNode, DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
+
+    Nodes = [DonorNode, RecipientNode],
+
+    Conns = emqtt_connect_many(DonorPort, 500),
+
+    Opts = #{
+        conn_evict_rate => 10,
+        sess_evict_rate => 10,
+        evict_interval => 10,
+        abs_conn_threshold => 50,
+        abs_sess_threshold => 50,
+        rel_conn_threshold => 1.0,
+        rel_sess_threshold => 1.0,
+        wait_health_check => 0.01,
+        wait_takeover => 0.01,
+        nodes => Nodes
+    },
+
+    ?assertWaitEvent(
+        ok = rpc:call(DonorNode, emqx_node_rebalance, start, [Opts]),
+        #{?snk_kind := emqx_node_rebalance_evict_sess_over},
+        10000
+    ),
+
+    DonorConnCount = rpc:call(DonorNode, emqx_eviction_agent, connection_count, []),
+    DonorSessCount = rpc:call(DonorNode, emqx_eviction_agent, session_count, []),
+    DonorDSessCount = rpc:call(DonorNode, emqx_eviction_agent, session_count, [disconnected]),
+
+    RecipientConnCount = rpc:call(RecipientNode, emqx_eviction_agent, connection_count, []),
+    RecipientSessCount = rpc:call(RecipientNode, emqx_eviction_agent, session_count, []),
+    RecipientDSessCount = rpc:call(RecipientNode, emqx_eviction_agent, session_count, [disconnected]),
+
+    ct:pal(
+        "Donor: conn=~p, sess=~p, dsess=~p",
+        [DonorConnCount, DonorSessCount, DonorDSessCount]
+    ),
+    ct:pal(
+        "Recipient: conn=~p, sess=~p, dsess=~p",
+        [RecipientConnCount, RecipientSessCount, RecipientDSessCount]
+    ),
+
+    ?assert(DonorConnCount - 50 =< RecipientConnCount),
+    ?assert(DonorDSessCount - 50 =< RecipientDSessCount),
+
+    ok = stop_many(Conns).
+
+%% Starts a rebalance and stops the recipient node right away; afterwards
+%% the rebalance on the donor node must be reported as disabled.
+t_rebalance_node_crash(Config) ->
+    process_flag(trap_exit, true),
+
+    [{DonorNode, DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
+
+    Nodes = [DonorNode, RecipientNode],
+
+    Conns = emqtt_connect_many(DonorPort, 500),
+
+    Opts = #{
+        conn_evict_rate => 10,
+        sess_evict_rate => 10,
+        evict_interval => 10,
+        abs_conn_threshold => 50,
+        abs_sess_threshold => 50,
+        rel_conn_threshold => 1.0,
+        rel_sess_threshold => 1.0,
+        wait_health_check => 0.01,
+        wait_takeover => 0.01,
+        nodes => Nodes
+    },
+
+    ?assertWaitEvent(
+        begin
+            ok = rpc:call(DonorNode, emqx_node_rebalance, start, [Opts]),
+            emqx_common_test_helpers:stop_slave(RecipientNode)
+        end,
+        #{?snk_kind := emqx_node_rebalance_started},
+        1000
+    ),
+
+    ?assertEqual(
+        disabled,
+        rpc:call(DonorNode, emqx_node_rebalance, status, [])
+    ),
+
+    ok = stop_many(Conns).
+
+%% Starting a rebalance must be refused with `nothing_to_balance' both
+%% with no clients at all and with only 50 clients on the donor node.
+t_no_need_to_rebalance(Config) ->
+    process_flag(trap_exit, true),
+
+    [{DonorNode, DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
+
+    Nodes = [DonorNode, RecipientNode],
+
+    Opts = #{
+        conn_evict_rate => 10,
+        sess_evict_rate => 10,
+        evict_interval => 10,
+        abs_conn_threshold => 50,
+        abs_sess_threshold => 50,
+        rel_conn_threshold => 1.0,
+        rel_sess_threshold => 1.0,
+        wait_health_check => 0.01,
+        wait_takeover => 0.01,
+        nodes => Nodes
+    },
+
+    ?assertEqual(
+        {error, nothing_to_balance},
+        rpc:call(DonorNode, emqx_node_rebalance, start, [Opts])
+    ),
+
+    Conns = emqtt_connect_many(DonorPort, 50),
+
+    ?assertEqual(
+        {error, nothing_to_balance},
+        rpc:call(DonorNode, emqx_node_rebalance, start, [Opts])
+    ),
+
+    ok = stop_many(Conns).
+
+%% Sends an unexpected info message, cast and call to the rebalance
+%% process, first before and then after a rebalance is started; the call
+%% must be answered with `ignored' both times.
+t_unknown_mesages(Config) ->
+    process_flag(trap_exit, true),
+    [{DonorNode, DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
+
+    Nodes = [DonorNode, RecipientNode],
+
+    Conns = emqtt_connect_many(DonorPort, 500),
+
+    Opts = #{
+        wait_health_check => 100,
+        abs_conn_threshold => 50,
+        nodes => Nodes
+    },
+
+    Pid = rpc:call(DonorNode, erlang, whereis, [emqx_node_rebalance]),
+
+    Pid ! unknown,
+    ok = gen_server:cast(Pid, unknown),
+    ?assertEqual(
+        ignored,
+        gen_server:call(Pid, unknown)
+    ),
+
+    ok = rpc:call(DonorNode, emqx_node_rebalance, start, [Opts]),
+
+    Pid ! unknown,
+    ok = gen_server:cast(Pid, unknown),
+    ?assertEqual(
+        ignored,
+        gen_server:call(Pid, unknown)
+    ),
+
+    ok = stop_many(Conns).
+
+t_available_nodes(Config) ->
+    [{DonorNode, _DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
+
+    %% Start eviction agent on RecipientNode so that it will be "occupied"
+    %% and not available for rebalance
+    ok = rpc:call(RecipientNode, emqx_eviction_agent, enable, [test_rebalance, undefined]),
+
+    %% Only DonorNode should be available for rebalance, since RecipientNode is "occupied"
+    ?assertEqual(
+        [DonorNode],
+        rpc:call(
+            DonorNode,
+            emqx_node_rebalance,
+            available_nodes,
+            [[DonorNode, RecipientNode]]
+        )
+    ).
diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl
new file mode 100644
index 000000000..8b21f9433
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl
@@ -0,0 +1,214 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_agent_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("emqx/include/emqx.hrl").
+-include_lib("emqx/include/emqx_mqtt.hrl"). +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-import( + emqx_eviction_agent_test_helpers, + [case_specific_node_name/2] +). + +all() -> + [ + {group, local}, + {group, cluster} + ]. + +groups() -> + [ + {local, [], [ + t_enable_disable, + t_enable_egent_busy, + t_unknown_messages + ]}, + {cluster, [], [ + t_rebalance_agent_coordinator_fail, + t_rebalance_agent_fail + ]} + ]. + +init_per_suite(Config) -> + ok = emqx_common_test_helpers:start_apps([emqx_eviction_agent, emqx_node_rebalance]), + Config. + +end_per_suite(_Config) -> + ok = emqx_common_test_helpers:stop_apps([emqx_eviction_agent, emqx_node_rebalance]), + ok. + +init_per_group(local, Config) -> + [{cluster, false} | Config]; +init_per_group(cluster, Config) -> + [{cluster, true} | Config]. + +end_per_group(_Group, _Config) -> + ok. + +init_per_testcase(Case, Config) -> + case ?config(cluster, Config) of + true -> + ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster( + [{case_specific_node_name(?MODULE, Case), 2883}], + [emqx_eviction_agent, emqx_node_rebalance] + ), + [{cluster_nodes, ClusterNodes} | Config]; + false -> + Config + end. + +end_per_testcase(_Case, Config) -> + case ?config(cluster, Config) of + true -> + emqx_eviction_agent_test_helpers:stop_cluster( + ?config(cluster_nodes, Config), + [emqx_eviction_agent, emqx_node_rebalance] + ); + false -> + ok + end. 
+ +%%-------------------------------------------------------------------- +%% Tests +%%-------------------------------------------------------------------- + +%% Local tests + +t_enable_disable(_Config) -> + ?assertEqual( + disabled, + emqx_node_rebalance_agent:status() + ), + + ?assertEqual( + ok, + emqx_node_rebalance_agent:enable(self()) + ), + + ?assertEqual( + {error, already_enabled}, + emqx_node_rebalance_agent:enable(self()) + ), + + ?assertEqual( + {enabled, self()}, + emqx_node_rebalance_agent:status() + ), + + ?assertEqual( + {error, invalid_coordinator}, + emqx_node_rebalance_agent:disable(spawn_link(fun() -> ok end)) + ), + + ?assertEqual( + ok, + emqx_node_rebalance_agent:disable(self()) + ), + + ?assertEqual( + {error, already_disabled}, + emqx_node_rebalance_agent:disable(self()) + ), + + ?assertEqual( + disabled, + emqx_node_rebalance_agent:status() + ). + +t_enable_egent_busy(_Config) -> + ok = emqx_eviction_agent:enable(rebalance_test, undefined), + + ?assertEqual( + {error, eviction_agent_busy}, + emqx_node_rebalance_agent:enable(self()) + ), + + ok = emqx_eviction_agent:disable(rebalance_test). + +t_unknown_messages(_Config) -> + Pid = whereis(emqx_node_rebalance_agent), + + ok = gen_server:cast(Pid, unknown), + + Pid ! unknown, + + ignored = gen_server:call(Pid, unknown). + +%% Cluster tests + +% The following tests verify that emqx_node_rebalance_agent correctly links +% coordinator process with emqx_eviction_agent-s. 
+ +t_rebalance_agent_coordinator_fail(Config) -> + process_flag(trap_exit, true), + + [{Node, _}] = ?config(cluster_nodes, Config), + + CoordinatorPid = spawn_link( + fun() -> + receive + done -> ok + end + end + ), + + ?assertEqual( + disabled, + rpc:call(Node, emqx_eviction_agent, status, []) + ), + + ?assertEqual( + ok, + rpc:call(Node, emqx_node_rebalance_agent, enable, [CoordinatorPid]) + ), + + ?assertMatch( + {enabled, _}, + rpc:call(Node, emqx_eviction_agent, status, []) + ), + + EvictionAgentPid = rpc:call(Node, erlang, whereis, [emqx_eviction_agent]), + true = link(EvictionAgentPid), + + true = exit(CoordinatorPid, kill), + + receive + {'EXIT', EvictionAgentPid, _} -> true + after 1000 -> + ct:fail("emqx_eviction_agent did not exit") + end. + +t_rebalance_agent_fail(Config) -> + process_flag(trap_exit, true), + + [{Node, _}] = ?config(cluster_nodes, Config), + + CoordinatorPid = spawn_link( + fun() -> + receive + done -> ok + end + end + ), + + ?assertEqual( + ok, + rpc:call(Node, emqx_node_rebalance_agent, enable, [CoordinatorPid]) + ), + + EvictionAgentPid = rpc:call(Node, erlang, whereis, [emqx_eviction_agent]), + true = exit(EvictionAgentPid, kill), + + receive + {'EXIT', CoordinatorPid, _} -> true + after 1000 -> + ct:fail("emqx_node_rebalance_agent did not exit") + end. diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl new file mode 100644 index 000000000..21608b8bc --- /dev/null +++ b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl @@ -0,0 +1,444 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_api_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). 
+-include_lib("common_test/include/ct.hrl"). + +-import( + emqx_mgmt_api_test_util, + [ + request/2, + request/3, + uri/1 + ] +). + +-import( + emqx_eviction_agent_test_helpers, + [emqtt_connect_many/2, stop_many/1, case_specific_node_name/3] +). + +-define(START_APPS, [emqx_eviction_agent, emqx_node_rebalance]). + +all() -> + emqx_common_test_helpers:all(?MODULE). + +init_per_suite(Config) -> + ok = emqx_common_test_helpers:start_apps(?START_APPS), + Config. + +end_per_suite(_Config) -> + ok = emqx_common_test_helpers:stop_apps(?START_APPS), + ok. + +init_per_testcase(Case, Config) -> + [{DonorNode, _} | _] = + ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster( + [ + {case_specific_node_name(?MODULE, Case, '_donor'), 2883}, + {case_specific_node_name(?MODULE, Case, '_recipient'), 3883} + ], + ?START_APPS, + [{emqx, data_dir, case_specific_data_dir(Case, Config)}] + ), + + ok = rpc:call(DonorNode, emqx_mgmt_api_test_util, init_suite, []), + ok = take_auth_header_from(DonorNode), + + [{cluster_nodes, ClusterNodes} | Config]. +end_per_testcase(_Case, Config) -> + _ = emqx_eviction_agent_test_helpers:stop_cluster( + ?config(cluster_nodes, Config), + ?START_APPS + ). 
+ +%%-------------------------------------------------------------------- +%% Tests +%%-------------------------------------------------------------------- + +t_start_evacuation_validation(Config) -> + [{DonorNode, _}, {RecipientNode, _}] = ?config(cluster_nodes, Config), + BadOpts = [ + #{conn_evict_rate => <<"conn">>}, + #{sess_evict_rate => <<"sess">>}, + #{redirect_to => 123}, + #{wait_takeover => <<"wait">>}, + #{migrate_to => []}, + #{migrate_to => <<"migrate_to">>}, + #{migrate_to => [<<"bad_node">>]}, + #{migrate_to => [<<"bad_node">>, atom_to_binary(DonorNode)]}, + #{unknown => <<"Value">>} + ], + lists:foreach( + fun(Opts) -> + ?assertMatch( + {ok, 400, #{}}, + api_post( + ["load_rebalance", atom_to_list(DonorNode), "evacuation", "start"], + Opts + ) + ) + end, + BadOpts + ), + ?assertMatch( + {ok, 400, #{}}, + api_post( + ["load_rebalance", "bad@node", "evacuation", "start"], + #{} + ) + ), + + ?assertMatch( + {ok, 200, #{}}, + api_post( + ["load_rebalance", atom_to_list(DonorNode), "evacuation", "start"], + #{ + conn_evict_rate => 10, + sess_evict_rate => 10, + wait_takeover => 10, + redirect_to => <<"srv">>, + migrate_to => [atom_to_binary(RecipientNode)] + } + ) + ), + + DonorNodeBin = atom_to_binary(DonorNode), + ?assertMatch( + {ok, 200, #{<<"evacuations">> := [#{<<"node">> := DonorNodeBin}]}}, + api_get(["load_rebalance", "global_status"]) + ). 
+ +t_start_rebalance_validation(Config) -> + process_flag(trap_exit, true), + + [{DonorNode, DonorPort}, {RecipientNode, _}] = ?config(cluster_nodes, Config), + + BadOpts = [ + #{conn_evict_rate => <<"conn">>}, + #{sess_evict_rate => <<"sess">>}, + #{abs_conn_threshold => <<"act">>}, + #{rel_conn_threshold => <<"rct">>}, + #{abs_sess_threshold => <<"act">>}, + #{rel_sess_threshold => <<"rct">>}, + #{wait_takeover => <<"wait">>}, + #{wait_health_check => <<"wait">>}, + #{nodes => <<"nodes">>}, + #{nodes => []}, + #{nodes => [<<"bad_node">>]}, + #{nodes => [<<"bad_node">>, atom_to_binary(DonorNode)]}, + #{unknown => <<"Value">>} + ], + lists:foreach( + fun(Opts) -> + ?assertMatch( + {ok, 400, #{}}, + api_post( + ["load_rebalance", atom_to_list(DonorNode), "start"], + Opts + ) + ) + end, + BadOpts + ), + ?assertMatch( + {ok, 400, #{}}, + api_post( + ["load_rebalance", "bad@node", "start"], + #{} + ) + ), + + Conns = emqtt_connect_many(DonorPort, 50), + + ?assertMatch( + {ok, 200, #{}}, + api_post( + ["load_rebalance", atom_to_list(DonorNode), "start"], + #{ + conn_evict_rate => 10, + sess_evict_rate => 10, + wait_takeover => 10, + wait_health_check => 10, + abs_conn_threshold => 10, + rel_conn_threshold => 1.001, + abs_sess_threshold => 10, + rel_sess_threshold => 1.001, + nodes => [ + atom_to_binary(DonorNode), + atom_to_binary(RecipientNode) + ] + } + ) + ), + + DonorNodeBin = atom_to_binary(DonorNode), + ?assertMatch( + {ok, 200, #{<<"rebalances">> := [#{<<"node">> := DonorNodeBin}]}}, + api_get(["load_rebalance", "global_status"]) + ), + + ok = stop_many(Conns). 
+ +t_start_stop_evacuation(Config) -> + [{DonorNode, _}, {RecipientNode, _}] = ?config(cluster_nodes, Config), + + StartOpts = maps:merge( + emqx_node_rebalance_api:rebalance_evacuation_example(), + #{migrate_to => [atom_to_binary(RecipientNode)]} + ), + + ?assertMatch( + {ok, 200, #{}}, + api_post( + ["load_rebalance", atom_to_list(DonorNode), "evacuation", "start"], + StartOpts + ) + ), + + StatusResponse = api_get(["load_rebalance", "status"]), + + ?assertMatch( + {ok, 200, _}, + StatusResponse + ), + + {ok, 200, Status} = StatusResponse, + + ?assertMatch( + #{ + process := evacuation, + connection_eviction_rate := 100, + session_eviction_rate := 100, + connection_goal := 0, + session_goal := 0, + stats := #{ + initial_connected := _, + current_connected := _, + initial_sessions := _, + current_sessions := _ + } + }, + emqx_node_rebalance_api:translate(local_status_enabled, Status) + ), + + DonorNodeBin = atom_to_binary(DonorNode), + + GlobalStatusResponse = api_get(["load_rebalance", "global_status"]), + + ?assertMatch( + {ok, 200, _}, + GlobalStatusResponse + ), + + {ok, 200, GlobalStatus} = GlobalStatusResponse, + + ?assertMatch( + #{ + rebalances := [], + evacuations := [ + #{ + node := DonorNodeBin, + connection_eviction_rate := 100, + session_eviction_rate := 100, + connection_goal := 0, + session_goal := 0, + stats := #{ + initial_connected := _, + current_connected := _, + initial_sessions := _, + current_sessions := _ + } + } + ] + }, + emqx_node_rebalance_api:translate(global_status, GlobalStatus) + ), + + ?assertMatch( + {ok, 200, #{}}, + api_post( + ["load_rebalance", atom_to_list(DonorNode), "evacuation", "stop"], + #{} + ) + ), + + ?assertMatch( + {ok, 200, #{<<"status">> := <<"disabled">>}}, + api_get(["load_rebalance", "status"]) + ), + + ?assertMatch( + {ok, 200, #{<<"evacuations">> := [], <<"rebalances">> := []}}, + api_get(["load_rebalance", "global_status"]) + ). 
+ +t_start_stop_rebalance(Config) -> + process_flag(trap_exit, true), + + [{DonorNode, DonorPort}, {RecipientNode, _}] = ?config(cluster_nodes, Config), + + ?assertMatch( + {ok, 200, #{<<"status">> := <<"disabled">>}}, + api_get(["load_rebalance", "status"]) + ), + + Conns = emqtt_connect_many(DonorPort, 100), + + StartOpts = maps:without( + [nodes], + emqx_node_rebalance_api:rebalance_example() + ), + + ?assertMatch( + {ok, 200, #{}}, + api_post( + ["load_rebalance", atom_to_list(DonorNode), "start"], + StartOpts + ) + ), + + StatusResponse = api_get(["load_rebalance", "status"]), + + ?assertMatch( + {ok, 200, _}, + StatusResponse + ), + + {ok, 200, Status} = StatusResponse, + + ?assertMatch( + #{process := rebalance, connection_eviction_rate := 10, session_eviction_rate := 20}, + emqx_node_rebalance_api:translate(local_status_enabled, Status) + ), + + DonorNodeBin = atom_to_binary(DonorNode), + RecipientNodeBin = atom_to_binary(RecipientNode), + + GlobalStatusResponse = api_get(["load_rebalance", "global_status"]), + + ?assertMatch( + {ok, 200, _}, + GlobalStatusResponse + ), + + {ok, 200, GlobalStatus} = GlobalStatusResponse, + + ?assertMatch( + {ok, 200, #{ + <<"evacuations">> := [], + <<"rebalances">> := + [ + #{ + <<"state">> := _, + <<"node">> := DonorNodeBin, + <<"coordinator_node">> := _, + <<"connection_eviction_rate">> := 10, + <<"session_eviction_rate">> := 20, + <<"donors">> := [DonorNodeBin], + <<"recipients">> := [RecipientNodeBin] + } + ] + }}, + api_get(["load_rebalance", "global_status"]) + ), + + ?assertMatch( + #{ + evacuations := [], + rebalances := [ + #{ + state := _, + node := DonorNodeBin, + coordinator_node := _, + connection_eviction_rate := 10, + session_eviction_rate := 20, + donors := [DonorNodeBin], + recipients := [RecipientNodeBin] + } + ] + }, + emqx_node_rebalance_api:translate(global_status, GlobalStatus) + ), + + ?assertMatch( + {ok, 200, #{}}, + api_post( + ["load_rebalance", atom_to_list(DonorNode), "stop"], + #{} + ) + ), + + 
?assertMatch( + {ok, 200, #{<<"status">> := <<"disabled">>}}, + api_get(["load_rebalance", "status"]) + ), + + ?assertMatch( + {ok, 200, #{<<"evacuations">> := [], <<"rebalances">> := []}}, + api_get(["load_rebalance", "global_status"]) + ), + + ok = stop_many(Conns). + +t_availability_check(Config) -> + [{DonorNode, _} | _] = ?config(cluster_nodes, Config), + ?assertMatch( + {ok, 200, #{}}, + api_get(["load_rebalance", "availability_check"]) + ), + + ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [#{}]), + + ?assertMatch( + {ok, 503, _}, + api_get(["load_rebalance", "availability_check"]) + ), + + ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, stop, []), + + ?assertMatch( + {ok, 200, #{}}, + api_get(["load_rebalance", "availability_check"]) + ). + +%%-------------------------------------------------------------------- +%% Helpers +%%-------------------------------------------------------------------- + +api_get(Path) -> + case request(get, uri(Path)) of + {ok, Code, ResponseBody} -> + {ok, Code, jiffy:decode(ResponseBody, [return_maps])}; + {error, _} = Error -> + Error + end. + +api_post(Path, Data) -> + case request(post, uri(Path), Data) of + {ok, Code, ResponseBody} -> + {ok, Code, jiffy:decode(ResponseBody, [return_maps])}; + {error, _} = Error -> + Error + end. + +take_auth_header_from(Node) -> + meck:new(emqx_common_test_http, [passthrough]), + meck:expect( + emqx_common_test_http, + default_auth_header, + fun() -> rpc:call(Node, emqx_common_test_http, default_auth_header, []) end + ), + ok. + +case_specific_data_dir(Case, Config) -> + case ?config(priv_dir, Config) of + undefined -> undefined; + PrivDir -> filename:join(PrivDir, atom_to_list(Case)) + end. 
diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl new file mode 100644 index 000000000..54ecad026 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl @@ -0,0 +1,291 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_cli_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). + +-import( + emqx_eviction_agent_test_helpers, + [emqtt_connect_many/2, stop_many/1, case_specific_node_name/3] +). + +-define(START_APPS, [emqx_eviction_agent, emqx_node_rebalance]). + +all() -> + emqx_common_test_helpers:all(?MODULE). + +init_per_suite(Config) -> + emqx_common_test_helpers:start_apps(?START_APPS), + Config. + +end_per_suite(Config) -> + emqx_common_test_helpers:stop_apps(lists:reverse(?START_APPS)), + Config. + +init_per_testcase(Case = t_rebalance, Config) -> + _ = emqx_node_rebalance_evacuation:stop(), + ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster( + [ + {case_specific_node_name(?MODULE, Case, '_donor'), 2883}, + {case_specific_node_name(?MODULE, Case, '_recipient'), 3883} + ], + ?START_APPS + ), + [{cluster_nodes, ClusterNodes} | Config]; +init_per_testcase(_Case, Config) -> + _ = emqx_node_rebalance_evacuation:stop(), + _ = emqx_node_rebalance:stop(), + Config. + +end_per_testcase(t_rebalance, Config) -> + _ = emqx_node_rebalance_evacuation:stop(), + _ = emqx_node_rebalance:stop(), + _ = emqx_eviction_agent_test_helpers:stop_cluster( + ?config(cluster_nodes, Config), + ?START_APPS + ); +end_per_testcase(_Case, _Config) -> + _ = emqx_node_rebalance_evacuation:stop(), + _ = emqx_node_rebalance:stop(). 
+ +%%-------------------------------------------------------------------- +%% Tests +%%-------------------------------------------------------------------- + +t_evacuation(_Config) -> + %% usage + ok = emqx_node_rebalance_cli:cli(["foobar"]), + + %% status + ok = emqx_node_rebalance_cli:cli(["status"]), + ok = emqx_node_rebalance_cli:cli(["node-status"]), + ok = emqx_node_rebalance_cli:cli(["node-status", atom_to_list(node())]), + + %% start with invalid args + ?assertNot( + emqx_node_rebalance_cli:cli(["start", "--evacuation", "--foo-bar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli:cli(["start", "--evacuation", "--conn-evict-rate", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli:cli(["start", "--evacuation", "--sess-evict-rate", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli:cli(["start", "--evacuation", "--wait-takeover", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli:cli([ + "start", + "--evacuation", + "--migrate-to", + "nonexistent@node" + ]) + ), + ?assertNot( + emqx_node_rebalance_cli:cli([ + "start", + "--evacuation", + "--migrate-to", + "" + ]) + ), + ?assertNot( + emqx_node_rebalance_cli:cli([ + "start", + "--evacuation", + "--unknown-arg" + ]) + ), + ?assert( + emqx_node_rebalance_cli:cli([ + "start", + "--evacuation", + "--conn-evict-rate", + "10", + "--sess-evict-rate", + "10", + "--wait-takeover", + "10", + "--migrate-to", + atom_to_list(node()), + "--redirect-to", + "srv" + ]) + ), + + %% status + ok = emqx_node_rebalance_cli:cli(["status"]), + ok = emqx_node_rebalance_cli:cli(["node-status"]), + ok = emqx_node_rebalance_cli:cli(["node-status", atom_to_list(node())]), + + ?assertMatch( + {enabled, #{}}, + emqx_node_rebalance_evacuation:status() + ), + + %% already enabled + ?assertNot( + emqx_node_rebalance_cli:cli([ + "start", + "--evacuation", + "--conn-evict-rate", + "10", + "--redirect-to", + "srv" + ]) + ), + + %% stop + true = emqx_node_rebalance_cli:cli(["stop"]), + + false = 
emqx_node_rebalance_cli:cli(["stop"]), + + ?assertEqual( + disabled, + emqx_node_rebalance_evacuation:status() + ). + +t_rebalance(Config) -> + process_flag(trap_exit, true), + + [{DonorNode, DonorPort}, {RecipientNode, _}] = ?config(cluster_nodes, Config), + + %% start with invalid args + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--foo-bar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--conn-evict-rate", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--abs-conn-threshold", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--rel-conn-threshold", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--sess-evict-rate", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--abs-sess-threshold", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--rel-sess-threshold", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--wait-takeover", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--wait-health-check", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, [ + "start", + "--nodes", + "nonexistent@node" + ]) + ), + ?assertNot( + emqx_node_rebalance_cli(DonorNode, [ + "start", + "--nodes", + "" + ]) + ), + ?assertNot( + emqx_node_rebalance_cli(DonorNode, [ + "start", + "--nodes", + atom_to_list(RecipientNode) + ]) + ), + ?assertNot( + emqx_node_rebalance_cli(DonorNode, [ + "start", + "--unknown-arg" + ]) + ), + + Conns = emqtt_connect_many(DonorPort, 20), + + ?assert( + emqx_node_rebalance_cli(DonorNode, [ + "start", + "--conn-evict-rate", + "10", + "--abs-conn-threshold", + "10", + "--rel-conn-threshold", + "1.1", + "--sess-evict-rate", + "10", + "--abs-sess-threshold", + "10", + "--rel-sess-threshold", + "1.1", + "--wait-takeover", + "10", + "--nodes", + atom_to_list(DonorNode) ++ "," ++ + 
atom_to_list(RecipientNode) + ]) + ), + + %% status + ok = emqx_node_rebalance_cli(DonorNode, ["status"]), + ok = emqx_node_rebalance_cli(DonorNode, ["node-status"]), + ok = emqx_node_rebalance_cli(DonorNode, ["node-status", atom_to_list(DonorNode)]), + + ?assertMatch( + {enabled, #{}}, + rpc:call(DonorNode, emqx_node_rebalance, status, []) + ), + + %% already enabled + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start"]) + ), + + %% stop + true = emqx_node_rebalance_cli(DonorNode, ["stop"]), + + false = emqx_node_rebalance_cli(DonorNode, ["stop"]), + + ?assertEqual( + disabled, + rpc:call(DonorNode, emqx_node_rebalance, status, []) + ), + + ok = stop_many(Conns). + +%%-------------------------------------------------------------------- +%% Helpers +%%-------------------------------------------------------------------- + +emqx_node_rebalance_cli(Node, Args) -> + case rpc:call(Node, emqx_node_rebalance_cli, cli, [Args]) of + {badrpc, Reason} -> + error(Reason); + Result -> + Result + end. diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl new file mode 100644 index 000000000..cdafad97a --- /dev/null +++ b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl @@ -0,0 +1,271 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_evacuation_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("emqx/include/emqx_mqtt.hrl"). +-include_lib("emqx/include/asserts.hrl"). +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). 
+ +-import( + emqx_eviction_agent_test_helpers, + [emqtt_connect/1, emqtt_try_connect/1, case_specific_node_name/3] +). + +all() -> [{group, one_node}, {group, two_node}]. + +groups() -> + [ + {one_node, [], [ + t_agent_busy, + t_already_started, + t_not_started, + t_start, + t_persistence, + t_unknown_messages + ]}, + {two_node, [], [ + t_conn_evicted, + t_migrate_to, + t_session_evicted + ]} + ]. + +init_per_suite(Config) -> + ok = emqx_common_test_helpers:start_apps([]), + Config. + +end_per_suite(_Config) -> + ok = emqx_common_test_helpers:stop_apps([]), + ok. + +init_per_group(one_node, Config) -> + [{cluster_type, one_node} | Config]; +init_per_group(two_node, Config) -> + [{cluster_type, two_node} | Config]. + +end_per_group(_Group, _Config) -> + ok. + +init_per_testcase(Case, Config) -> + NodesWithPorts = + case ?config(cluster_type, Config) of + one_node -> + [{case_specific_node_name(?MODULE, Case, '_evacuated'), 2883}]; + two_node -> + [ + {case_specific_node_name(?MODULE, Case, '_evacuated'), 2883}, + {case_specific_node_name(?MODULE, Case, '_recipient'), 3883} + ] + end, + ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster( + NodesWithPorts, + [emqx_eviction_agent, emqx_node_rebalance], + [{emqx, data_dir, case_specific_data_dir(Case, Config)}] + ), + ok = snabbkaffe:start_trace(), + [{cluster_nodes, ClusterNodes} | Config]. + +end_per_testcase(_Case, Config) -> + ok = snabbkaffe:stop(), + ok = emqx_eviction_agent_test_helpers:stop_cluster( + ?config(cluster_nodes, Config), + [emqx_eviction_agent, emqx_node_rebalance] + ). 
+ +%%-------------------------------------------------------------------- +%% Tests +%%-------------------------------------------------------------------- + +%% One node tests + +t_agent_busy(Config) -> + [{DonorNode, _DonorPort}] = ?config(cluster_nodes, Config), + ok = rpc:call(DonorNode, emqx_eviction_agent, enable, [other_rebalance, undefined]), + + ?assertEqual( + {error, eviction_agent_busy}, + rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [opts(Config)]) + ). + +t_already_started(Config) -> + [{DonorNode, _DonorPort}] = ?config(cluster_nodes, Config), + ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [opts(Config)]), + + ?assertEqual( + {error, already_started}, + rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [opts(Config)]) + ). + +t_not_started(Config) -> + [{DonorNode, _DonorPort}] = ?config(cluster_nodes, Config), + + ?assertEqual( + {error, not_started}, + rpc:call(DonorNode, emqx_node_rebalance_evacuation, stop, []) + ). + +t_start(Config) -> + process_flag(trap_exit, true), + + [{DonorNode, DonorPort}] = ?config(cluster_nodes, Config), + + ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [opts(Config)]), + ?assertMatch( + {error, {use_another_server, #{}}}, + emqtt_try_connect([{port, DonorPort}]) + ). 
+ +t_persistence(Config) -> + process_flag(trap_exit, true), + + [{DonorNode, DonorPort}] = ?config(cluster_nodes, Config), + + ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [opts(Config)]), + + ?assertMatch( + {error, {use_another_server, #{}}}, + emqtt_try_connect([{port, DonorPort}]) + ), + + ok = rpc:call(DonorNode, supervisor, terminate_child, [ + emqx_node_rebalance_sup, emqx_node_rebalance_evacuation + ]), + {ok, _} = rpc:call(DonorNode, supervisor, restart_child, [ + emqx_node_rebalance_sup, emqx_node_rebalance_evacuation + ]), + + ?assertMatch( + {error, {use_another_server, #{}}}, + emqtt_try_connect([{port, DonorPort}]) + ), + ?assertMatch( + {enabled, #{conn_evict_rate := 10}}, + rpc:call(DonorNode, emqx_node_rebalance_evacuation, status, []) + ). + +t_unknown_messages(Config) -> + process_flag(trap_exit, true), + + [{DonorNode, _DonorPort}] = ?config(cluster_nodes, Config), + + ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [opts(Config)]), + + Pid = rpc:call(DonorNode, erlang, whereis, [emqx_node_rebalance_evacuation]), + + Pid ! unknown, + + ok = gen_server:cast(Pid, unknown), + + ?assertEqual( + ignored, + gen_server:call(Pid, unknown) + ). + +%% Two node tests + +t_conn_evicted(Config) -> + process_flag(trap_exit, true), + + [{DonorNode, DonorPort}, _] = ?config(cluster_nodes, Config), + + {ok, C} = emqtt_connect([{clientid, <<"evacuated">>}, {port, DonorPort}]), + + ?assertWaitEvent( + ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [opts(Config)]), + #{?snk_kind := node_evacuation_evict_conn}, + 1000 + ), + + ?assertMatch( + {error, {use_another_server, #{}}}, + emqtt_try_connect([{clientid, <<"connecting">>}, {port, DonorPort}]) + ), + + receive + {'EXIT', C, {disconnected, 156, _}} -> ok + after 1000 -> + ct:fail("Connection not evicted") + end. 
+ +t_migrate_to(Config) -> + [{DonorNode, _DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config), + + ?assertEqual( + [RecipientNode], + rpc:call(DonorNode, emqx_node_rebalance_evacuation, migrate_to, [undefined]) + ), + + ?assertEqual( + [], + rpc:call(DonorNode, emqx_node_rebalance_evacuation, migrate_to, [['unknown@node']]) + ), + + ok = rpc:call(RecipientNode, emqx_eviction_agent, enable, [test_rebalance, undefined]), + + ?assertEqual( + [], + rpc:call(DonorNode, emqx_node_rebalance_evacuation, migrate_to, [undefined]) + ). + +t_session_evicted(Config) -> + process_flag(trap_exit, true), + + [{DonorNode, DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config), + + {ok, C} = emqtt_connect([ + {port, DonorPort}, {clientid, <<"client_with_sess">>}, {clean_start, false} + ]), + + ?assertWaitEvent( + ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [opts(Config)]), + #{?snk_kind := node_evacuation_evict_sess_over}, + 5000 + ), + + receive + {'EXIT', C, {disconnected, ?RC_USE_ANOTHER_SERVER, _}} -> ok + after 1000 -> + ct:fail("Connection not evicted") + end, + + [ChannelPid] = rpc:call(DonorNode, emqx_cm_registry, lookup_channels, [<<"client_with_sess">>]), + + ?assertEqual( + RecipientNode, + node(ChannelPid) + ). + +%%-------------------------------------------------------------------- +%% Helpers +%%-------------------------------------------------------------------- + +opts(Config) -> + #{ + server_reference => <<"srv">>, + conn_evict_rate => 10, + sess_evict_rate => 10, + wait_takeover => 1, + migrate_to => migrate_to(Config) + }. + +migrate_to(Config) -> + case ?config(cluster_type, Config) of + one_node -> + []; + two_node -> + [_, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config), + [RecipientNode] + end. + +case_specific_data_dir(Case, Config) -> + case ?config(priv_dir, Config) of + undefined -> undefined; + PrivDir -> filename:join(PrivDir, atom_to_list(Case)) + end. 
diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl new file mode 100644 index 000000000..450280cb8 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl @@ -0,0 +1,108 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_evacuation_persist_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). + +all() -> + emqx_common_test_helpers:all(?MODULE). + +init_per_suite(Config) -> + Config. + +end_per_suite(_Config) -> + ok. + +init_per_testcase(_Case, Config) -> + _ = emqx_node_rebalance_evacuation_persist:clear(), + Config. + +end_per_testcase(_Case, _Config) -> + _ = emqx_node_rebalance_evacuation_persist:clear(). + +%%-------------------------------------------------------------------- +%% Tests +%%-------------------------------------------------------------------- + +t_save_read(_Config) -> + DefaultOpts = #{ + server_reference => <<"default_ref">>, + conn_evict_rate => 2001, + sess_evict_rate => 2002, + wait_takeover => 2003 + }, + + Opts0 = #{ + server_reference => <<"ref">>, + conn_evict_rate => 1001, + sess_evict_rate => 1002, + wait_takeover => 1003 + }, + ok = emqx_node_rebalance_evacuation_persist:save(Opts0), + + {ok, ReadOpts0} = emqx_node_rebalance_evacuation_persist:read(DefaultOpts), + ?assertEqual(Opts0, ReadOpts0), + + Opts1 = Opts0#{server_reference => undefined}, + ok = emqx_node_rebalance_evacuation_persist:save(Opts1), + + {ok, ReadOpts1} = emqx_node_rebalance_evacuation_persist:read(DefaultOpts), + ?assertEqual(Opts1, ReadOpts1). 
+ +t_read_default(_Config) -> + ok = write_evacuation_file(<<"{}">>), + + DefaultOpts = #{ + server_reference => <<"ref">>, + conn_evict_rate => 1001, + sess_evict_rate => 1002, + wait_takeover => 1003 + }, + + {ok, ReadOpts} = emqx_node_rebalance_evacuation_persist:read(DefaultOpts), + ?assertEqual(DefaultOpts, ReadOpts). + +t_read_bad_data(_Config) -> + ok = write_evacuation_file(<<"{bad json">>), + + DefaultOpts = #{ + server_reference => <<"ref">>, + conn_evict_rate => 1001, + sess_evict_rate => 1002, + wait_takeover => 1003 + }, + + {ok, ReadOpts} = emqx_node_rebalance_evacuation_persist:read(DefaultOpts), + ?assertEqual(DefaultOpts, ReadOpts). + +t_clear(_Config) -> + ok = write_evacuation_file(<<"{}">>), + + ?assertMatch( + {ok, _}, + emqx_node_rebalance_evacuation_persist:read(#{}) + ), + + ok = emqx_node_rebalance_evacuation_persist:clear(), + + ?assertEqual( + none, + emqx_node_rebalance_evacuation_persist:read(#{}) + ). + +%%-------------------------------------------------------------------- +%% Helpers +%%-------------------------------------------------------------------- + +write_evacuation_file(Json) -> + ok = filelib:ensure_dir(emqx_node_rebalance_evacuation_persist:evacuation_filepath()), + ok = file:write_file( + emqx_node_rebalance_evacuation_persist:evacuation_filepath(), + Json + ). 
diff --git a/mix.exs b/mix.exs index 564d81ccf..cafeec7bd 100644 --- a/mix.exs +++ b/mix.exs @@ -402,7 +402,9 @@ defmodule EMQXUmbrella.MixProject do emqx_oracle: :permanent, emqx_bridge_oracle: :permanent, emqx_bridge_rabbitmq: :permanent, - emqx_ee_schema_registry: :permanent + emqx_ee_schema_registry: :permanent, + emqx_eviction_agent: :permanent, + emqx_node_rebalance: :permanent ], else: [] ) diff --git a/rebar.config.erl b/rebar.config.erl index a48a365c9..d556b41aa 100644 --- a/rebar.config.erl +++ b/rebar.config.erl @@ -478,7 +478,9 @@ relx_apps_per_edition(ee) -> emqx_oracle, emqx_bridge_oracle, emqx_bridge_rabbitmq, - emqx_ee_schema_registry + emqx_ee_schema_registry, + emqx_eviction_agent, + emqx_node_rebalance ]; relx_apps_per_edition(ce) -> []. From e683d28973c5aa9e2e52bcf19b9b2158657e92f3 Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Thu, 4 May 2023 23:33:46 +0300 Subject: [PATCH 2/8] chore(rebalance): rebase and review fixes --- apps/emqx/src/emqx_cm.erl | 6 +- apps/emqx_utils/src/emqx_utils_api.erl | 4 +- .../i18n/emqx_eviction_agent_api_i18n.conf | 14 - .../src/emqx_eviction_agent.erl | 4 +- .../src/emqx_eviction_agent_app.erl | 2 - .../src/emqx_eviction_agent_channel.erl | 20 +- .../test/emqx_eviction_agent_SUITE.erl | 70 ++- .../test/emqx_eviction_agent_test_helpers.erl | 11 +- .../i18n/emqx_node_rebalance_api_i18n.conf | 490 ------------------ .../src/emqx_node_rebalance_api.erl | 41 +- .../src/emqx_node_rebalance_cli.erl | 4 +- ...emqx_node_rebalance_evacuation_persist.erl | 4 +- .../src/emqx_node_rebalance_status.erl | 2 +- .../test/emqx_node_rebalance_api_SUITE.erl | 6 +- .../emqx_node_rebalance_evacuation_SUITE.erl | 25 +- rel/i18n/emqx_eviction_agent_api.hocon | 9 + rel/i18n/emqx_node_rebalance_api.hocon | 267 ++++++++++ rel/i18n/zh/emqx_eviction_agent_api.hocon | 9 + rel/i18n/zh/emqx_node_rebalance_api.hocon | 267 ++++++++++ 19 files changed, 673 insertions(+), 582 deletions(-) delete mode 100644 
lib-ee/emqx_eviction_agent/i18n/emqx_eviction_agent_api_i18n.conf delete mode 100644 lib-ee/emqx_node_rebalance/i18n/emqx_node_rebalance_api_i18n.conf create mode 100644 rel/i18n/emqx_eviction_agent_api.hocon create mode 100644 rel/i18n/emqx_node_rebalance_api.hocon create mode 100644 rel/i18n/zh/emqx_eviction_agent_api.hocon create mode 100644 rel/i18n/zh/emqx_node_rebalance_api.hocon diff --git a/apps/emqx/src/emqx_cm.erl b/apps/emqx/src/emqx_cm.erl index c8296f317..66c1db36e 100644 --- a/apps/emqx/src/emqx_cm.erl +++ b/apps/emqx/src/emqx_cm.erl @@ -602,14 +602,14 @@ all_channels() -> ets:select(?CHAN_TAB, Pat). %% @doc Get clientinfo for all clients with sessions -channel_with_session_table(ConnModules) -> +channel_with_session_table(ConnModuleList) -> Ms = ets:fun2ms( fun({{ClientId, _ChanPid}, Info, _Stats}) -> {ClientId, Info} end ), Table = ets:table(?CHAN_INFO_TAB, [{traverse, {select, Ms}}]), - ConnModuleMap = maps:from_list([{Mod, true} || Mod <- ConnModules]), + ConnModules = sets:from_list(ConnModuleList, [{version, 2}]), qlc:q([ {ClientId, ConnState, ConnInfo, ClientInfo} || {ClientId, #{ @@ -618,7 +618,7 @@ channel_with_session_table(ConnModules) -> conninfo := #{clean_start := false, conn_mod := ConnModule} = ConnInfo }} <- Table, - maps:is_key(ConnModule, ConnModuleMap) + sets:is_element(ConnModule, ConnModules) ]). %% @doc Get all local connection query handle diff --git a/apps/emqx_utils/src/emqx_utils_api.erl b/apps/emqx_utils/src/emqx_utils_api.erl index e6bd07272..a1bc97cd6 100644 --- a/apps/emqx_utils/src/emqx_utils_api.erl +++ b/apps/emqx_utils/src/emqx_utils_api.erl @@ -72,4 +72,6 @@ is_running_node(Node) -> handle_result({ok, Result}) -> ?OK(Result); handle_result({error, Reason}) -> - ?BAD_REQUEST(Reason). + ?BAD_REQUEST(Reason); +handle_result({HTTPCode, Content}) when is_integer(HTTPCode) -> + {HTTPCode, Content}. 
diff --git a/lib-ee/emqx_eviction_agent/i18n/emqx_eviction_agent_api_i18n.conf b/lib-ee/emqx_eviction_agent/i18n/emqx_eviction_agent_api_i18n.conf deleted file mode 100644 index 8bb7282c3..000000000 --- a/lib-ee/emqx_eviction_agent/i18n/emqx_eviction_agent_api_i18n.conf +++ /dev/null @@ -1,14 +0,0 @@ -emqx_eviction_agent_api { - - node_eviction_status_get { - desc { - en: "Get the node eviction status" - zh: "获取节点驱逐状态" - } - label { - en: "Node Eviction Status" - zh: "节点驱逐状态" - } - } - -} diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.erl index b8e1b5236..9a29adc69 100644 --- a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.erl +++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.erl @@ -48,7 +48,9 @@ -export_type([server_reference/0]). --define(CONN_MODULES, [emqx_connection, emqx_ws_connection, emqx_eviction_agent_channel]). +-define(CONN_MODULES, [ + emqx_connection, emqx_ws_connection, emqx_quic_connection, emqx_eviction_agent_channel +]). %%-------------------------------------------------------------------- %% APIs diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_app.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_app.erl index 63af59b09..90b09884f 100644 --- a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_app.erl +++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_app.erl @@ -6,8 +6,6 @@ -behaviour(application). --emqx_plugin(?MODULE). - -export([ start/2, stop/1 diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl index a42033c0f..a6097f03d 100644 --- a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl +++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl @@ -13,8 +13,6 @@ -include_lib("snabbkaffe/include/snabbkaffe.hrl"). --logger_header("[Evicted Channel]"). 
- -export([ start_link/1, start_supervised/1, @@ -33,13 +31,6 @@ code_change/3 ]). --import( - emqx_misc, - [ - maybe_apply/2 - ] -). - -type opts() :: #{ conninfo := emqx_types:conninfo(), clientinfo := emqx_types:clientinfo() @@ -133,7 +124,7 @@ handle_call( ) -> ok = emqx_session:takeover(Session), %% TODO: Should not drain deliver here (side effect) - Delivers = emqx_misc:drain_deliver(), + Delivers = emqx_utils:drain_deliver(), AllPendings = lists:append(Delivers, Pendings), ?tp( debug, @@ -156,7 +147,7 @@ handle_call(Req, _From, Channel) -> {reply, ignored, Channel}. handle_info(Deliver = {deliver, _Topic, _Msg}, Channel) -> - Delivers = [Deliver | emqx_misc:drain_deliver()], + Delivers = [Deliver | emqx_utils:drain_deliver()], {noreply, handle_deliver(Delivers, Channel)}; handle_info(expire_session, Channel) -> {stop, expired, Channel}; @@ -186,7 +177,6 @@ code_change(_OldVsn, Channel, _Extra) -> %% Internal functions %%-------------------------------------------------------------------- -%% TODO: sync with emqx_channel handle_deliver( Delivers, #{ @@ -239,7 +229,7 @@ set_expiry_timer(#{conninfo := ConnInfo} = Channel) -> open_session(ConnInfo, #{clientid := ClientId} = ClientInfo) -> Channel = channel(ConnInfo, ClientInfo), - case emqx_cm:open_session(false, ClientInfo, ConnInfo) of + case emqx_cm:open_session(_CleanSession = false, ClientInfo, ConnInfo) of {ok, #{present := false}} -> ?SLOG( info, @@ -259,7 +249,7 @@ open_session(ConnInfo, #{clientid := ClientId} = ClientInfo) -> node => node() } ), - Pendings1 = lists:usort(lists:append(Pendings0, emqx_misc:drain_deliver())), + Pendings1 = lists:usort(lists:append(Pendings0, emqx_utils:drain_deliver())), NSession = emqx_session:enqueue( ClientInfo, emqx_session:ignore_local( @@ -352,7 +342,7 @@ info(Channel) -> #{ conninfo => maps:get(conninfo, Channel, undefined), clientinfo => maps:get(clientinfo, Channel, undefined), - session => maybe_apply( + session => emqx_utils:maybe_apply( fun 
emqx_session:info/1, maps:get(session, Channel, undefined) ), diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl index 0574ccec3..22b694d77 100644 --- a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl +++ b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl @@ -362,13 +362,77 @@ t_will_msg(_Config) -> ok = emqtt:disconnect(C). +t_ws_conn(_Config) -> + erlang:process_flag(trap_exit, true), + + ClientId = <<"ws_client">>, + {ok, C} = emqtt:start_link([ + {proto_ver, v5}, + {clientid, ClientId}, + {port, 8083}, + {ws_path, "/mqtt"} + ]), + {ok, _} = emqtt:ws_connect(C), + + ok = emqx_eviction_agent:enable(test_eviction, undefined), + + ?assertEqual( + 1, + emqx_eviction_agent:connection_count() + ), + + ?assertWaitEvent( + ok = emqx_eviction_agent:evict_connections(1), + #{?snk_kind := emqx_cm_connected_client_count_dec}, + 1000 + ), + + ?assertEqual( + 0, + emqx_eviction_agent:connection_count() + ). + +-ifndef(BUILD_WITHOUT_QUIC). + +t_quic_conn(_Config) -> + erlang:process_flag(trap_exit, true), + + QuicPort = emqx_common_test_helpers:select_free_port(quic), + application:ensure_all_started(quicer), + emqx_common_test_helpers:ensure_quic_listener(?MODULE, QuicPort), + + ClientId = <<"quic_client">>, + {ok, C} = emqtt:start_link([ + {proto_ver, v5}, + {clientid, ClientId}, + {port, QuicPort} + ]), + {ok, _} = emqtt:quic_connect(C), + + ok = emqx_eviction_agent:enable(test_eviction, undefined), + + ?assertEqual( + 1, + emqx_eviction_agent:connection_count() + ), + + ?assertWaitEvent( + ok = emqx_eviction_agent:evict_connections(1), + #{?snk_kind := emqx_cm_connected_client_count_dec}, + 1000 + ), + + ?assertEqual( + 0, + emqx_eviction_agent:connection_count() + ). + +-endif. 
+ %%-------------------------------------------------------------------- %% Helpers %%-------------------------------------------------------------------- -% sn_connect_and_subscribe(ClientId, Topic) -> -% emqx_eviction_agent_test_helpers:sn_connect_and_subscribe(ClientId, Topic). - assert_receive_publish([]) -> ok; assert_receive_publish([#{payload := Msg, topic := Topic} | Rest]) -> diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl index 8f88ebf97..3953ec3e2 100644 --- a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl +++ b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl @@ -81,12 +81,11 @@ start_cluster(NamesWithPorts, Apps, Env) -> NamesWithPorts ), Opts0 = [ - {env, [{emqx, boot_modules, [broker, listeners]}]}, + {env, [{emqx, boot_modules, [broker, listeners]}] ++ Env}, {apps, Apps}, {conf, [{[listeners, Proto, default, enabled], false} || Proto <- [ssl, ws, wss]] ++ - [{[rpc, mode], async}]}, - {env, Env} + [{[rpc, mode], async}]} ], Cluster = emqx_common_test_helpers:emqx_cluster( Specs, @@ -99,12 +98,6 @@ start_cluster(NamesWithPorts, Apps, Env) -> } || {Name, Opts} <- Cluster ], - ok = lists:foreach( - fun({Node, _Port}) -> - snabbkaffe:forward_trace(Node) - end, - NodesWithPorts - ), NodesWithPorts. 
stop_cluster(NodesWithPorts, Apps) -> diff --git a/lib-ee/emqx_node_rebalance/i18n/emqx_node_rebalance_api_i18n.conf b/lib-ee/emqx_node_rebalance/i18n/emqx_node_rebalance_api_i18n.conf deleted file mode 100644 index f5f161a92..000000000 --- a/lib-ee/emqx_node_rebalance/i18n/emqx_node_rebalance_api_i18n.conf +++ /dev/null @@ -1,490 +0,0 @@ -emqx_node_rebalance_api { - - ## API Request Fields - - load_rebalance_status { - desc { - en: "Get rebalance status of the current node" - zh: "获取当前节点的rebalance状态" - } - label { - en: "Get rebalance status" - zh: "获取rebalance状态" - } - } - - load_rebalance_global_status { - desc { - en: "Get status of all rebalance/evacuation processes across the cluster" - zh: "获取集群中所有rebalance/evacuation进程的状态" - } - label { - en: "Get global rebalance status" - zh: "获取全局rebalance状态" - } - } - - load_rebalance_availability_check { - desc { - en: "Check if the node is being evacuated or rebalanced" - zh: "检查节点是否正在被evacuate或rebalance" - } - label { - en: "Availability check" - zh: "可用性检查" - } - } - - load_rebalance_start { - desc { - en: "Start rebalance process" - zh: "启动rebalance进程" - } - label { - en: "Start rebalance" - zh: "启动rebalance" - } - } - - load_rebalance_stop { - desc { - en: "Stop rebalance process" - zh: "停止rebalance进程" - } - label { - en: "Stop rebalance" - zh: "停止rebalance" - } - } - - load_rebalance_evacuation_start { - desc { - en: "Start evacuation process" - zh: "启动evacuation进程" - } - label { - en: "Start evacuation" - zh: "启动evacuation" - } - } - - load_rebalance_evacuation_stop { - desc { - en: "Stop evacuation process" - zh: "停止evacuation进程" - } - label { - en: "Stop evacuation" - zh: "停止evacuation" - } - } - - param_node { - desc { - en: "Node name" - zh: "节点名称" - } - label { - en: "Node name" - zh: "节点名称" - } - } - - wait_health_check { - desc { - en: "Time to wait before starting the rebalance process, in seconds" - zh: "启动rebalance进程前等待的时间,单位为秒" - } - label { - en: "Wait health check" - zh: "等待健康检查" - } - } - - 
conn_evict_rate { - desc { - en: "The rate of evicting connections, in connections per second" - zh: "逐出连接的速率,以每秒连接数表示" - } - label { - en: "Connection eviction rate" - zh: "连接驱逐率" - } - } - - sess_evict_rate { - desc { - en: "The rate of evicting sessions, in sessions per second" - zh: "逐出会话的速率,以每秒会话为单位" - } - label { - en: "Session eviction rate" - zh: "会话驱逐率" - } - } - - abs_conn_threshold { - desc { - en: "Maximum desired difference between the number of connections on the node and the average number of connections on the recipient nodes" - zh: "节点上的连接数与接收节点上的平均连接数之间的最大期望差值" - } - label { - en: "Absolute connection threshold" - zh: "绝对连接阈值" - } - } - - rel_conn_threshold { - desc { - en: "Maximum desired fraction between the number of connections on the node and the average number of connections on the recipient nodes" - zh: "节点上的连接数与接收节点上的平均连接数之间的最大期望分数" - } - label { - en: "Relative connection threshold" - zh: "相对连接阈值" - } - } - - abs_sess_threshold { - desc { - en: "Maximum desired difference between the number of sessions on the node and the average number of sessions on the recipient nodes" - zh: "节点上的会话数与接收节点上的平均会话数之间的最大期望差异" - } - label { - en: "Absolute session threshold" - zh: "绝对会话阈值" - } - } - - rel_sess_threshold { - desc { - en: "Maximum desired fraction between the number of sessions on the node and the average number of sessions on the recipient nodes" - zh: "节点上的会话数与接收节点上的平均会话数之间的最大期望分数" - } - label { - en: "Relative session threshold" - zh: "相对会话阈值" - } - } - - wait_takeover { - desc { - en: "Time to wait before starting session evacuation process, in seconds" - zh: "开始会话疏散过程之前等待的时间,以秒为单位" - } - label { - en: "Wait takeover" - zh: "等待接管" - } - } - - redirect_to { - desc { - en: "Server reference to redirect clients to (MQTTv5 Server redirection)" - zh: "将客户端重定向到的服务器参考(MQTTv5 服务器重定向)" - } - label { - en: "Redirect to" - zh: "重定向至" - } - } - - migrate_to { - desc { - en: "Nodes to migrate sessions to" - zh: "将会话迁移到的节点" - } - label { - en: 
"Migrate to" - zh: "迁移到" - } - } - - rebalance_nodes { - desc { - en: "Nodes to participate in rebalance" - zh: "参与rebalance的节点" - } - label { - en: "Rebalance nodes" - zh: "重新平衡节点" - } - } - - ## API Response Fields - - local_status_enabled { - desc { - en: "Whether the node is being evacuated" - zh: "节点是否正在撤离" - } - label { - en: "Local evacuation status" - zh: "当地避难状况" - } - } - - local_status_process { - desc { - en: "The process that is being performed on the node: evacuation or rebalance" - zh: "正在节点上执行的过程:疏散或重新平衡" - } - label { - en: "Node process" - zh: "节点进程" - } - } - - local_status_state { - desc { - en: "The state of the process that is being performed on the node" - zh: "正在节点上执行的进程的状态" - } - label { - en: "Rebalance/evacuation current state" - zh: "重新平衡/疏散当前状态" - } - } - - local_status_coordinator_node { - desc { - en: "The node that is coordinating rebalance process" - zh: "协调再平衡过程的节点" - } - label { - en: "Coordinator node" - zh: "协调节点" - } - } - - local_status_connection_eviction_rate { - desc { - en: "The rate of evicting connections, in connections per second" - zh: "逐出连接的速率,以每秒连接数表示" - } - label { - en: "Connection eviction rate" - zh: "连接驱逐率" - } - } - - local_status_session_eviction_rate { - desc { - en: "The rate of evicting sessions, in sessions per second" - zh: "逐出会话的速率,以每秒会话为单位" - } - label { - en: "Session eviction rate" - zh: "会话驱逐率" - } - } - - local_status_connection_goal { - desc { - en: "The number of connections that the node should have after the rebalance/evacuation process" - zh: "节点在重新平衡/疏散过程后应该拥有的连接数" - } - label { - en: "Connection goal" - zh: "连接目标" - } - } - - local_status_session_goal { - desc { - en: "The number of sessions that the node should have after the evacuation process" - zh: "疏散过程后节点应有的会话数" - } - label { - en: "Session goal" - zh: "会话目标" - } - } - - local_status_disconnected_session_goal { - desc { - en: "The number of disconnected sessions that the node should have after the rebalance process" - zh: 
"重新平衡过程后节点应具有的断开连接的会话数" - } - label { - en: "Disconnected session goal" - zh: "断开连接的会话目标" - } - } - - local_status_session_recipients { - desc { - en: "List of nodes to which sessions are being evacuated" - zh: "会话被疏散到的节点列表" - } - label { - en: "Session recipients" - zh: "会话收件人" - } - } - - local_status_recipients { - desc { - en: "List of nodes to which connections/sessions are being evacuated during rebalance" - zh: "在重新平衡期间连接/会话被疏散到的节点列表" - } - label { - en: "Recipients" - zh: "收件人" - } - } - - local_status_stats { - desc { - en: "Statistics of the evacuation/rebalance process" - zh: "疏散/再平衡过程的统计" - } - label { - en: "Statistics" - zh: "统计数据" - } - } - - status_stats_initial_connected { - desc { - en: "The number of connections on the node before the evacuation/rebalance process" - zh: "疏散/重新平衡过程之前节点上的连接数" - } - label { - en: "Initial connected" - zh: "初始连接" - } - } - - status_stats_current_connected { - desc { - en: "Current number of connections on the node" - zh: "节点上的当前连接数" - } - label { - en: "Current connections" - zh: "当前连接" - } - } - - status_stats_initial_sessions { - desc { - en: "The number of sessions on the node before the evacuation/rebalance process" - zh: "疏散/重新平衡过程之前节点上的会话数" - } - label { - en: "Initial sessions" - zh: "初始会话" - } - } - - status_stats_current_sessions { - desc { - en: "Current number of sessions on the node" - zh: "节点上的当前会话数" - } - label { - en: "Current sessions" - zh: "当前会话" - } - } - - status_stats_current_disconnected_sessions { - desc { - en: "Current number of disconnected sessions on the node" - zh: "节点上当前断开连接的会话数" - } - label { - en: "Current disconnected sessions" - zh: "当前断开连接的会话" - } - } - - coordinator_status_donors { - desc { - en: "List of nodes from which connections/sessions are being evacuated" - zh: "正在疏散连接/会话的节点列表" - } - label { - en: "Donors" - zh: "捐助者" - } - } - - coordinator_status_donor_conn_avg { - desc { - en: "Average number of connections per donor node" - zh: "每个供体节点的平均连接数" - } - label { - en: "Donor 
connections average" - zh: "捐助者连接平均值" - } - } - - coordinator_status_donor_sess_avg { - desc { - en: "Average number of sessions per donor node" - zh: "每个供体节点的平均会话数" - } - label { - en: "Donor sessions average" - zh: "平均捐助会议" - } - } - - coordinator_status_node { - desc { - en: "The node that is coordinating the evacuation/rebalance process" - zh: "协调疏散/再平衡过程的节点" - } - label { - en: "Coordinator node" - zh: "协调节点" - } - } - - evacuation_status_node { - desc { - en: "The node that is being evacuated" - zh: "正在撤离的节点" - } - label { - en: "Evacuated node" - zh: "疏散节点" - } - } - - global_status_evacuations { - desc { - en: "List of nodes that are being evacuated" - zh: "正在撤离的节点列表" - } - label { - en: "Evacuations" - zh: "疏散" - } - } - - global_status_rebalances { - desc { - en: "List of nodes that coordinate a rebalance" - zh: "协调再平衡的节点列表" - } - label { - en: "Rebalances" - zh: "再平衡" - } - } - - empty_response { - desc { - en: "The response is empty" - zh: "响应为空" - } - label { - en: "Empty response" - zh: "空响应" - } - } -} diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_api.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_api.erl index fa322d146..1f6328a63 100644 --- a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_api.erl +++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_api.erl @@ -8,6 +8,7 @@ -include_lib("typerefl/include/types.hrl"). -include_lib("hocon/include/hoconsc.hrl"). -include_lib("emqx/include/logger.hrl"). +-include_lib("emqx_utils/include/emqx_utils_api.hrl"). %% Swagger specs from hocon schema -export([ @@ -44,9 +45,9 @@ -import(emqx_dashboard_swagger, [error_codes/2]). -define(BAD_REQUEST, 'BAD_REQUEST'). --define(NODE_UNAVAILABLE, 'NODE_UNAVAILABLE'). -define(NODE_EVACUATING, 'NODE_EVACUATING'). -define(RPC_ERROR, 'RPC_ERROR'). +-define(NOT_FOUND, 'NOT_FOUND'). 
%%-------------------------------------------------------------------- %% API Spec @@ -120,7 +121,8 @@ schema("/load_rebalance/:node/start") -> ), responses => #{ 200 => response_schema(), - 400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>) + 400 => error_codes([?BAD_REQUEST], <<"Bad Request">>), + 404 => error_codes([?NOT_FOUND], <<"Not Found">>) } } }; @@ -134,7 +136,8 @@ schema("/load_rebalance/:node/stop") -> parameters => [param_node()], responses => #{ 200 => response_schema(), - 400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>) + 400 => error_codes([?BAD_REQUEST], <<"Bad Request">>), + 404 => error_codes([?NOT_FOUND], <<"Not Found">>) } } }; @@ -153,7 +156,8 @@ schema("/load_rebalance/:node/evacuation/start") -> ), responses => #{ 200 => response_schema(), - 400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>) + 400 => error_codes([?BAD_REQUEST], <<"Bad Request">>), + 404 => error_codes([?NOT_FOUND], <<"Not Found">>) } } }; @@ -167,7 +171,8 @@ schema("/load_rebalance/:node/evacuation/stop") -> parameters => [param_node()], responses => #{ 200 => response_schema(), - 400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>) + 400 => error_codes([?BAD_REQUEST], <<"Bad Request">>), + 404 => error_codes([?NOT_FOUND], <<"Not Found">>) } } }. @@ -205,7 +210,7 @@ schema("/load_rebalance/:node/evacuation/stop") -> end. '/load_rebalance/:node/start'(post, #{bindings := #{node := NodeBin}, body := Params0}) -> - with_node(NodeBin, fun(Node) -> + emqx_utils_api:with_node(NodeBin, fun(Node) -> Params1 = translate(rebalance_start, Params0), with_nodes_at_key(nodes, Params1, fun(Params2) -> wrap_rpc( @@ -215,7 +220,7 @@ schema("/load_rebalance/:node/evacuation/stop") -> end). 
'/load_rebalance/:node/stop'(post, #{bindings := #{node := NodeBin}}) -> - with_node(NodeBin, fun(Node) -> + emqx_utils_api:with_node(NodeBin, fun(Node) -> wrap_rpc( Node, emqx_node_rebalance_api_proto_v1:node_rebalance_stop(Node) ) @@ -224,7 +229,7 @@ schema("/load_rebalance/:node/evacuation/stop") -> '/load_rebalance/:node/evacuation/start'(post, #{ bindings := #{node := NodeBin}, body := Params0 }) -> - with_node(NodeBin, fun(Node) -> + emqx_utils_api:with_node(NodeBin, fun(Node) -> Params1 = translate(rebalance_evacuation_start, Params0), with_nodes_at_key(migrate_to, Params1, fun(Params2) -> wrap_rpc( @@ -237,7 +242,7 @@ schema("/load_rebalance/:node/evacuation/stop") -> end). '/load_rebalance/:node/evacuation/stop'(post, #{bindings := #{node := NodeBin}}) -> - with_node(NodeBin, fun(Node) -> + emqx_utils_api:with_node(NodeBin, fun(Node) -> wrap_rpc( Node, emqx_node_rebalance_api_proto_v1:node_rebalance_evacuation_stop(Node) ) @@ -288,19 +293,13 @@ validate_nodes(Key, Params) when is_map_key(Key, Params) -> validate_nodes(_Key, Params) -> {ok, Params}. -with_node(BinNode, Fun) -> - case parse_node(BinNode) of - {ok, Node} -> Fun(Node); - {error, _} -> error_response(400, ?BAD_REQUEST, [<<"Invalid node: ">>, BinNode]) - end. - with_nodes_at_key(Key, Params, Fun) -> Res = validate_nodes(Key, Params), case Res of {ok, Params1} -> Fun(Params1); {error, {unavailable, Nodes}} -> - error_response(400, ?NODE_UNAVAILABLE, io_lib:format("Nodes unavailable: ~p", [Nodes])); + error_response(400, ?NOT_FOUND, io_lib:format("Nodes unavailable: ~p", [Nodes])); {error, {invalid, Nodes}} -> error_response(400, ?BAD_REQUEST, io_lib:format("Invalid nodes: ~p", [Nodes])) end. @@ -322,10 +321,7 @@ format_as_map_list(List) -> ). error_response(HttpCode, Code, Message) -> - {HttpCode, #{ - code => atom_to_binary(Code), - message => iolist_to_binary(Message) - }}. + {HttpCode, ?ERROR_MSG(Code, Message)}. 
without(Keys, Props) -> lists:filter( @@ -470,11 +466,10 @@ fields(rebalance_evacuation_start) -> )}, {"migrate_to", mk( - list(binary()), + nonempty_list(binary()), #{ desc => ?DESC(migrate_to), - required => false, - validator => [fun(Values) -> length(Values) > 0 end] + required => false } )} ]; diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl index a2706f13b..3bafb9ffe 100644 --- a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl +++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl @@ -43,7 +43,7 @@ cli(["start" | StartArgs]) -> false end; cli(["node-status", NodeStr]) -> - case emqx_misc:safe_to_existing_atom(NodeStr, utf8) of + case emqx_utils:safe_to_existing_atom(NodeStr, utf8) of {ok, Node} -> node_status(emqx_node_rebalance_status:local_status(Node)); {error, _} -> @@ -297,7 +297,7 @@ strings_to_atoms(Strings) -> strings_to_atoms([], Atoms, Invalid) -> {lists:reverse(Atoms), lists:reverse(Invalid)}; strings_to_atoms([Str | Rest], Atoms, Invalid) -> - case emqx_misc:safe_to_existing_atom(Str, utf8) of + case emqx_utils:safe_to_existing_atom(Str, utf8) of {ok, Atom} -> strings_to_atoms(Rest, [Atom | Atoms], Invalid); {error, _} -> diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl index 3fc9faeea..6b145c699 100644 --- a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl +++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl @@ -55,7 +55,7 @@ save( Filepath = evacuation_filepath(), case filelib:ensure_dir(Filepath) of ok -> - JsonData = emqx_json:encode( + JsonData = emqx_utils_json:encode( prepare_for_encode(maps:with(persist_keys(), Data)), [pretty] ), @@ -72,7 +72,7 @@ clear() -> read(DefaultOpts) -> case file:read_file(evacuation_filepath()) of {ok, Data} -> - case emqx_json:safe_decode(Data, 
[return_maps]) of + case emqx_utils_json:safe_decode(Data, [return_maps]) of {ok, Map} when is_map(Map) -> {ok, map_to_opts(DefaultOpts, Map)}; _NotAMap -> diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_status.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_status.erl index 63675a3da..1d45d64e8 100644 --- a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_status.erl +++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_status.erl @@ -208,7 +208,7 @@ format_local_status_field({session_goal, SessGoal}) -> format_local_status_field({disconnected_session_goal, DisconnSessGoal}) -> io_lib:format("Disconnected session goal: ~p~n", [DisconnSessGoal]); format_local_status_field({session_recipients, SessionRecipients}) -> - io_lib:format("Session recipient nodes: ~p~n", [SessionRecipients]); + io_lib:format("Session recipient nodes: ~p~n", [SessionRecipients]); format_local_status_field({recipients, Recipients}) -> io_lib:format("Recipient nodes: ~p~n", [Recipients]); format_local_status_field({donors, Donors}) -> diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl index 21608b8bc..d8202a33e 100644 --- a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl +++ b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl @@ -88,7 +88,7 @@ t_start_evacuation_validation(Config) -> BadOpts ), ?assertMatch( - {ok, 400, #{}}, + {ok, 404, #{}}, api_post( ["load_rebalance", "bad@node", "evacuation", "start"], #{} @@ -148,7 +148,7 @@ t_start_rebalance_validation(Config) -> BadOpts ), ?assertMatch( - {ok, 400, #{}}, + {ok, 404, #{}}, api_post( ["load_rebalance", "bad@node", "start"], #{} @@ -346,7 +346,7 @@ t_start_stop_rebalance(Config) -> } ] }}, - api_get(["load_rebalance", "global_status"]) + GlobalStatusResponse ), ?assertMatch( diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl 
b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl index cdafad97a..5d774ba7c 100644 --- a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl +++ b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl @@ -22,21 +22,20 @@ all() -> [{group, one_node}, {group, two_node}]. groups() -> [ - {one_node, [], [ - t_agent_busy, - t_already_started, - t_not_started, - t_start, - t_persistence, - t_unknown_messages - ]}, - {two_node, [], [ - t_conn_evicted, - t_migrate_to, - t_session_evicted - ]} + {one_node, [], one_node_cases()}, + {two_node, [], two_node_cases()} ]. +two_node_cases() -> + [ + t_conn_evicted, + t_migrate_to, + t_session_evicted + ]. + +one_node_cases() -> + emqx_common_test_helpers:all(?MODULE) -- two_node_cases(). + init_per_suite(Config) -> ok = emqx_common_test_helpers:start_apps([]), Config. diff --git a/rel/i18n/emqx_eviction_agent_api.hocon b/rel/i18n/emqx_eviction_agent_api.hocon new file mode 100644 index 000000000..40566fca6 --- /dev/null +++ b/rel/i18n/emqx_eviction_agent_api.hocon @@ -0,0 +1,9 @@ +emqx_eviction_agent_api { + +node_eviction_status_get.desc: +"""Get the node eviction status""" + +node_eviction_status_get.label: +"""Node Eviction Status""" + +} diff --git a/rel/i18n/emqx_node_rebalance_api.hocon b/rel/i18n/emqx_node_rebalance_api.hocon new file mode 100644 index 000000000..51d0fa8bc --- /dev/null +++ b/rel/i18n/emqx_node_rebalance_api.hocon @@ -0,0 +1,267 @@ +emqx_node_rebalance_api { + +load_rebalance_status.desc: +"""Get rebalance status of the current node""" + +load_rebalance_status.label: +"""Get rebalance status""" + +load_rebalance_global_status.desc: +"""Get status of all rebalance/evacuation processes across the cluster""" + +load_rebalance_global_status.label: +"""Get global rebalance status""" + +load_rebalance_availability_check.desc: +"""Check if the node is being evacuated or rebalanced""" + +load_rebalance_availability_check.label: +"""Availability 
check""" + +load_rebalance_start.desc: +"""Start rebalance process""" + +load_rebalance_start.label: +"""Start rebalance""" + +load_rebalance_stop.desc: +"""Stop rebalance process""" + +load_rebalance_stop.label: +"""Stop rebalance""" + +load_rebalance_evacuation_start.desc: +"""Start evacuation process""" + +load_rebalance_evacuation_start.label: +"""Start evacuation""" + +load_rebalance_evacuation_stop.desc: +"""Stop evacuation process""" + +load_rebalance_evacuation_stop.label: +"""Stop evacuation""" + +param_node.desc: +"""Node name""" + +param_node.label: +"""Node name""" + +wait_health_check.desc: +"""Time to wait before starting the rebalance process, in seconds""" + +wait_health_check.label: +"""Wait health check""" + +conn_evict_rate.desc: +"""The rate of evicting connections, in connections per second""" + +conn_evict_rate.label: +"""Connection eviction rate""" + +sess_evict_rate.desc: +"""The rate of evicting sessions, in sessions per second""" + +sess_evict_rate.label: +"""Session eviction rate""" + +abs_conn_threshold.desc: +"""Maximum desired difference between the number of connections on the node and the average number of connections on the recipient nodes""" + +abs_conn_threshold.label: +"""Absolute connection threshold""" + +rel_conn_threshold.desc: +"""Maximum desired fraction between the number of connections on the node and the average number of connections on the recipient nodes""" + +rel_conn_threshold.label: +"""Relative connection threshold""" + +abs_sess_threshold.desc: +"""Maximum desired difference between the number of sessions on the node and the average number of sessions on the recipient nodes""" + +abs_sess_threshold.label: +"""Absolute session threshold""" + +rel_sess_threshold.desc: +"""Maximum desired fraction between the number of sessions on the node and the average number of sessions on the recipient nodes""" + +rel_sess_threshold.label: +"""Relative session threshold""" + +wait_takeover.desc: +"""Time to wait before starting 
session evacuation process, in seconds""" + +wait_takeover.label: +"""Wait takeover""" + +redirect_to.desc: +"""Server reference to redirect clients to (MQTTv5 Server redirection)""" + +redirect_to.label: +"""Redirect to""" + +migrate_to.desc: +"""Nodes to migrate sessions to""" + +migrate_to.label: +"""Migrate to""" + +rebalance_nodes.desc: +"""Nodes to participate in rebalance""" + +rebalance_nodes.label: +"""Rebalance nodes""" + +local_status_enabled.desc: +"""Whether the node is being evacuated""" + +local_status_enabled.label: +"""Local evacuation status""" + +local_status_process.desc: +"""The process that is being performed on the node: evacuation or rebalance""" + +local_status_process.label: +"""Node process""" + +local_status_state.desc: +"""The state of the process that is being performed on the node""" + +local_status_state.label: +"""Rebalance/evacuation current state""" + +local_status_coordinator_node.desc: +"""The node that is coordinating rebalance process""" + +local_status_coordinator_node.label: +"""Coordinator node""" + +local_status_connection_eviction_rate.desc: +"""The rate of evicting connections, in connections per second""" + +local_status_connection_eviction_rate.label: +"""Connection eviction rate""" + +local_status_session_eviction_rate.desc: +"""The rate of evicting sessions, in sessions per second""" + +local_status_session_eviction_rate.label: +"""Session eviction rate""" + +local_status_connection_goal.desc: +"""The number of connections that the node should have after the rebalance/evacuation process""" + +local_status_connection_goal.label: +"""Connection goal""" + +local_status_session_goal.desc: +"""The number of sessions that the node should have after the evacuation process""" + +local_status_session_goal.label: +"""Session goal""" + +local_status_disconnected_session_goal.desc: +"""The number of disconnected sessions that the node should have after the rebalance process""" + +local_status_disconnected_session_goal.label: 
+"""Disconnected session goal""" + +local_status_session_recipients.desc: +"""List of nodes to which sessions are being evacuated""" + +local_status_session_recipients.label: +"""Session recipients""" + +local_status_recipients.desc: +"""List of nodes to which connections/sessions are being evacuated during rebalance""" + +local_status_recipients.label: +"""Recipients""" + +local_status_stats.desc: +"""Statistics of the evacuation/rebalance process""" + +local_status_stats.label: +"""Statistics""" + +status_stats_initial_connected.desc: +"""The number of connections on the node before the evacuation/rebalance process""" + +status_stats_initial_connected.label: +"""Initial connected""" + +status_stats_current_connected.desc: +"""Current number of connections on the node""" + +status_stats_current_connected.label: +"""Current connections""" + +status_stats_initial_sessions.desc: +"""The number of sessions on the node before the evacuation/rebalance process""" + +status_stats_initial_sessions.label: +"""Initial sessions""" + +status_stats_current_sessions.desc: +"""Current number of sessions on the node""" + +status_stats_current_sessions.label: +"""Current sessions""" + +status_stats_current_disconnected_sessions.desc: +"""Current number of disconnected sessions on the node""" + +status_stats_current_disconnected_sessions.label: +"""Current disconnected sessions""" + +coordinator_status_donors.desc: +"""List of nodes from which connections/sessions are being evacuated""" + +coordinator_status_donors.label: +"""Donors""" + +coordinator_status_donor_conn_avg.desc: +"""Average number of connections per donor node""" + +coordinator_status_donor_conn_avg.label: +"""Donor connections average""" + +coordinator_status_donor_sess_avg.desc: +"""Average number of sessions per donor node""" + +coordinator_status_donor_sess_avg.label: +"""Donor sessions average""" + +coordinator_status_node.desc: +"""The node that is coordinating the evacuation/rebalance process""" + 
+coordinator_status_node.label: +"""Coordinator node""" + +evacuation_status_node.desc: +"""The node that is being evacuated""" + +evacuation_status_node.label: +"""Evacuated node""" + +global_status_evacuations.desc: +"""List of nodes that are being evacuated""" + +global_status_evacuations.label: +"""Evacuations""" + +global_status_rebalances.desc: +"""List of nodes that coordinate a rebalance""" + +global_status_rebalances.label: +"""Rebalances""" + +empty_response.desc: +"""The response is empty""" + +empty_response.label: +"""Empty response""" + +} diff --git a/rel/i18n/zh/emqx_eviction_agent_api.hocon b/rel/i18n/zh/emqx_eviction_agent_api.hocon new file mode 100644 index 000000000..a4d9f5c12 --- /dev/null +++ b/rel/i18n/zh/emqx_eviction_agent_api.hocon @@ -0,0 +1,9 @@ +emqx_eviction_agent_api { + +node_eviction_status_get.desc: +"""获取节点驱逐状态""" + +node_eviction_status_get.label: +"""节点驱逐状态""" + +} diff --git a/rel/i18n/zh/emqx_node_rebalance_api.hocon b/rel/i18n/zh/emqx_node_rebalance_api.hocon new file mode 100644 index 000000000..3066158b3 --- /dev/null +++ b/rel/i18n/zh/emqx_node_rebalance_api.hocon @@ -0,0 +1,267 @@ +emqx_node_rebalance_api { + +load_rebalance_status.desc: +"""获取当前节点的rebalance状态""" + +load_rebalance_status.label: +"""获取rebalance状态""" + +load_rebalance_global_status.desc: +"""获取集群中所有rebalance/evacuation进程的状态""" + +load_rebalance_global_status.label: +"""获取全局rebalance状态""" + +load_rebalance_availability_check.desc: +"""检查节点是否正在被evacuate或rebalance""" + +load_rebalance_availability_check.label: +"""可用性检查""" + +load_rebalance_start.desc: +"""启动rebalance进程""" + +load_rebalance_start.label: +"""启动rebalance""" + +load_rebalance_stop.desc: +"""停止rebalance进程""" + +load_rebalance_stop.label: +"""停止rebalance""" + +load_rebalance_evacuation_start.desc: +"""启动evacuation进程""" + +load_rebalance_evacuation_start.label: +"""启动evacuation""" + +load_rebalance_evacuation_stop.desc: +"""停止evacuation进程""" + +load_rebalance_evacuation_stop.label: 
+"""停止evacuation""" + +param_node.desc: +"""节点名称""" + +param_node.label: +"""节点名称""" + +wait_health_check.desc: +"""启动rebalance进程前等待的时间,单位为秒""" + +wait_health_check.label: +"""等待健康检查""" + +conn_evict_rate.desc: +"""逐出连接的速率,以每秒连接数表示""" + +conn_evict_rate.label: +"""连接驱逐率""" + +sess_evict_rate.desc: +"""逐出会话的速率,以每秒会话为单位""" + +sess_evict_rate.label: +"""会话驱逐率""" + +abs_conn_threshold.desc: +"""节点上的连接数与接收节点上的平均连接数之间的最大期望差值""" + +abs_conn_threshold.label: +"""绝对连接阈值""" + +rel_conn_threshold.desc: +"""节点上的连接数与接收节点上的平均连接数之间的最大期望分数""" + +rel_conn_threshold.label: +"""相对连接阈值""" + +abs_sess_threshold.desc: +"""节点上的会话数与接收节点上的平均会话数之间的最大期望差异""" + +abs_sess_threshold.label: +"""绝对会话阈值""" + +rel_sess_threshold.desc: +"""节点上的会话数与接收节点上的平均会话数之间的最大期望分数""" + +rel_sess_threshold.label: +"""相对会话阈值""" + +wait_takeover.desc: +"""开始会话疏散过程之前等待的时间,以秒为单位""" + +wait_takeover.label: +"""等待接管""" + +redirect_to.desc: +"""将客户端重定向到的服务器参考(MQTTv5 服务器重定向)""" + +redirect_to.label: +"""重定向至""" + +migrate_to.desc: +"""将会话迁移到的节点""" + +migrate_to.label: +"""迁移到""" + +rebalance_nodes.desc: +"""参与rebalance的节点""" + +rebalance_nodes.label: +"""重新平衡节点""" + +local_status_enabled.desc: +"""节点是否正在撤离""" + +local_status_enabled.label: +"""当地避难状况""" + +local_status_process.desc: +"""正在节点上执行的过程:疏散或重新平衡""" + +local_status_process.label: +"""节点进程""" + +local_status_state.desc: +"""正在节点上执行的进程的状态""" + +local_status_state.label: +"""重新平衡/疏散当前状态""" + +local_status_coordinator_node.desc: +"""协调再平衡过程的节点""" + +local_status_coordinator_node.label: +"""协调节点""" + +local_status_connection_eviction_rate.desc: +"""逐出连接的速率,以每秒连接数表示""" + +local_status_connection_eviction_rate.label: +"""连接驱逐率""" + +local_status_session_eviction_rate.desc: +"""逐出会话的速率,以每秒会话为单位""" + +local_status_session_eviction_rate.label: +"""会话驱逐率""" + +local_status_connection_goal.desc: +"""节点在重新平衡/疏散过程后应该拥有的连接数""" + +local_status_connection_goal.label: +"""连接目标""" + +local_status_session_goal.desc: +"""疏散过程后节点应有的会话数""" + +local_status_session_goal.label: 
+"""会话目标""" + +local_status_disconnected_session_goal.desc: +"""重新平衡过程后节点应具有的断开连接的会话数""" + +local_status_disconnected_session_goal.label: +"""断开连接的会话目标""" + +local_status_session_recipients.desc: +"""会话被疏散到的节点列表""" + +local_status_session_recipients.label: +"""会话收件人""" + +local_status_recipients.desc: +"""在重新平衡期间连接/会话被疏散到的节点列表""" + +local_status_recipients.label: +"""收件人""" + +local_status_stats.desc: +"""疏散/再平衡过程的统计""" + +local_status_stats.label: +"""统计数据""" + +status_stats_initial_connected.desc: +"""疏散/重新平衡过程之前节点上的连接数""" + +status_stats_initial_connected.label: +"""初始连接""" + +status_stats_current_connected.desc: +"""节点上的当前连接数""" + +status_stats_current_connected.label: +"""当前连接""" + +status_stats_initial_sessions.desc: +"""疏散/重新平衡过程之前节点上的会话数""" + +status_stats_initial_sessions.label: +"""初始会话""" + +status_stats_current_sessions.desc: +"""节点上的当前会话数""" + +status_stats_current_sessions.label: +"""当前会话""" + +status_stats_current_disconnected_sessions.desc: +"""节点上当前断开连接的会话数""" + +status_stats_current_disconnected_sessions.label: +"""当前断开连接的会话""" + +coordinator_status_donors.desc: +"""正在疏散连接/会话的节点列表""" + +coordinator_status_donors.label: +"""捐助者""" + +coordinator_status_donor_conn_avg.desc: +"""每个供体节点的平均连接数""" + +coordinator_status_donor_conn_avg.label: +"""捐助者连接平均值""" + +coordinator_status_donor_sess_avg.desc: +"""每个供体节点的平均会话数""" + +coordinator_status_donor_sess_avg.label: +"""平均捐助会议""" + +coordinator_status_node.desc: +"""协调疏散/再平衡过程的节点""" + +coordinator_status_node.label: +"""协调节点""" + +evacuation_status_node.desc: +"""正在撤离的节点""" + +evacuation_status_node.label: +"""疏散节点""" + +global_status_evacuations.desc: +"""正在撤离的节点列表""" + +global_status_evacuations.label: +"""疏散""" + +global_status_rebalances.desc: +"""协调再平衡的节点列表""" + +global_status_rebalances.label: +"""再平衡""" + +empty_response.desc: +"""响应为空""" + +empty_response.label: +"""空响应""" +} + From ffed8a132bb806a142608db2c40ff5526609bf87 Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Tue, 9 May 2023 14:13:49 
+0500 Subject: [PATCH 3/8] chore(rebalance): review fixes Co-authored-by: Zaiming (Stone) Shi --- lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src | 3 +-- rel/i18n/emqx_node_rebalance_api.hocon | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src index 9673e4fda..381001b87 100644 --- a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src +++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src @@ -14,9 +14,8 @@ {mod, {emqx_node_rebalance_app, []}}, {env, []}, {modules, []}, - {maintainers, ["EMQX Team "]}, {links, [ - {"Homepage", "https://emqx.io/"}, + {"Homepage", "https://www.emqx.com/"}, {"Github", "https://github.com/emqx"} ]} ]}. diff --git a/rel/i18n/emqx_node_rebalance_api.hocon b/rel/i18n/emqx_node_rebalance_api.hocon index 51d0fa8bc..0c9e369be 100644 --- a/rel/i18n/emqx_node_rebalance_api.hocon +++ b/rel/i18n/emqx_node_rebalance_api.hocon @@ -121,10 +121,10 @@ local_status_enabled.label: """Local evacuation status""" local_status_process.desc: -"""The process that is being performed on the node: evacuation or rebalance""" +"""The type of the task that is being performed on the node: 'evacuation' or 'rebalance'""" local_status_process.label: -"""Node process""" +"""Task Type""" local_status_state.desc: """The state of the process that is being performed on the node""" From 48b53b9ca448e75638b8d246e6e6be5106bbcb82 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Mon, 8 May 2023 23:25:32 +0200 Subject: [PATCH 4/8] docs: delete zh changelog --- changes/ee/feat-10075.en.md | 3 ++- changes/ee/feat-10075.zh.md | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 changes/ee/feat-10075.zh.md diff --git a/changes/ee/feat-10075.en.md b/changes/ee/feat-10075.en.md index e6e070ddc..35c3949e3 100644 --- a/changes/ee/feat-10075.en.md +++ b/changes/ee/feat-10075.en.md @@ -1 +1,2 @@ 
-Add node rebalance/node evacuation [functionality](https://github.com/emqx/eip/blob/main/active/0020-node-rebalance.md). +Add node rebalance/node evacuation functionality. +See also: [design doc](https://github.com/emqx/eip/blob/main/active/0020-node-rebalance.md) diff --git a/changes/ee/feat-10075.zh.md b/changes/ee/feat-10075.zh.md deleted file mode 100644 index 36c78acb8..000000000 --- a/changes/ee/feat-10075.zh.md +++ /dev/null @@ -1 +0,0 @@ -添加节点再平衡/节点疏散[功能](https://github.com/emqx/eip/blob/main/active/0020-node-rebalance.md)。 From 9e6cc0d110f17de88b1725cc10af873a9acab569 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Mon, 8 May 2023 23:28:06 +0200 Subject: [PATCH 5/8] docs: delete APL header from ee file --- .../include/emqx_node_rebalance.hrl | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/lib-ee/emqx_node_rebalance/include/emqx_node_rebalance.hrl b/lib-ee/emqx_node_rebalance/include/emqx_node_rebalance.hrl index ccc671e81..7d7bc439e 100644 --- a/lib-ee/emqx_node_rebalance/include/emqx_node_rebalance.hrl +++ b/lib-ee/emqx_node_rebalance/include/emqx_node_rebalance.hrl @@ -1,17 +1,5 @@ %%-------------------------------------------------------------------- %% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. -%% -%% Licensed under the Apache License, Version 2.0 (the "License"); -%% you may not use this file except in compliance with the License. -%% You may obtain a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, software -%% distributed under the License is distributed on an "AS IS" BASIS, -%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -%% See the License for the specific language governing permissions and -%% limitations under the License. %%-------------------------------------------------------------------- -define(DEFAULT_CONN_EVICT_RATE, 500). 
From 3ff04d51bc5c427aea0e00d1d21359996c2613bd Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Tue, 9 May 2023 00:09:16 +0200 Subject: [PATCH 6/8] docs: refine zh tr --- rel/i18n/emqx_node_rebalance_api.hocon | 8 +- rel/i18n/zh/emqx_node_rebalance_api.hocon | 133 +++++++++++----------- 2 files changed, 70 insertions(+), 71 deletions(-) diff --git a/rel/i18n/emqx_node_rebalance_api.hocon b/rel/i18n/emqx_node_rebalance_api.hocon index 0c9e369be..bb67f2aad 100644 --- a/rel/i18n/emqx_node_rebalance_api.hocon +++ b/rel/i18n/emqx_node_rebalance_api.hocon @@ -67,25 +67,25 @@ sess_evict_rate.label: """Session eviction rate""" abs_conn_threshold.desc: -"""Maximum desired difference between the number of connections on the node and the average number of connections on the recipient nodes""" +"""Maximum desired difference between the number of connections on the node and the average number of connections on the recipient nodes. Difference lower than this is the goal of the rebalance process.""" abs_conn_threshold.label: """Absolute connection threshold""" rel_conn_threshold.desc: -"""Maximum desired fraction between the number of connections on the node and the average number of connections on the recipient nodes""" +"""Maximum desired fraction between the number of connections on the node and the average number of connections on the recipient nodes. Fraction lower than this is the goal of the rebalance process.""" rel_conn_threshold.label: """Relative connection threshold""" abs_sess_threshold.desc: -"""Maximum desired difference between the number of sessions on the node and the average number of sessions on the recipient nodes""" +"""Maximum desired difference between the number of sessions on the node and the average number of sessions on the recipient nodes. 
 Difference lower than this is the goal of the evacuation process.""" abs_sess_threshold.label: """Absolute session threshold""" rel_sess_threshold.desc: -"""Maximum desired fraction between the number of sessions on the node and the average number of sessions on the recipient nodes""" +"""Maximum desired fraction between the number of sessions on the node and the average number of sessions on the recipient nodes. Fraction lower than this is the goal of the evacuation process.""" rel_sess_threshold.label: """Relative session threshold""" diff --git a/rel/i18n/zh/emqx_node_rebalance_api.hocon b/rel/i18n/zh/emqx_node_rebalance_api.hocon index 3066158b3..5f6753aff 100644 --- a/rel/i18n/zh/emqx_node_rebalance_api.hocon +++ b/rel/i18n/zh/emqx_node_rebalance_api.hocon @@ -1,46 +1,46 @@ emqx_node_rebalance_api { load_rebalance_status.desc: -"""获取当前节点的rebalance状态""" +"""获取当前节点的重平衡状态""" load_rebalance_status.label: -"""获取rebalance状态""" +"""获取重平衡状态""" load_rebalance_global_status.desc: -"""获取集群中所有rebalance/evacuation进程的状态""" +"""获取集群中所有重平衡/疏散任务的状态""" load_rebalance_global_status.label: -"""获取全局rebalance状态""" +"""获取全局重平衡状态""" load_rebalance_availability_check.desc: -"""检查节点是否正在被evacuate或rebalance""" +"""检查节点是否正在被执行重平衡或疏散""" load_rebalance_availability_check.label: """可用性检查""" load_rebalance_start.desc: -"""启动rebalance进程""" +"""启动重平衡任务""" load_rebalance_start.label: -"""启动rebalance""" +"""启动重平衡""" load_rebalance_stop.desc: -"""停止rebalance进程""" +"""停止重平衡任务""" load_rebalance_stop.label: -"""停止rebalance""" +"""停止重平衡""" load_rebalance_evacuation_start.desc: -"""启动evacuation进程""" +"""启动疏散任务""" load_rebalance_evacuation_start.label: -"""启动evacuation""" +"""启动疏散""" load_rebalance_evacuation_stop.desc: -"""停止evacuation进程""" +"""停止疏散任务""" load_rebalance_evacuation_stop.label: -"""停止evacuation""" +"""停止疏散""" param_node.desc: """节点名称""" @@ -49,49 +49,49 @@ param_node.label: """节点名称""" wait_health_check.desc: -"""启动rebalance进程前等待的时间,单位为秒""" +"""启动重平衡任务前等待的时间,单位为秒""" wait_health_check.label: 
"""等待健康检查""" conn_evict_rate.desc: -"""逐出连接的速率,以每秒连接数表示""" +"""每秒迁出连接数""" conn_evict_rate.label: -"""连接驱逐率""" +"""迁出速率""" sess_evict_rate.desc: -"""逐出会话的速率,以每秒会话为单位""" +"""每秒迁出会话数""" sess_evict_rate.label: -"""会话驱逐率""" +"""会话迁出速率""" abs_conn_threshold.desc: -"""节点上的连接数与接收节点上的平均连接数之间的最大期望差值""" +"""当前节点上的连接数与迁入节点上的平均连接数的差值(绝对值)上限,低于该差值时停止迁移连接。""" abs_conn_threshold.label: -"""绝对连接阈值""" +"""连接数差值""" rel_conn_threshold.desc: -"""节点上的连接数与接收节点上的平均连接数之间的最大期望分数""" +"""当前节点上的连接数与迁入节点上的平均连接数的比值上限,低于该比值时停止迁移连接。""" rel_conn_threshold.label: -"""相对连接阈值""" +"""连接数比值""" abs_sess_threshold.desc: -"""节点上的会话数与接收节点上的平均会话数之间的最大期望差异""" +"""当前节点上的会话数与迁入节点上的平均会话数之间的差值(绝对值)上限,低于该差值时停止迁移会话。""" abs_sess_threshold.label: -"""绝对会话阈值""" +"""会话数差值""" rel_sess_threshold.desc: -"""节点上的会话数与接收节点上的平均会话数之间的最大期望分数""" +"""当前节点上的会话数与迁入节点上的平均会话数的比值上限,低于该比值时停止迁移会话。""" rel_sess_threshold.label: -"""相对会话阈值""" +"""会话数比值""" wait_takeover.desc: -"""开始会话疏散过程之前等待的时间,以秒为单位""" +"""开始会话疏散任务之前的等待时间,以秒为单位""" wait_takeover.label: """等待接管""" @@ -103,91 +103,91 @@ redirect_to.label: """重定向至""" migrate_to.desc: -"""将会话迁移到的节点""" +"""接受会话迁入的节点""" migrate_to.label: -"""迁移到""" +"""迁入节点""" rebalance_nodes.desc: -"""参与rebalance的节点""" +"""参与重平衡的节点""" rebalance_nodes.label: """重新平衡节点""" local_status_enabled.desc: -"""节点是否正在撤离""" +"""节点是否正在执行重平衡疏散任务""" local_status_enabled.label: -"""当地避难状况""" +"""运行状态""" local_status_process.desc: -"""正在节点上执行的过程:疏散或重新平衡""" +"""正在节点上执行的任务:'evacuation' 或 'rebalance'""" local_status_process.label: -"""节点进程""" +"""节点任务""" local_status_state.desc: -"""正在节点上执行的进程的状态""" +"""正在节点上执行的任务的状态""" local_status_state.label: """重新平衡/疏散当前状态""" local_status_coordinator_node.desc: -"""协调再平衡过程的节点""" +"""协调分配重平衡任务的节点""" local_status_coordinator_node.label: """协调节点""" local_status_connection_eviction_rate.desc: -"""逐出连接的速率,以每秒连接数表示""" +"""每秒迁出的连接数""" local_status_connection_eviction_rate.label: -"""连接驱逐率""" +"""连接迁出速率""" local_status_session_eviction_rate.desc: -"""逐出会话的速率,以每秒会话为单位""" +"""每秒迁出的会话数""" 
local_status_session_eviction_rate.label: -"""会话驱逐率""" +"""会话迁出速率""" local_status_connection_goal.desc: -"""节点在重新平衡/疏散过程后应该拥有的连接数""" +"""节点在重新平衡/疏散任务完成后预期拥有的连接数""" local_status_connection_goal.label: -"""连接目标""" +"""连接数目标""" local_status_session_goal.desc: -"""疏散过程后节点应有的会话数""" +"""疏散任务完成后节点预期的会话数""" local_status_session_goal.label: -"""会话目标""" +"""会话数目标""" local_status_disconnected_session_goal.desc: -"""重新平衡过程后节点应具有的断开连接的会话数""" +"""重新平衡任务完成后节点预期的无连接的会话数""" local_status_disconnected_session_goal.label: -"""断开连接的会话目标""" +"""预期无连接会话数""" local_status_session_recipients.desc: -"""会话被疏散到的节点列表""" +"""会话被迁入的节点列表""" local_status_session_recipients.label: -"""会话收件人""" +"""会话迁入节点""" local_status_recipients.desc: -"""在重新平衡期间连接/会话被疏散到的节点列表""" +"""在重新平衡期间接受连接/会话迁入的节点列表""" local_status_recipients.label: -"""收件人""" +"""接受迁入节点""" local_status_stats.desc: -"""疏散/再平衡过程的统计""" +"""疏散/重平衡的统计""" local_status_stats.label: """统计数据""" status_stats_initial_connected.desc: -"""疏散/重新平衡过程之前节点上的连接数""" +"""疏散/重新平衡任务开始之前节点上的连接数""" status_stats_initial_connected.label: """初始连接""" @@ -199,7 +199,7 @@ status_stats_current_connected.label: """当前连接""" status_stats_initial_sessions.desc: -"""疏散/重新平衡过程之前节点上的会话数""" +"""疏散/重新平衡任务开始之前节点上的会话数""" status_stats_initial_sessions.label: """初始会话""" @@ -211,52 +211,52 @@ status_stats_current_sessions.label: """当前会话""" status_stats_current_disconnected_sessions.desc: -"""节点上当前断开连接的会话数""" +"""节点上当前无连接的会话数""" status_stats_current_disconnected_sessions.label: -"""当前断开连接的会话""" +"""当前无连接会话""" coordinator_status_donors.desc: -"""正在疏散连接/会话的节点列表""" +"""正在迁出连接/会话的节点列表""" coordinator_status_donors.label: -"""捐助者""" +"""迁出节点""" coordinator_status_donor_conn_avg.desc: -"""每个供体节点的平均连接数""" +"""每个迁出节点的平均连接数""" coordinator_status_donor_conn_avg.label: -"""捐助者连接平均值""" +"""迁出节点连接平均值""" coordinator_status_donor_sess_avg.desc: -"""每个供体节点的平均会话数""" +"""每个迁出节点的平均会话数""" coordinator_status_donor_sess_avg.label: -"""平均捐助会议""" +"""迁出节点会话平均数""" coordinator_status_node.desc: 
-"""协调疏散/再平衡过程的节点""" +"""协调分配疏散/重平衡任务的节点""" coordinator_status_node.label: """协调节点""" evacuation_status_node.desc: -"""正在撤离的节点""" +"""正在迁出的节点""" evacuation_status_node.label: """疏散节点""" global_status_evacuations.desc: -"""正在撤离的节点列表""" +"""正在迁出的节点列表""" global_status_evacuations.label: """疏散""" global_status_rebalances.desc: -"""协调再平衡的节点列表""" +"""协调重平衡的节点列表""" global_status_rebalances.label: -"""再平衡""" +"""重平衡""" empty_response.desc: """响应为空""" @@ -264,4 +264,3 @@ empty_response.desc: empty_response.label: """空响应""" } - From 8d83dc12e71d67957a67a6db2ce042ab6dae9c71 Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Tue, 9 May 2023 20:47:23 +0500 Subject: [PATCH 7/8] chore(rebalance): move apps from lib-ee, add READMEs --- apps/emqx_eviction_agent/BSL.txt | 94 +++++++++++++++++++ apps/emqx_eviction_agent/README.md | 35 +++++++ .../etc/emqx_eviction_agent.conf | 0 .../emqx_eviction_agent/rebar.config | 0 .../src/emqx_eviction_agent.app.src | 0 .../src/emqx_eviction_agent.appup.src | 0 .../src/emqx_eviction_agent.erl | 0 .../src/emqx_eviction_agent_api.erl | 0 .../src/emqx_eviction_agent_app.erl | 0 .../src/emqx_eviction_agent_channel.erl | 0 .../src/emqx_eviction_agent_cli.erl | 0 .../src/emqx_eviction_agent_conn_sup.erl | 0 .../src/emqx_eviction_agent_sup.erl | 0 .../proto/emqx_eviction_agent_proto_v1.erl | 0 .../test/emqx_eviction_agent_SUITE.erl | 0 .../test/emqx_eviction_agent_api_SUITE.erl | 0 .../emqx_eviction_agent_channel_SUITE.erl | 0 .../test/emqx_eviction_agent_cli_SUITE.erl | 0 .../test/emqx_eviction_agent_test_helpers.erl | 0 apps/emqx_node_rebalance/BSL.txt | 94 +++++++++++++++++++ apps/emqx_node_rebalance/README.md | 40 ++++++++ .../etc/emqx_node_rebalance.conf | 0 .../include/emqx_node_rebalance.hrl | 0 .../emqx_node_rebalance/rebar.config | 0 .../src/emqx_node_rebalance.app.src | 0 .../src/emqx_node_rebalance.appup.src | 0 .../src/emqx_node_rebalance.erl | 0 .../src/emqx_node_rebalance_agent.erl | 0 .../src/emqx_node_rebalance_api.erl | 0 
.../src/emqx_node_rebalance_app.erl | 0 .../src/emqx_node_rebalance_cli.erl | 0 .../src/emqx_node_rebalance_evacuation.erl | 0 ...emqx_node_rebalance_evacuation_persist.erl | 0 .../src/emqx_node_rebalance_status.erl | 0 .../src/emqx_node_rebalance_sup.erl | 0 .../emqx_node_rebalance_api_proto_v1.erl | 0 ...mqx_node_rebalance_evacuation_proto_v1.erl | 0 .../proto/emqx_node_rebalance_proto_v1.erl | 0 .../emqx_node_rebalance_status_proto_v1.erl | 0 .../test/emqx_node_rebalance_SUITE.erl | 0 .../test/emqx_node_rebalance_agent_SUITE.erl | 0 .../test/emqx_node_rebalance_api_SUITE.erl | 0 .../test/emqx_node_rebalance_cli_SUITE.erl | 0 .../emqx_node_rebalance_evacuation_SUITE.erl | 0 ...ode_rebalance_evacuation_persist_SUITE.erl | 0 lib-ee/emqx_eviction_agent/README.md | 9 -- lib-ee/emqx_node_rebalance/README.md | 9 -- 47 files changed, 263 insertions(+), 18 deletions(-) create mode 100644 apps/emqx_eviction_agent/BSL.txt create mode 100644 apps/emqx_eviction_agent/README.md rename {lib-ee => apps}/emqx_eviction_agent/etc/emqx_eviction_agent.conf (100%) rename {lib-ee => apps}/emqx_eviction_agent/rebar.config (100%) rename {lib-ee => apps}/emqx_eviction_agent/src/emqx_eviction_agent.app.src (100%) rename {lib-ee => apps}/emqx_eviction_agent/src/emqx_eviction_agent.appup.src (100%) rename {lib-ee => apps}/emqx_eviction_agent/src/emqx_eviction_agent.erl (100%) rename {lib-ee => apps}/emqx_eviction_agent/src/emqx_eviction_agent_api.erl (100%) rename {lib-ee => apps}/emqx_eviction_agent/src/emqx_eviction_agent_app.erl (100%) rename {lib-ee => apps}/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl (100%) rename {lib-ee => apps}/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl (100%) rename {lib-ee => apps}/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl (100%) rename {lib-ee => apps}/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl (100%) rename {lib-ee => apps}/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v1.erl (100%) rename {lib-ee => 
apps}/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl (100%) rename {lib-ee => apps}/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl (100%) rename {lib-ee => apps}/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl (100%) rename {lib-ee => apps}/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl (100%) rename {lib-ee => apps}/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl (100%) create mode 100644 apps/emqx_node_rebalance/BSL.txt create mode 100644 apps/emqx_node_rebalance/README.md rename {lib-ee => apps}/emqx_node_rebalance/etc/emqx_node_rebalance.conf (100%) rename {lib-ee => apps}/emqx_node_rebalance/include/emqx_node_rebalance.hrl (100%) rename {lib-ee => apps}/emqx_node_rebalance/rebar.config (100%) rename {lib-ee => apps}/emqx_node_rebalance/src/emqx_node_rebalance.app.src (100%) rename {lib-ee => apps}/emqx_node_rebalance/src/emqx_node_rebalance.appup.src (100%) rename {lib-ee => apps}/emqx_node_rebalance/src/emqx_node_rebalance.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/src/emqx_node_rebalance_api.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/src/emqx_node_rebalance_app.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/src/emqx_node_rebalance_status.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v1.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/src/proto/emqx_node_rebalance_evacuation_proto_v1.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v1.erl 
(100%) rename {lib-ee => apps}/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v1.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl (100%) rename {lib-ee => apps}/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl (100%) delete mode 100644 lib-ee/emqx_eviction_agent/README.md delete mode 100644 lib-ee/emqx_node_rebalance/README.md diff --git a/apps/emqx_eviction_agent/BSL.txt b/apps/emqx_eviction_agent/BSL.txt new file mode 100644 index 000000000..0acc0e696 --- /dev/null +++ b/apps/emqx_eviction_agent/BSL.txt @@ -0,0 +1,94 @@ +Business Source License 1.1 + +Licensor: Hangzhou EMQ Technologies Co., Ltd. +Licensed Work: EMQX Enterprise Edition + The Licensed Work is (c) 2023 + Hangzhou EMQ Technologies Co., Ltd. +Additional Use Grant: Students and educators are granted right to copy, + modify, and create derivative work for research + or education. +Change Date: 2027-02-01 +Change License: Apache License, Version 2.0 + +For information about alternative licensing arrangements for the Software, +please contact Licensor: https://www.emqx.com/en/contact + +Notice + +The Business Source License (this document, or the “License”) is not an Open +Source license. However, the Licensed Work will eventually be made available +under an Open Source License, as stated in this License. + +License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved. +“Business Source License” is a trademark of MariaDB Corporation Ab. 
+ +----------------------------------------------------------------------------- + +Business Source License 1.1 + +Terms + +The Licensor hereby grants you the right to copy, modify, create derivative +works, redistribute, and make non-production use of the Licensed Work. The +Licensor may make an Additional Use Grant, above, permitting limited +production use. + +Effective on the Change Date, or the fourth anniversary of the first publicly +available distribution of a specific version of the Licensed Work under this +License, whichever comes first, the Licensor hereby grants you rights under +the terms of the Change License, and the rights granted in the paragraph +above terminate. + +If your use of the Licensed Work does not comply with the requirements +currently in effect as described in this License, you must purchase a +commercial license from the Licensor, its affiliated entities, or authorized +resellers, or you must refrain from using the Licensed Work. + +All copies of the original and modified Licensed Work, and derivative works +of the Licensed Work, are subject to this License. This License applies +separately for each version of the Licensed Work and the Change Date may vary +for each version of the Licensed Work released by Licensor. + +You must conspicuously display this License on each original or modified copy +of the Licensed Work. If you receive the Licensed Work in original or +modified form from a third party, the terms and conditions set forth in this +License apply to your use of that work. + +Any use of the Licensed Work in violation of this License will automatically +terminate your rights under this License for the current and all other +versions of the Licensed Work. + +This License does not grant you any right in any trademark or logo of +Licensor or its affiliates (provided that you may use a trademark or logo of +Licensor as expressly required by this License). 
+ +TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON +AN “AS IS” BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, +EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND +TITLE. + +MariaDB hereby grants you permission to use this License’s text to license +your works, and to refer to it using the trademark “Business Source License”, +as long as you comply with the Covenants of Licensor below. + +Covenants of Licensor + +In consideration of the right to use this License’s text and the “Business +Source License” name and trademark, Licensor covenants to MariaDB, and to all +other recipients of the licensed work to be provided by Licensor: + +1. To specify as the Change License the GPL Version 2.0 or any later version, + or a license that is compatible with GPL Version 2.0 or a later version, + where “compatible” means that software provided under the Change License can + be included in a program with software provided under GPL Version 2.0 or a + later version. Licensor may specify additional Change Licenses without + limitation. + +2. To either: (a) specify an additional grant of rights to use that does not + impose any additional restriction on the right granted in this License, as + the Additional Use Grant; or (b) insert the text “None”. + +3. To specify a Change Date. + +4. Not to modify this License in any other way. diff --git a/apps/emqx_eviction_agent/README.md b/apps/emqx_eviction_agent/README.md new file mode 100644 index 000000000..943bd7d12 --- /dev/null +++ b/apps/emqx_eviction_agent/README.md @@ -0,0 +1,35 @@ +# EMQX Eviction Agent + +`emqx_eviction_agent` is a part of the node evacuation/node rebalance feature in EMQX. +It is a low-level application that encapsulates working with actual MQTT connections. 
+ +## Application Responsibilities + +`emqx_eviction_agent` application: + +* Blocks incoming connections to the node it is running on. +* Serves as a facade for connection/session eviction operations. +* Reports blocking status via HTTP API. + +The `emqx_eviction_agent` is relatively passive and has no eviction/rebalancing logic. It allows +`emqx_node_rebalance` to perform eviction/rebalancing operations using a high-level API, without having to deal with +MQTT connections directly. + +## EMQX Integration + +`emqx_eviction_agent` interacts with the following EMQX components: +* `emqx_cm` - to get the list of active MQTT connections; +* `emqx_hooks` subsystem - to block/unblock incoming connections; +* `emqx_channel` and the corresponding connection modules to perform the eviction. + +## User Facing API + +The application provides a very simple API (CLI and HTTP) to inspect the current blocking status. + +# Documentation + +The rebalancing concept is described in the corresponding [EIP](https://github.com/emqx/eip/blob/main/active/0020-node-rebalance.md). + +# Contributing + +Please see our [contributing.md](../../CONTRIBUTING.md). 
diff --git a/lib-ee/emqx_eviction_agent/etc/emqx_eviction_agent.conf b/apps/emqx_eviction_agent/etc/emqx_eviction_agent.conf similarity index 100% rename from lib-ee/emqx_eviction_agent/etc/emqx_eviction_agent.conf rename to apps/emqx_eviction_agent/etc/emqx_eviction_agent.conf diff --git a/lib-ee/emqx_eviction_agent/rebar.config b/apps/emqx_eviction_agent/rebar.config similarity index 100% rename from lib-ee/emqx_eviction_agent/rebar.config rename to apps/emqx_eviction_agent/rebar.config diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.app.src b/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src similarity index 100% rename from lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.app.src rename to apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.appup.src b/apps/emqx_eviction_agent/src/emqx_eviction_agent.appup.src similarity index 100% rename from lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.appup.src rename to apps/emqx_eviction_agent/src/emqx_eviction_agent.appup.src diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent.erl similarity index 100% rename from lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.erl rename to apps/emqx_eviction_agent/src/emqx_eviction_agent.erl diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_api.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent_api.erl similarity index 100% rename from lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_api.erl rename to apps/emqx_eviction_agent/src/emqx_eviction_agent_api.erl diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_app.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent_app.erl similarity index 100% rename from lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_app.erl rename to apps/emqx_eviction_agent/src/emqx_eviction_agent_app.erl diff --git 
a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl similarity index 100% rename from lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl rename to apps/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl similarity index 100% rename from lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl rename to apps/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl similarity index 100% rename from lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl rename to apps/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl similarity index 100% rename from lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl rename to apps/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl diff --git a/lib-ee/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v1.erl b/apps/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v1.erl similarity index 100% rename from lib-ee/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v1.erl rename to apps/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v1.erl diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl b/apps/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl similarity index 100% rename from lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl rename to apps/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl b/apps/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl similarity index 100% 
rename from lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl rename to apps/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl b/apps/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl similarity index 100% rename from lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl rename to apps/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl b/apps/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl similarity index 100% rename from lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl rename to apps/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl b/apps/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl similarity index 100% rename from lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl rename to apps/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl diff --git a/apps/emqx_node_rebalance/BSL.txt b/apps/emqx_node_rebalance/BSL.txt new file mode 100644 index 000000000..0acc0e696 --- /dev/null +++ b/apps/emqx_node_rebalance/BSL.txt @@ -0,0 +1,94 @@ +Business Source License 1.1 + +Licensor: Hangzhou EMQ Technologies Co., Ltd. +Licensed Work: EMQX Enterprise Edition + The Licensed Work is (c) 2023 + Hangzhou EMQ Technologies Co., Ltd. +Additional Use Grant: Students and educators are granted right to copy, + modify, and create derivative work for research + or education. +Change Date: 2027-02-01 +Change License: Apache License, Version 2.0 + +For information about alternative licensing arrangements for the Software, +please contact Licensor: https://www.emqx.com/en/contact + +Notice + +The Business Source License (this document, or the “License”) is not an Open +Source license. 
However, the Licensed Work will eventually be made available +under an Open Source License, as stated in this License. + +License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved. +“Business Source License” is a trademark of MariaDB Corporation Ab. + +----------------------------------------------------------------------------- + +Business Source License 1.1 + +Terms + +The Licensor hereby grants you the right to copy, modify, create derivative +works, redistribute, and make non-production use of the Licensed Work. The +Licensor may make an Additional Use Grant, above, permitting limited +production use. + +Effective on the Change Date, or the fourth anniversary of the first publicly +available distribution of a specific version of the Licensed Work under this +License, whichever comes first, the Licensor hereby grants you rights under +the terms of the Change License, and the rights granted in the paragraph +above terminate. + +If your use of the Licensed Work does not comply with the requirements +currently in effect as described in this License, you must purchase a +commercial license from the Licensor, its affiliated entities, or authorized +resellers, or you must refrain from using the Licensed Work. + +All copies of the original and modified Licensed Work, and derivative works +of the Licensed Work, are subject to this License. This License applies +separately for each version of the Licensed Work and the Change Date may vary +for each version of the Licensed Work released by Licensor. + +You must conspicuously display this License on each original or modified copy +of the Licensed Work. If you receive the Licensed Work in original or +modified form from a third party, the terms and conditions set forth in this +License apply to your use of that work. + +Any use of the Licensed Work in violation of this License will automatically +terminate your rights under this License for the current and all other +versions of the Licensed Work. 
+ +This License does not grant you any right in any trademark or logo of +Licensor or its affiliates (provided that you may use a trademark or logo of +Licensor as expressly required by this License). + +TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON +AN “AS IS” BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, +EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND +TITLE. + +MariaDB hereby grants you permission to use this License’s text to license +your works, and to refer to it using the trademark “Business Source License”, +as long as you comply with the Covenants of Licensor below. + +Covenants of Licensor + +In consideration of the right to use this License’s text and the “Business +Source License” name and trademark, Licensor covenants to MariaDB, and to all +other recipients of the licensed work to be provided by Licensor: + +1. To specify as the Change License the GPL Version 2.0 or any later version, + or a license that is compatible with GPL Version 2.0 or a later version, + where “compatible” means that software provided under the Change License can + be included in a program with software provided under GPL Version 2.0 or a + later version. Licensor may specify additional Change Licenses without + limitation. + +2. To either: (a) specify an additional grant of rights to use that does not + impose any additional restriction on the right granted in this License, as + the Additional Use Grant; or (b) insert the text “None”. + +3. To specify a Change Date. + +4. Not to modify this License in any other way. diff --git a/apps/emqx_node_rebalance/README.md b/apps/emqx_node_rebalance/README.md new file mode 100644 index 000000000..8a384fb5d --- /dev/null +++ b/apps/emqx_node_rebalance/README.md @@ -0,0 +1,40 @@ +# EMQX Node Rebalance + +`emqx_node_rebalance` is a part of the node evacuation/node rebalance feature in EMQX. 
+It implements high-level scenarios for node evacuation and rebalancing. + +## Application Responsibilities + +`emqx_node_rebalance` application's core concept is a _rebalance coordinator_. +_Rebalance coordinator_ is an entity that implements the rebalancing logic and orchestrates the rebalancing process. +In particular, it: + +* Enables/Disables Eviction Agent on nodes. +* Sends connection/session eviction commands to Eviction Agents according to the evacuation logic. + +We have two implementations of the _rebalance coordinator_: +* `emqx_node_rebalance` - a coordinator that implements node rebalancing; +* `emqx_node_rebalance_evacuation` - a coordinator that implements node evacuation. + +## EMQX Integration + +`emqx_node_rebalance` is a high-level application that is loosely coupled with the rest of the system. +It uses Eviction Agent to perform the required operations. + +## User Facing API + +The application provides an API (CLI and HTTP) to perform the following operations: +* Start/Stop rebalancing across a set of nodes or the whole cluster; +* Start/Stop evacuation of a node; +* Get the current rebalancing status of a local node; +* Get the current rebalancing status of the whole cluster. + +Also, an HTTP endpoint is provided for liveness probes. + +# Documentation + +The rebalancing concept is described in the corresponding [EIP](https://github.com/emqx/eip/blob/main/active/0020-node-rebalance.md). + +# Contributing + +Please see our [contributing.md](../../CONTRIBUTING.md). 
diff --git a/lib-ee/emqx_node_rebalance/etc/emqx_node_rebalance.conf b/apps/emqx_node_rebalance/etc/emqx_node_rebalance.conf similarity index 100% rename from lib-ee/emqx_node_rebalance/etc/emqx_node_rebalance.conf rename to apps/emqx_node_rebalance/etc/emqx_node_rebalance.conf diff --git a/lib-ee/emqx_node_rebalance/include/emqx_node_rebalance.hrl b/apps/emqx_node_rebalance/include/emqx_node_rebalance.hrl similarity index 100% rename from lib-ee/emqx_node_rebalance/include/emqx_node_rebalance.hrl rename to apps/emqx_node_rebalance/include/emqx_node_rebalance.hrl diff --git a/lib-ee/emqx_node_rebalance/rebar.config b/apps/emqx_node_rebalance/rebar.config similarity index 100% rename from lib-ee/emqx_node_rebalance/rebar.config rename to apps/emqx_node_rebalance/rebar.config diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src b/apps/emqx_node_rebalance/src/emqx_node_rebalance.app.src similarity index 100% rename from lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src rename to apps/emqx_node_rebalance/src/emqx_node_rebalance.app.src diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.appup.src b/apps/emqx_node_rebalance/src/emqx_node_rebalance.appup.src similarity index 100% rename from lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.appup.src rename to apps/emqx_node_rebalance/src/emqx_node_rebalance.appup.src diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.erl rename to apps/emqx_node_rebalance/src/emqx_node_rebalance.erl diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl rename to apps/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl diff --git 
a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_api.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_api.erl rename to apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_app.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_app.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_app.erl rename to apps/emqx_node_rebalance/src/emqx_node_rebalance_app.erl diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl rename to apps/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl rename to apps/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl rename to apps/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_status.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_status.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_status.erl rename to apps/emqx_node_rebalance/src/emqx_node_rebalance_status.erl diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl similarity index 100% rename from 
lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl rename to apps/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl diff --git a/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v1.erl b/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v1.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v1.erl rename to apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v1.erl diff --git a/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_evacuation_proto_v1.erl b/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_evacuation_proto_v1.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_evacuation_proto_v1.erl rename to apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_evacuation_proto_v1.erl diff --git a/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v1.erl b/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v1.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v1.erl rename to apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v1.erl diff --git a/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v1.erl b/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v1.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v1.erl rename to apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v1.erl diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl rename to apps/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl 
b/apps/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl rename to apps/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl rename to apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl rename to apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl rename to apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl similarity index 100% rename from lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl rename to apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl diff --git a/lib-ee/emqx_eviction_agent/README.md b/lib-ee/emqx_eviction_agent/README.md deleted file mode 100644 index f9b8037bf..000000000 --- a/lib-ee/emqx_eviction_agent/README.md +++ /dev/null @@ -1,9 +0,0 @@ -emqx_eviction_agent -===== - -An OTP library - -Build ------ - - $ rebar3 compile diff --git a/lib-ee/emqx_node_rebalance/README.md b/lib-ee/emqx_node_rebalance/README.md 
deleted file mode 100644 index 2e56f62cd..000000000 --- a/lib-ee/emqx_node_rebalance/README.md +++ /dev/null @@ -1,9 +0,0 @@ -emqx_node_rebalance -===== - -An OTP library - -Build ------ - - $ rebar3 compile From 61deda3ea6c3839d30d30716cd6d72acfcbf24e0 Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Tue, 9 May 2023 21:05:20 +0500 Subject: [PATCH 8/8] chore(rebalance): fix app metadata --- apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src b/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src index a360133f4..239d9052e 100644 --- a/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src +++ b/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src @@ -14,9 +14,8 @@ {mod, {emqx_eviction_agent_app, []}}, {env, []}, {modules, []}, - {maintainers, ["EMQX Team "]}, {links, [ - {"Homepage", "https://emqx.io/"}, + {"Homepage", "https://www.emqx.com/"}, {"Github", "https://github.com/emqx"} ]} ]}.