feat(rebalance): port apps from 4.x

This commit is contained in:
Ilya Averyanov 2023-02-17 00:16:29 +02:00
parent 954eef8f39
commit 609f7bd8fd
59 changed files with 6686 additions and 43 deletions

View File

@ -179,6 +179,7 @@ clean-all:
@rm -f rebar.lock
@rm -rf deps
@rm -rf _build
@rm -f emqx_dialyzer_*_plt
.PHONY: deps-all
deps-all: $(REBAR) $(PROFILES:%=deps-%)

View File

@ -0,0 +1,31 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
%% This file contains common macros for testing.
%% It must not be used anywhere except in test suites.
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
%% Run `Code' through ?wait_async_action/3 and assert that the result matches
%% `{_, {ok, EventMatch}}'.
%% `EventMatch' is used twice: it is handed to ?wait_async_action as the event
%% to wait for, and it is the pattern the reported event must match.
%% `Timeout' is passed to ?wait_async_action unchanged.
-define(assertWaitEvent(Code, EventMatch, Timeout),
?assertMatch(
{_, {ok, EventMatch}},
?wait_async_action(
Code,
EventMatch,
Timeout
)
)
).

View File

@ -0,0 +1,42 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2017-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
%% Names of the per-channel packet/message counters: received and sent
%% packets and messages, with per-QoS and per-drop-reason breakdowns.
-define(CHANNEL_METRICS, [
recv_pkt,
recv_msg,
'recv_msg.qos0',
'recv_msg.qos1',
'recv_msg.qos2',
'recv_msg.dropped',
'recv_msg.dropped.await_pubrel_timeout',
send_pkt,
send_msg,
'send_msg.qos0',
'send_msg.qos1',
'send_msg.qos2',
'send_msg.dropped',
'send_msg.dropped.expired',
'send_msg.dropped.queue_full',
'send_msg.dropped.too_large'
]).
%% Keys of the channel info sections: connection info, connection state,
%% client info, session and will message.
%% NOTE(review): presumably the key set reported by a channel's info
%% function - confirm against the users of this macro.
-define(INFO_KEYS, [
conninfo,
conn_state,
clientinfo,
session,
will_msg
]).

View File

@ -34,6 +34,7 @@
-define(HP_BRIDGE, 870).
-define(HP_DELAY_PUB, 860).
%% apps that can stop the hooks chain from continuing
-define(HP_NODE_REBALANCE, 110).
-define(HP_EXHOOK, 100).
%% == Lowest Priority = 0, don't change this value as the plugins may depend on it.

View File

@ -13,6 +13,7 @@
{emqx_conf,2}.
{emqx_dashboard,1}.
{emqx_delayed,1}.
{emqx_eviction_agent,1}.
{emqx_exhook,1}.
{emqx_gateway_api_listeners,1}.
{emqx_gateway_cm,1}.
@ -26,6 +27,10 @@
{emqx_mgmt_cluster,1}.
{emqx_mgmt_trace,1}.
{emqx_mgmt_trace,2}.
{emqx_node_rebalance,1}.
{emqx_node_rebalance_api,1}.
{emqx_node_rebalance_evacuation,1}.
{emqx_node_rebalance_status,1}.
{emqx_persistent_session,1}.
{emqx_plugin_libs,1}.
{emqx_plugins,1}.

View File

@ -18,6 +18,7 @@
-module(emqx_channel).
-include("emqx.hrl").
-include("emqx_channel.hrl").
-include("emqx_mqtt.hrl").
-include("logger.hrl").
-include("types.hrl").
@ -57,6 +58,12 @@
clear_keepalive/1
]).
%% Export for emqx_channel implementations
-export([
maybe_nack/1,
maybe_mark_as_delivered/2
]).
%% Exports for CT
-export([set_field/3]).
@ -69,7 +76,7 @@
]
).
-export_type([channel/0, opts/0]).
-export_type([channel/0, opts/0, conn_state/0]).
-record(channel, {
%% MQTT ConnInfo
@ -131,33 +138,6 @@
quota_timer => expire_quota_limit
}).
-define(CHANNEL_METRICS, [
recv_pkt,
recv_msg,
'recv_msg.qos0',
'recv_msg.qos1',
'recv_msg.qos2',
'recv_msg.dropped',
'recv_msg.dropped.await_pubrel_timeout',
send_pkt,
send_msg,
'send_msg.qos0',
'send_msg.qos1',
'send_msg.qos2',
'send_msg.dropped',
'send_msg.dropped.expired',
'send_msg.dropped.queue_full',
'send_msg.dropped.too_large'
]).
-define(INFO_KEYS, [
conninfo,
conn_state,
clientinfo,
session,
will_msg
]).
-define(LIMITER_ROUTING, message_routing).
-dialyzer({no_match, [shutdown/4, ensure_timer/2, interval/2]}).
@ -1078,10 +1058,12 @@ handle_out(unsuback, {PacketId, _ReasonCodes}, Channel) ->
handle_out(disconnect, ReasonCode, Channel) when is_integer(ReasonCode) ->
ReasonName = disconnect_reason(ReasonCode),
handle_out(disconnect, {ReasonCode, ReasonName}, Channel);
handle_out(disconnect, {ReasonCode, ReasonName}, Channel = ?IS_MQTT_V5) ->
Packet = ?DISCONNECT_PACKET(ReasonCode),
handle_out(disconnect, {ReasonCode, ReasonName}, Channel) ->
handle_out(disconnect, {ReasonCode, ReasonName, #{}}, Channel);
handle_out(disconnect, {ReasonCode, ReasonName, Props}, Channel = ?IS_MQTT_V5) ->
Packet = ?DISCONNECT_PACKET(ReasonCode, Props),
{ok, [{outgoing, Packet}, {close, ReasonName}], Channel};
handle_out(disconnect, {_ReasonCode, ReasonName}, Channel) ->
handle_out(disconnect, {_ReasonCode, ReasonName, _Props}, Channel) ->
{ok, {close, ReasonName}, Channel};
handle_out(auth, {ReasonCode, Properties}, Channel) ->
{ok, ?AUTH_PACKET(ReasonCode, Properties), Channel};
@ -1198,13 +1180,19 @@ handle_call(
{takeover, 'end'},
Channel = #channel{
session = Session,
pendings = Pendings
pendings = Pendings,
conninfo = #{clientid := ClientId}
}
) ->
ok = emqx_session:takeover(Session),
%% TODO: Should not drain deliver here (side effect)
Delivers = emqx_utils:drain_deliver(),
AllPendings = lists:append(Delivers, Pendings),
?tp(
debug,
emqx_channel_takeover_end,
#{clientid => ClientId}
),
disconnect_and_shutdown(takenover, AllPendings, Channel);
handle_call(list_authz_cache, Channel) ->
{reply, emqx_authz_cache:list_authz_cache(), Channel};
@ -1276,6 +1264,8 @@ handle_info(die_if_test = Info, Channel) ->
die_if_test_compiled(),
?SLOG(error, #{msg => "unexpected_info", info => Info}),
{ok, Channel};
handle_info({disconnect, ReasonCode, ReasonName, Props}, Channel) ->
handle_out(disconnect, {ReasonCode, ReasonName, Props}, Channel);
handle_info(Info, Channel) ->
?SLOG(error, #{msg => "unexpected_info", info => Info}),
{ok, Channel}.

View File

@ -23,6 +23,8 @@
-include("logger.hrl").
-include("types.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-include_lib("stdlib/include/qlc.hrl").
-include_lib("stdlib/include/ms_transform.hrl").
-export([start_link/0]).
@ -72,6 +74,12 @@
get_session_confs/2
]).
%% Client management
-export([
channel_with_session_table/1,
live_connection_table/1
]).
%% gen_server callbacks
-export([
init/1,
@ -593,6 +601,40 @@ all_channels() ->
Pat = [{{'_', '$1'}, [], ['$1']}],
ets:select(?CHAN_TAB, Pat).
%% @doc Build a QLC query handle over ?CHAN_INFO_TAB.
%% Each answer is `{ClientId, ConnState, ConnInfo, ClientInfo}'. Only entries
%% whose conninfo has `clean_start := false' and whose `conn_mod' is one of
%% `ConnModules' are included. The function only builds the handle; the caller
%% evaluates it.
channel_with_session_table(ConnModules) ->
%% Reduce every `{{ClientId, ChanPid}, Info, Stats}' row to `{ClientId, Info}'.
Ms = ets:fun2ms(
fun({{ClientId, _ChanPid}, Info, _Stats}) ->
{ClientId, Info}
end
),
Table = ets:table(?CHAN_INFO_TAB, [{traverse, {select, Ms}}]),
%% Map used as a set for the connection-module membership test below.
ConnModuleMap = maps:from_list([{Mod, true} || Mod <- ConnModules]),
qlc:q([
{ClientId, ConnState, ConnInfo, ClientInfo}
|| {ClientId, #{
conn_state := ConnState,
clientinfo := ClientInfo,
conninfo := #{clean_start := false, conn_mod := ConnModule} = ConnInfo
}} <-
Table,
maps:is_key(ConnModule, ConnModuleMap)
]).
%% @doc Build a QLC query handle over ?CHAN_CONN_TAB yielding
%% `{ClientId, ChanPid}' for channels whose connection module is one of
%% `ConnModules' and whose pid passes is_channel_connected/1.
live_connection_table(ConnModules) ->
%% One match-spec clause per accepted connection module.
Ms = lists:map(fun live_connection_ms/1, ConnModules),
Table = ets:table(?CHAN_CONN_TAB, [{traverse, {select, Ms}}]),
qlc:q([{ClientId, ChanPid} || {ClientId, ChanPid} <- Table, is_channel_connected(ChanPid)]).
%% Match-spec clause: for rows shaped `{{ClientId, ChanPid}, ConnModule}' with
%% the given connection module, return `{ClientId, ChanPid}'.
live_connection_ms(ConnModule) ->
{{{'$1', '$2'}, ConnModule}, [], [{{'$1', '$2'}}]}.
%% True only for a pid of the local node that is present in ?CHAN_LIVE_TAB.
%% Anything that fails the guard (e.g. a pid of another node) is reported as
%% not connected.
is_channel_connected(ChanPid) when node(ChanPid) =:= node() ->
ets:member(?CHAN_LIVE_TAB, ChanPid);
is_channel_connected(_ChanPid) ->
false.
%% @doc Get all registered clientIDs. Debug/test interface
all_client_ids() ->
Pat = [{{'$1', '_'}, [], ['$1']}],
@ -693,7 +735,8 @@ code_change(_OldVsn, State, _Extra) ->
%%--------------------------------------------------------------------
clean_down({ChanPid, ClientId}) ->
do_unregister_channel({ClientId, ChanPid}).
do_unregister_channel({ClientId, ChanPid}),
ok = ?tp(debug, emqx_cm_clean_down, #{client_id => ClientId}).
stats_fun() ->
lists:foreach(fun update_stats/1, ?CHAN_STATS).
@ -719,12 +762,12 @@ get_chann_conn_mod(ClientId, ChanPid) ->
wrap_rpc(emqx_cm_proto_v1:get_chann_conn_mod(ClientId, ChanPid)).
mark_channel_connected(ChanPid) ->
?tp(emqx_cm_connected_client_count_inc, #{}),
?tp(emqx_cm_connected_client_count_inc, #{chan_pid => ChanPid}),
ets:insert_new(?CHAN_LIVE_TAB, {ChanPid, true}),
ok.
mark_channel_disconnected(ChanPid) ->
?tp(emqx_cm_connected_client_count_dec, #{}),
?tp(emqx_cm_connected_client_count_dec, #{chan_pid => ChanPid}),
ets:delete(?CHAN_LIVE_TAB, ChanPid),
ok.

View File

@ -167,9 +167,15 @@ handle_info(Info, State) ->
{noreply, State}.
terminate(_Reason, _State) ->
ok = ekka:unmonitor(membership),
emqx_stats:cancel_update(route_stats),
mnesia:unsubscribe({table, ?ROUTING_NODE, simple}).
try
ok = ekka:unmonitor(membership),
emqx_stats:cancel_update(route_stats),
mnesia:unsubscribe({table, ?ROUTING_NODE, simple})
catch
exit:{noproc, {gen_server, call, [mria_membership, _]}} ->
?SLOG(warning, #{msg => "mria_membership_down"}),
ok
end.
code_change(_OldVsn, State, _Extra) ->
{ok, State}.

View File

@ -20,6 +20,7 @@
set_default_config/0,
set_default_config/1,
set_default_config/2,
set_default_config/3,
request/2,
request/3,
request/4,
@ -40,11 +41,14 @@ set_default_config(DefaultUsername) ->
set_default_config(DefaultUsername, false).
set_default_config(DefaultUsername, HAProxyEnabled) ->
set_default_config(DefaultUsername, HAProxyEnabled, #{}).
set_default_config(DefaultUsername, HAProxyEnabled, Opts) ->
Config = #{
listeners => #{
http => #{
enable => true,
bind => 18083,
bind => maps:get(bind, Opts, 18083),
inet6 => false,
ipv6_v6only => false,
max_connections => 512,

View File

@ -149,8 +149,14 @@ basic_reboot_apps() ->
emqx_plugins
],
case emqx_release:edition() of
ce -> CE;
ee -> CE ++ []
ce ->
CE;
ee ->
CE ++
[
emqx_eviction_agent,
emqx_node_rebalance
]
end.
sorted_reboot_apps() ->

View File

@ -0,0 +1 @@
Add node rebalance/node evacuation [functionality](https://github.com/emqx/eip/blob/main/active/0020-node-rebalance.md).

View File

@ -0,0 +1 @@
添加节点再平衡/节点疏散[功能](https://github.com/emqx/eip/blob/main/active/0020-node-rebalance.md)。

View File

@ -0,0 +1,9 @@
emqx_eviction_agent
=====
An OTP library
Build
-----
$ rebar3 compile

View File

@ -0,0 +1,3 @@
##--------------------------------------------------------------------
## EMQX Eviction Agent Plugin
##--------------------------------------------------------------------

View File

@ -0,0 +1,14 @@
emqx_eviction_agent_api {
node_eviction_status_get {
desc {
en: "Get the node eviction status"
zh: "获取节点驱逐状态"
}
label {
en: "Node Eviction Status"
zh: "节点驱逐状态"
}
}
}

View File

@ -0,0 +1,2 @@
{deps, [{emqx, {path, "../../apps/emqx"}}]}.
{project_plugins, [erlfmt]}.

View File

@ -0,0 +1,22 @@
{application, emqx_eviction_agent, [
{description, "EMQX Eviction Agent"},
{vsn, "5.0.0"},
{registered, [
emqx_eviction_agent_sup,
emqx_eviction_agent,
emqx_eviction_agent_conn_sup
]},
{applications, [
kernel,
stdlib,
emqx_ctl
]},
{mod, {emqx_eviction_agent_app, []}},
{env, []},
{modules, []},
{maintainers, ["EMQX Team <contact@emqx.io>"]},
{links, [
{"Homepage", "https://emqx.io/"},
{"Github", "https://github.com/emqx"}
]}
]}.

View File

@ -0,0 +1,3 @@
%% -*- mode: erlang -*-
%% Unless you know what you are doing, DO NOT edit manually!!
{VSN, [{<<".*">>, []}], [{<<".*">>, []}]}.

View File

@ -0,0 +1,346 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent).
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("emqx/include/logger.hrl").
-include_lib("emqx/include/types.hrl").
-include_lib("emqx/include/emqx_hooks.hrl").
-include_lib("stdlib/include/qlc.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-export([
start_link/0,
enable/2,
disable/1,
status/0,
connection_count/0,
session_count/0,
session_count/1,
evict_connections/1,
evict_sessions/2,
evict_sessions/3,
evict_session_channel/3
]).
-behaviour(gen_server).
-export([
init/1,
handle_call/3,
handle_info/2,
handle_cast/2,
code_change/3
]).
-export([
on_connect/2,
on_connack/3
]).
-export([
hook/0,
unhook/0
]).
-export_type([server_reference/0]).
-define(CONN_MODULES, [emqx_connection, emqx_ws_connection, emqx_eviction_agent_channel]).
%%--------------------------------------------------------------------
%% APIs
%%--------------------------------------------------------------------
-type server_reference() :: binary() | undefined.
-type status() :: {enabled, conn_stats()} | disabled.
-type conn_stats() :: #{
connections := non_neg_integer(),
sessions := non_neg_integer()
}.
-type kind() :: atom().
%% @doc Start the eviction agent as a gen_server registered locally as ?MODULE.
-spec start_link() -> startlink_ret().
start_link() ->
gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
%% @doc Enable eviction on behalf of `Kind'.
%% `ServerReference' is stored together with the enabled state. Enabling again
%% with the same `Kind' replaces the stored server reference. When the agent is
%% already enabled by a different kind, `{error, eviction_agent_busy}' is
%% returned and nothing changes.
-spec enable(kind(), server_reference()) -> ok_or_error(eviction_agent_busy).
enable(Kind, ServerReference) ->
gen_server:call(?MODULE, {enable, Kind, ServerReference}).
%% @doc Disable eviction that was enabled on behalf of `Kind'.
%% Returns `ok' when the agent was enabled by `Kind', `{error, disabled}' when
%% it was not enabled at all and `{error, eviction_agent_busy}' when it is
%% enabled by a different kind. The spec lists the error replies because the
%% server really produces them.
-spec disable(kind()) -> ok_or_error(disabled | eviction_agent_busy).
disable(Kind) ->
    gen_server:call(?MODULE, {disable, Kind}).
%% @doc Report whether eviction is enabled on this node.
%% When enabled, the current connection/session counters (see stats/0) are
%% included; the kind and server reference are not exposed.
-spec status() -> status().
status() ->
case enable_status() of
{enabled, _Kind, _ServerReference} ->
{enabled, stats()};
disabled ->
disabled
end.
%% @doc Ask up to `N' live connections to disconnect, passing them the stored
%% server reference. Returns `{error, disabled}' when eviction is not enabled.
-spec evict_connections(pos_integer()) -> ok_or_error(disabled).
evict_connections(N) ->
case enable_status() of
{enabled, _Kind, ServerReference} ->
ok = do_evict_connections(N, ServerReference);
disabled ->
{error, disabled}
end.
%% @doc Evict up to `N' sessions to the given node(s), regardless of the
%% connection state of their channels.
%% A single node is wrapped into a list; the node list must be non-empty.
-spec evict_sessions(pos_integer(), node() | [node()]) -> ok_or_error(disabled).
evict_sessions(N, Node) when is_atom(Node) ->
    evict_sessions(N, [Node]);
%% Match a non-empty list directly instead of calling length/1 in a guard.
evict_sessions(N, [_ | _] = Nodes) ->
    evict_sessions(N, Nodes, any).
%% @doc Evict up to `N' sessions to the given node(s).
%% `ConnState' is either `any' or a connection state the channels must be in.
%% A single node is wrapped into a list; the node list must be non-empty.
%% Returns `{error, disabled}' when eviction is not enabled.
-spec evict_sessions(pos_integer(), node() | [node()], atom()) -> ok_or_error(disabled).
evict_sessions(N, Node, ConnState) when is_atom(Node) ->
    evict_sessions(N, [Node], ConnState);
%% Match a non-empty list directly instead of calling length/1 in a guard.
evict_sessions(N, [_ | _] = Nodes, ConnState) ->
    case enable_status() of
        {enabled, _Kind, _ServerReference} ->
            ok = do_evict_sessions(N, Nodes, ConnState);
        disabled ->
            {error, disabled}
    end.
%%--------------------------------------------------------------------
%% gen_server callbacks
%%--------------------------------------------------------------------
%% gen_server init: erase whatever enable state is stored in the ?MODULE
%% persistent term, so the agent starts disabled. The server state is an empty
%% map; the enable state lives in the persistent term only.
init([]) ->
_ = persistent_term:erase(?MODULE),
{ok, #{}}.
%% Synchronous requests:
%%   {enable, Kind, ServerReference} - answered by apply_enable/2
%%   {disable, Kind}                 - answered by apply_disable/1
%% Any other request is logged and answered with {error, unknown_call}.
%% The server state is never modified.
handle_call({enable, Kind, ServerReference}, _From, St) ->
    {reply, apply_enable(Kind, ServerReference), St};
handle_call({disable, Kind}, _From, St) ->
    {reply, apply_disable(Kind), St};
handle_call(Msg, _From, St) ->
    ?SLOG(warning, #{msg => "unknown_call", call => Msg, state => St}),
    {reply, {error, unknown_call}, St}.

%% Store the enabled state unless a different kind currently holds the agent.
apply_enable(Kind, ServerReference) ->
    case enable_status() of
        {enabled, HolderKind, _HeldReference} when HolderKind =/= Kind ->
            {error, eviction_agent_busy};
        disabled ->
            ok = persistent_term:put(?MODULE, {enabled, Kind, ServerReference});
        {enabled, _SameKind, _HeldReference} ->
            ok = persistent_term:put(?MODULE, {enabled, Kind, ServerReference})
    end.

%% Drop the enabled state, but only when it is held by the requesting kind.
apply_disable(Kind) ->
    case enable_status() of
        {enabled, HolderKind, _HeldReference} when HolderKind =/= Kind ->
            {error, eviction_agent_busy};
        {enabled, _SameKind, _HeldReference} ->
            _ = persistent_term:erase(?MODULE),
            ok;
        disabled ->
            {error, disabled}
    end.
%% No plain messages are expected; log whatever arrives and carry on.
handle_info(Msg, St) ->
?SLOG(warning, #{msg => "unknown_msg", info => Msg, state => St}),
{noreply, St}.
%% No casts are expected; log whatever arrives and carry on.
handle_cast(Msg, St) ->
?SLOG(warning, #{msg => "unknown_cast", cast => Msg, state => St}),
{noreply, St}.
%% Code upgrade callback: the state is kept as is.
code_change(_Vsn, State, _Extra) ->
{ok, State}.
%%--------------------------------------------------------------------
%% Hook callbacks
%%--------------------------------------------------------------------
%% 'client.connect' hook callback (registered in hook/0).
%% While eviction is enabled, stop the hook chain with an error carrying
%% ?RC_USE_ANOTHER_SERVER; otherwise return `ignore'.
on_connect(_ConnInfo, _Props) ->
case enable_status() of
{enabled, _Kind, _ServerReference} ->
{stop, {error, ?RC_USE_ANOTHER_SERVER}};
disabled ->
ignore
end.
%% 'client.connack' hook callback (registered in hook/0).
%% First clause: an MQTT v5 connection being answered with `use_another_server'.
%% While eviction is enabled, the stored server reference is added to the
%% properties as 'Server-Reference'.
%% Every other combination returns the properties unchanged.
on_connack(
#{proto_name := <<"MQTT">>, proto_ver := ?MQTT_PROTO_V5},
use_another_server,
Props
) ->
case enable_status() of
{enabled, _Kind, ServerReference} ->
{ok, Props#{'Server-Reference' => ServerReference}};
disabled ->
{ok, Props}
end;
on_connack(_ClientInfo, _Reason, Props) ->
{ok, Props}.
%%--------------------------------------------------------------------
%% Hook funcs
%%--------------------------------------------------------------------
%% Register on_connack/3 and on_connect/2 for the 'client.connack' and
%% 'client.connect' hook points with priority ?HP_NODE_REBALANCE, after
%% emitting the `eviction_agent_hook' trace point.
hook() ->
?tp(debug, eviction_agent_hook, #{}),
ok = emqx_hooks:put('client.connack', {?MODULE, on_connack, []}, ?HP_NODE_REBALANCE),
ok = emqx_hooks:put('client.connect', {?MODULE, on_connect, []}, ?HP_NODE_REBALANCE).
%% Remove both callbacks installed by hook/0, after emitting the
%% `eviction_agent_unhook' trace point.
unhook() ->
?tp(debug, eviction_agent_unhook, #{}),
ok = emqx_hooks:del('client.connect', {?MODULE, on_connect}),
ok = emqx_hooks:del('client.connack', {?MODULE, on_connack}).
%% Current enable state read from the ?MODULE persistent term:
%% `{enabled, Kind, ServerReference}' as stored by the enable request, or
%% `disabled' when nothing is stored.
enable_status() ->
persistent_term:get(?MODULE, disabled).
% connection management
%% Current number of live connections and of channels with sessions,
%% both counted at call time.
stats() ->
#{
connections => connection_count(),
sessions => session_count()
}.
%% Query handle over live connections whose connection module is one of
%% ?CONN_MODULES.
connection_table() ->
emqx_cm:live_connection_table(?CONN_MODULES).
%% Number of answers produced by connection_table/0.
connection_count() ->
table_count(connection_table()).
%% Query handle yielding `{ClientId, ConnInfo, ClientInfo}', built on top of
%% emqx_cm:channel_with_session_table/1 for ?CONN_MODULES.
%% `any' keeps channels in every connection state; any other value keeps only
%% channels whose connection state is exactly equal to it.
channel_with_session_table(any) ->
qlc:q([
{ClientId, ConnInfo, ClientInfo}
|| {ClientId, _, ConnInfo, ClientInfo} <-
emqx_cm:channel_with_session_table(?CONN_MODULES)
]);
channel_with_session_table(RequiredConnState) ->
qlc:q([
{ClientId, ConnInfo, ClientInfo}
|| {ClientId, ConnState, ConnInfo, ClientInfo} <-
emqx_cm:channel_with_session_table(?CONN_MODULES),
RequiredConnState =:= ConnState
]).
%% Number of channels with sessions, in any connection state.
session_count() ->
session_count(any).
%% Number of channels with sessions selected by `ConnState'
%% (see channel_with_session_table/1 for its meaning).
session_count(ConnState) ->
table_count(channel_with_session_table(ConnState)).
%% Count the answers of a query handle by folding over it; the answers
%% themselves are not collected.
table_count(QH) ->
qlc:fold(fun(_, Acc) -> Acc + 1 end, 0, QH).
%% Fetch up to `N' channel pids of live connections.
%% Only the first `N' answers are pulled through a QLC cursor. The cursor is
%% deleted in an `after' section so it is released even when fetching the
%% answers raises.
take_connections(N) ->
    ChanQH = qlc:q([ChanPid || {_ClientId, ChanPid} <- connection_table()]),
    ChanPidCursor = qlc:cursor(ChanQH),
    try
        qlc:next_answers(ChanPidCursor, N)
    after
        ok = qlc:delete_cursor(ChanPidCursor)
    end.
%% Fetch up to `N' `{ClientId, ConnInfo, ClientInfo}' entries of channels with
%% sessions selected by `ConnState'.
%% The QLC cursor is deleted in an `after' section so it is released even when
%% fetching the answers raises.
take_channel_with_sessions(N, ConnState) ->
    ChanPidCursor = qlc:cursor(channel_with_session_table(ConnState)),
    try
        qlc:next_answers(ChanPidCursor, N)
    after
        ok = qlc:delete_cursor(ChanPidCursor)
    end.
%% Send a disconnect request carrying `ServerReference' to up to `N' live
%% connections. Always returns `ok'.
do_evict_connections(N, ServerReference) when N > 0 ->
    _ = [disconnect_channel(Pid, ServerReference) || Pid <- take_connections(N)],
    ok.
%% Evict up to `N' sessions (selected by `ConnState') to nodes picked from
%% `Nodes'. Results of the individual evictions are ignored; returns `ok'.
do_evict_sessions(N, Nodes, ConnState) when N > 0 ->
    EvictOne = fun({ClientId, ConnInfo, ClientInfo}) ->
        evict_session_channel(Nodes, ClientId, ConnInfo, ClientInfo)
    end,
    lists:foreach(EvictOne, take_channel_with_sessions(N, ConnState)).
%% Ask a randomly chosen node from `Nodes' to start an eviction channel for
%% `ClientId' through emqx_eviction_agent_proto_v1:evict_session_channel/4.
%% `{badrpc, Reason}' is logged and turned into `{error, Reason}'; an
%% `{error, _}' reply is logged and returned as is; any other reply is
%% returned untouched.
evict_session_channel(Nodes, ClientId, ConnInfo, ClientInfo) ->
Node = select_random(Nodes),
?SLOG(
info,
#{
msg => "evict_session_channel",
client_id => ClientId,
node => Node,
conn_info => ConnInfo,
client_info => ClientInfo
}
),
case emqx_eviction_agent_proto_v1:evict_session_channel(Node, ClientId, ConnInfo, ClientInfo) of
{badrpc, Reason} ->
?SLOG(
error,
#{
msg => "evict_session_channel_rpc_error",
client_id => ClientId,
node => Node,
reason => Reason
}
),
{error, Reason};
{error, Reason} = Error ->
?SLOG(
error,
#{
msg => "evict_session_channel_error",
client_id => ClientId,
node => Node,
reason => Reason
}
),
Error;
Res ->
Res
end.
%% @doc Start a supervised emqx_eviction_agent_channel process on this node
%% for the given client. This is the function invoked remotely by
%% emqx_eviction_agent_proto_v1:evict_session_channel/4.
%% The request and its result are logged; the result of
%% emqx_eviction_agent_channel:start_supervised/1 is returned unchanged.
-spec evict_session_channel(
emqx_types:clientid(),
emqx_types:conninfo(),
emqx_types:clientinfo()
) -> supervisor:startchild_ret().
evict_session_channel(ClientId, ConnInfo, ClientInfo) ->
?SLOG(info, #{
msg => "evict_session_channel",
client_id => ClientId,
conn_info => ConnInfo,
client_info => ClientInfo
}),
Result = emqx_eviction_agent_channel:start_supervised(
#{
conninfo => ConnInfo,
clientinfo => ClientInfo
}
),
?SLOG(
info,
#{
msg => "evict_session_channel_result",
client_id => ClientId,
result => Result
}
),
Result.
%% Send the channel process a `{disconnect, ReasonCode, ReasonName, Props}'
%% message with reason ?RC_USE_ANOTHER_SERVER / `use_another_server' and the
%% server reference in the 'Server-Reference' property.
%% This is a plain send: no reply is awaited.
disconnect_channel(ChanPid, ServerReference) ->
ChanPid !
{disconnect, ?RC_USE_ANOTHER_SERVER, use_another_server, #{
'Server-Reference' => ServerReference
}}.
%% Pick one element of a non-empty list uniformly at random.
%% The non-empty requirement is a pattern match rather than a length/1 guard;
%% an empty list still fails with function_clause.
select_random([_ | _] = List) ->
    lists:nth(rand:uniform(length(List)), List).

View File

@ -0,0 +1,85 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_api).
-behaviour(minirest_api).
-include_lib("typerefl/include/types.hrl").
-include_lib("hocon/include/hoconsc.hrl").
-include_lib("emqx/include/logger.hrl").
%% Swagger specs from hocon schema
-export([
api_spec/0,
paths/0,
schema/1,
namespace/0
]).
-export([
fields/1,
roots/0
]).
%% API callbacks
-export([
'/node_eviction/status'/2
]).
-import(hoconsc, [mk/2, ref/1, ref/2]).
%% Namespace under which this module's schema structs are published.
namespace() -> "node_eviction".
%% API spec generated by emqx_dashboard_swagger from this module, with
%% schema checking switched on.
api_spec() ->
emqx_dashboard_swagger:spec(?MODULE, #{check_schema => true}).
%% Paths served by this module; each one has a matching schema/1 clause.
paths() ->
[
"/node_eviction/status"
].
%% Schema of GET /node_eviction/status. The operation id names the handler
%% function of this module; a 200 response follows schema_status/0.
schema("/node_eviction/status") ->
#{
'operationId' => '/node_eviction/status',
get => #{
tags => [<<"node_eviction">>],
summary => <<"Get node eviction status">>,
description => ?DESC("node_eviction_status_get"),
responses => #{
200 => schema_status()
}
}
}.
%% Handler of /node_eviction/status: always answers 200. The body carries the
%% eviction status and, when eviction is enabled, the agent's counters.
'/node_eviction/status'(_Bindings, _Params) ->
    Body =
        case emqx_eviction_agent:status() of
            {enabled, Stats} ->
                #{status => enabled, stats => Stats};
            disabled ->
                #{status => disabled}
        end,
    {200, Body}.
%% Response schema: either the `status_enabled' or the `status_disabled'
%% struct (see fields/1).
schema_status() ->
mk(hoconsc:union([ref(status_enabled), ref(status_disabled)]), #{}).
%% This schema module defines no root fields.
roots() -> [].
%% Struct definitions referenced by schema_status/0:
%%   status_enabled  - `status' fixed to `enabled' plus the `stats' struct
%%   stats           - integer `connections' and `sessions' counters
%%   status_disabled - `status' fixed to `disabled'
fields(status_enabled) ->
[
{status, mk(enabled, #{default => enabled})},
{stats, ref(stats)}
];
fields(stats) ->
[
{connections, mk(integer(), #{})},
{sessions, mk(integer(), #{})}
];
fields(status_disabled) ->
[
{status, mk(disabled, #{default => disabled})}
].

View File

@ -0,0 +1,24 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_app).
-behaviour(application).
-emqx_plugin(?MODULE).
-export([
start/2,
stop/1
]).
%% Application start: install the client hooks, start the supervision tree,
%% then register the CLI command. Each step is asserted, so an unexpected
%% result crashes the start with a badmatch.
start(_Type, _Args) ->
ok = emqx_eviction_agent:hook(),
{ok, Sup} = emqx_eviction_agent_sup:start_link(),
ok = emqx_eviction_agent_cli:load(),
{ok, Sup}.
%% Application stop: remove the client hooks and unregister the CLI command.
stop(_State) ->
ok = emqx_eviction_agent:unhook(),
ok = emqx_eviction_agent_cli:unload().

View File

@ -0,0 +1,368 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
%% MQTT Channel
-module(emqx_eviction_agent_channel).
-include_lib("emqx/include/emqx.hrl").
-include_lib("emqx/include/emqx_channel.hrl").
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("emqx/include/logger.hrl").
-include_lib("emqx/include/types.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-logger_header("[Evicted Channel]").
-export([
start_link/1,
start_supervised/1,
call/2,
call/3,
cast/2,
stop/1
]).
-export([
init/1,
handle_call/3,
handle_cast/2,
handle_info/2,
terminate/2,
code_change/3
]).
-import(
emqx_misc,
[
maybe_apply/2
]
).
-type opts() :: #{
conninfo := emqx_types:conninfo(),
clientinfo := emqx_types:clientinfo()
}.
%%--------------------------------------------------------------------
%% API
%%--------------------------------------------------------------------
%% @doc Start an evicted-channel process under emqx_eviction_agent_conn_sup.
%% The child id is the client id followed by "-" and a unique positive
%% integer. The child is `temporary', i.e. it is never restarted.
-spec start_supervised(opts()) -> supervisor:startchild_ret().
start_supervised(#{clientinfo := #{clientid := ClientId}} = Opts) ->
    Suffix = integer_to_binary(erlang:unique_integer([positive])),
    ChildId = iolist_to_binary([bin_clientid(ClientId), $-, Suffix]),
    supervisor:start_child(emqx_eviction_agent_conn_sup, #{
        id => ChildId,
        start => {?MODULE, start_link, [Opts]},
        restart => temporary,
        shutdown => 5000,
        type => worker,
        modules => [?MODULE]
    }).
%% @doc Start an unregistered evicted-channel gen_server linked to the caller.
%% `Opts' reaches init/1 wrapped in a one-element list.
-spec start_link(opts()) -> startlink_ret().
start_link(Opts) ->
gen_server:start_link(?MODULE, [Opts], []).
%% @doc Send an asynchronous request to a channel process.
-spec cast(pid(), term()) -> ok.
cast(Pid, Req) ->
gen_server:cast(Pid, Req).
%% @doc Send a synchronous request to a channel process and wait for the
%% reply without a time limit.
-spec call(pid(), term()) -> term().
call(Pid, Req) ->
call(Pid, Req, infinity).
%% @doc Send a synchronous request to a channel process, waiting at most
%% `Timeout' for the reply.
-spec call(pid(), term(), timeout()) -> term().
call(Pid, Req, Timeout) ->
gen_server:call(Pid, Req, Timeout).
%% @doc Stop a channel process through gen_server:stop/1.
-spec stop(pid()) -> ok.
stop(Pid) ->
gen_server:stop(Pid).
%%--------------------------------------------------------------------
%% gen_server API
%%--------------------------------------------------------------------
%% gen_server init: rebuild the connection/client info of the evicted client,
%% open its existing session and arm the session expiry timer.
%% The process traps exits. Startup stops with the error reason when the
%% session cannot be opened or the expiry timer cannot be set.
init([#{conninfo := OldConnInfo, clientinfo := #{clientid := ClientId} = OldClientInfo}]) ->
    process_flag(trap_exit, true),
    ClientInfo = clientinfo(OldClientInfo),
    ConnInfo = conninfo(OldConnInfo),
    case open_session(ConnInfo, ClientInfo) of
        {error, Reason} ->
            {stop, Reason};
        {ok, OpenedChannel} ->
            finish_init(ClientId, OpenedChannel)
    end.

%% Second half of init/1: arm the expiry timer, log, mark this process as a
%% disconnected channel and go into hibernation.
finish_init(ClientId, OpenedChannel) ->
    case set_expiry_timer(OpenedChannel) of
        {error, Reason} ->
            {stop, Reason};
        {ok, Channel} ->
            ?SLOG(
                info,
                #{
                    msg => "channel_initialized",
                    clientid => ClientId,
                    node => node()
                }
            ),
            ok = emqx_cm:mark_channel_disconnected(self()),
            {ok, Channel, hibernate}
    end.
%% Synchronous requests understood by the evicted channel:
%%   kick               - reply ok and stop with reason `kicked'
%%   discard            - reply ok and stop with reason `discarded'
%%   {takeover,'begin'} - reply with the session and set the takeover flag
%%   {takeover,'end'}   - hand over the session, reply with all pending
%%                        deliveries and stop normally
%%   list_authz_cache   - always an empty list
%%   list_acl_cache     - always an empty list (4.x name of the same request)
%%   {quota, Policy}    - acknowledged with ok and otherwise ignored
%% Anything else is logged as unexpected and answered with `ignored'.
handle_call(kick, _From, Channel) ->
    {stop, kicked, ok, Channel};
handle_call(discard, _From, Channel) ->
    {stop, discarded, ok, Channel};
handle_call({takeover, 'begin'}, _From, #{session := Session} = Channel) ->
    {reply, Session, Channel#{takeover => true}};
handle_call(
    {takeover, 'end'},
    _From,
    #{
        session := Session,
        clientinfo := #{clientid := ClientId},
        pendings := Pendings
    } = Channel
) ->
    ok = emqx_session:takeover(Session),
    %% TODO: Should not drain deliver here (side effect)
    Delivers = emqx_misc:drain_deliver(),
    AllPendings = lists:append(Delivers, Pendings),
    ?tp(
        debug,
        emqx_channel_takeover_end,
        #{clientid => ClientId}
    ),
    {stop, normal, AllPendings, Channel};
%% emqx_channel serves this request as `list_authz_cache'; answer it the same
%% way as the old name so it does not end up in the "unexpected_call" clause.
handle_call(list_authz_cache, _From, Channel) ->
    {reply, [], Channel};
handle_call(list_acl_cache, _From, Channel) ->
    {reply, [], Channel};
handle_call({quota, _Policy}, _From, Channel) ->
    {reply, ok, Channel};
handle_call(Req, _From, Channel) ->
    ?SLOG(
        error,
        #{
            msg => "unexpected_call",
            req => Req
        }
    ),
    {reply, ignored, Channel}.
%% Plain messages:
%%   {deliver, Topic, Msg} - handled together with every other delivery
%%                           drained by emqx_misc:drain_deliver/0
%%   expire_session        - the expiry timer fired; stop with reason `expired'
%% Anything else is logged as unexpected.
handle_info(Deliver = {deliver, _Topic, _Msg}, Channel) ->
Delivers = [Deliver | emqx_misc:drain_deliver()],
{noreply, handle_deliver(Delivers, Channel)};
handle_info(expire_session, Channel) ->
{stop, expired, Channel};
handle_info(Info, Channel) ->
?SLOG(
error,
#{
msg => "unexpected_info",
info => Info
}
),
{noreply, Channel}.
%% No casts are expected; log whatever arrives and carry on.
handle_cast(Msg, Channel) ->
?SLOG(error, #{msg => "unexpected_cast", cast => Msg}),
{noreply, Channel}.
%% Shutdown: cancel the expiry timer, persist the session only when the
%% process stops because the session expired, then let emqx_session run its
%% own termination for this client.
terminate(Reason, #{conninfo := ConnInfo, clientinfo := ClientInfo, session := Session} = Channel) ->
ok = cancel_expiry_timer(Channel),
(Reason =:= expired) andalso emqx_persistent_session:persist(ClientInfo, ConnInfo, Session),
emqx_session:terminate(ClientInfo, Reason, Session).
%% Code upgrade callback: the channel state is kept as is.
code_change(_OldVsn, Channel, _Extra) ->
{ok, Channel}.
%%--------------------------------------------------------------------
%% Internal functions
%%--------------------------------------------------------------------
%% TODO: sync with emqx_channel
%% Handle a batch of `{deliver, Topic, Msg}' messages for the evicted client.
%% Both clauses first run the batch through emqx_channel:maybe_nack/1 and
%% emqx_session:ignore_local/4.
%%   takeover in progress - the result is appended to `pendings', which is
%%                          handed out when the takeover ends
%%   otherwise            - the result is enqueued into the session and the
%%                          session is persisted
handle_deliver(
Delivers,
#{
takeover := true,
pendings := Pendings,
session := Session,
clientinfo := #{clientid := ClientId} = ClientInfo
} = Channel
) ->
%% NOTE: Order is important here. While the takeover is in
%% progress, the session cannot enqueue messages, since it already
%% passed on the queue to the new connection in the session state.
NPendings = lists:append(
Pendings,
emqx_session:ignore_local(ClientInfo, emqx_channel:maybe_nack(Delivers), ClientId, Session)
),
Channel#{pendings => NPendings};
handle_deliver(
Delivers,
#{
takeover := false,
session := Session,
clientinfo := #{clientid := ClientId} = ClientInfo
} = Channel
) ->
Delivers1 = emqx_channel:maybe_nack(Delivers),
Delivers2 = emqx_session:ignore_local(ClientInfo, Delivers1, ClientId, Session),
NSession = emqx_session:enqueue(ClientInfo, Delivers2, Session),
NChannel = persist(NSession, Channel),
%% We consider queued/dropped messages as delivered since they are now in the session state.
%% Note that this is called with the session as it was before enqueueing
%% and with the unfiltered batch.
emqx_channel:maybe_mark_as_delivered(Session, Delivers),
NChannel.
%% Cancel the session expiry timer when one was armed; a channel without a
%% timer reference is left alone. Always returns ok.
cancel_expiry_timer(#{expiry_timer := TRef}) when is_reference(TRef) ->
_ = erlang:cancel_timer(TRef),
ok;
cancel_expiry_timer(_) ->
ok.
%% Arm the session expiry timer according to `expiry_interval' in conninfo:
%%   ?UINT_MAX      - no timer is started
%%   positive value - `expire_session' is sent to this process when the
%%                    interval elapses; the timer reference is kept in the
%%                    channel
%%   anything else  - `{error, should_be_expired}'
%% Crashes when conninfo has no `expiry_interval' key.
set_expiry_timer(#{conninfo := ConnInfo} = Channel) ->
case maps:get(expiry_interval, ConnInfo) of
?UINT_MAX ->
{ok, Channel};
I when I > 0 ->
%% NOTE(review): timer:seconds/1 treats the stored interval as seconds -
%% confirm the unit conninfo uses for expiry_interval in this release.
Timer = erlang:send_after(timer:seconds(I), self(), expire_session),
{ok, Channel#{expiry_timer => Timer}};
_ ->
{error, should_be_expired}
end.
%% Open the client's existing session through emqx_cm (clean start = false)
%% and build the channel state around it.
%%   no session present - `{error, no_session}'
%%   session present    - the pending deliveries returned by emqx_cm, plus
%%                        those already in this process' mailbox, are
%%                        enqueued into the session; the channel info and
%%                        stats are stored in emqx_cm; `{ok, Channel}'
%%   error from emqx_cm - logged and returned unchanged
open_session(ConnInfo, #{clientid := ClientId} = ClientInfo) ->
Channel = channel(ConnInfo, ClientInfo),
case emqx_cm:open_session(false, ClientInfo, ConnInfo) of
{ok, #{present := false}} ->
?SLOG(
info,
#{
msg => "no_session",
clientid => ClientId,
node => node()
}
),
{error, no_session};
{ok, #{session := Session, present := true, pendings := Pendings0}} ->
?SLOG(
info,
#{
msg => "session_opened",
clientid => ClientId,
node => node()
}
),
%% lists:usort/1 removes duplicates and also sorts the combined list by term order.
Pendings1 = lists:usort(lists:append(Pendings0, emqx_misc:drain_deliver())),
NSession = emqx_session:enqueue(
ClientInfo,
emqx_session:ignore_local(
ClientInfo,
emqx_channel:maybe_nack(Pendings1),
ClientId,
Session
),
Session
),
NChannel = Channel#{session => NSession},
ok = emqx_cm:insert_channel_info(ClientId, info(NChannel), stats(NChannel)),
?SLOG(
info,
#{
msg => "channel_info_updated",
clientid => ClientId,
node => node()
}
),
{ok, NChannel};
{error, Reason} = Error ->
?SLOG(
error,
#{
msg => "session_open_failed",
clientid => ClientId,
node => node(),
reason => Reason
}
),
Error
end.
%% Derive the connection info of the evicted channel from the original one.
%% Only a fixed set of keys is carried over. On top of that the channel is
%% marked as not connected, this module becomes its connection module, and
%% `disconnected_at' keeps the original value or defaults to the current
%% system time in milliseconds.
conninfo(OldConnInfo) ->
    Overrides = #{
        conn_mod => ?MODULE,
        connected => false,
        disconnected_at =>
            maps:get(disconnected_at, OldConnInfo, erlang:system_time(millisecond))
    },
    KeptKeys = [
        socktype,
        sockname,
        peername,
        peercert,
        clientid,
        clean_start,
        receive_maximum,
        expiry_interval,
        connected_at,
        disconnected_at,
        keepalive
    ],
    maps:merge(maps:with(KeptKeys, OldConnInfo), Overrides).
%% Reduce the original client info to the fixed set of keys kept for an
%% evicted channel; keys missing from the input are simply absent.
clientinfo(OldClientInfo) ->
    KeptKeys = [
        zone,
        protocol,
        peerhost,
        sockport,
        clientid,
        username,
        is_bridge,
        is_superuser,
        mountpoint
    ],
    maps:with(KeptKeys, OldClientInfo).
%% Initial channel state: the given connection/client info, no expiry timer,
%% no takeover or resume in progress and no pending deliveries.
%% The `session' key is added later by open_session/2.
channel(ConnInfo, ClientInfo) ->
#{
conninfo => ConnInfo,
clientinfo => ClientInfo,
expiry_timer => undefined,
takeover => false,
resuming => false,
pendings => []
}.
%% Persist `Session' through emqx_persistent_session and store the session
%% it returns in the channel.
persist(Session, #{clientinfo := ClientInfo, conninfo := ConnInfo} = Channel) ->
Session1 = emqx_persistent_session:persist(ClientInfo, ConnInfo, Session),
Channel#{session => Session1}.
%% Channel info map registered with emqx_cm by open_session/2.
%% Keys missing from the channel are reported as `undefined'; the connection
%% state is always `disconnected'.
info(Channel) ->
#{
conninfo => maps:get(conninfo, Channel, undefined),
clientinfo => maps:get(clientinfo, Channel, undefined),
session => maybe_apply(
fun emqx_session:info/1,
maps:get(session, Channel, undefined)
),
conn_state => disconnected
}.
%% Channel stats: the session's stats followed by the ?CHANNEL_METRICS
%% counters read through emqx_pd.
stats(#{session := Session}) ->
lists:append(emqx_session:stats(Session), emqx_pd:get_counters(?CHANNEL_METRICS)).
%% Client id as a binary; atom client ids are converted, binaries are
%% returned as is. Any other type fails with function_clause.
bin_clientid(ClientId) when is_binary(ClientId) ->
ClientId;
bin_clientid(ClientId) when is_atom(ClientId) ->
atom_to_binary(ClientId).

View File

@ -0,0 +1,30 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_cli).
%% APIs
-export([
load/0,
unload/0,
cli/1
]).
%% Register the `eviction' command with emqx_ctl, handled by cli/1.
load() ->
emqx_ctl:register_command(eviction, {?MODULE, cli}, []).
%% Remove the `eviction' command registered by load/0.
unload() ->
emqx_ctl:unregister_command(eviction).
%% `eviction' command handler.
%% `status' prints whether eviction is enabled on this node (the counters
%% are not printed); any other argument list prints the usage.
cli(["status"]) ->
case emqx_eviction_agent:status() of
disabled ->
emqx_ctl:print("Eviction status: disabled~n");
{enabled, _Stats} ->
emqx_ctl:print("Eviction status: enabled~n")
end;
cli(_) ->
emqx_ctl:usage(
[{"eviction status", "Get current node eviction status"}]
).

View File

@ -0,0 +1,21 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_conn_sup).
-behaviour(supervisor).
-export([start_link/0]).
-export([init/1]).
%% Start the supervisor, registered locally under the module name.
start_link() ->
supervisor:start_link({local, ?MODULE}, ?MODULE, []).
%% Supervisor callback: one_for_one strategy with no static children.
init([]) ->
    SupFlags = #{strategy => one_for_one, intensity => 10, period => 3600},
    ChildSpecs = [],
    {ok, {SupFlags, ChildSpecs}}.

View File

@ -0,0 +1,34 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_sup).
-behaviour(supervisor).
-export([start_link/0]).
-export([init/1]).
%% Start the top-level supervisor, registered locally under the module name.
start_link() ->
supervisor:start_link({local, ?MODULE}, ?MODULE, []).
%% Supervisor callback: supervises the eviction agent worker and the
%% connection supervisor under a one_for_one strategy.
init([]) ->
    SupFlags = #{strategy => one_for_one, intensity => 10, period => 3600},
    Children = [
        child_spec(worker, emqx_eviction_agent, []),
        child_spec(supervisor, emqx_eviction_agent_conn_sup, [])
    ],
    {ok, {SupFlags, Children}}.
%% Build a permanent child spec for `Mod`, started via Mod:start_link(Args...).
%% `Type` is `worker` or `supervisor`; the shutdown value depends on it
%% (see shutdown_for/1).
child_spec(Type, Mod, Args) ->
    #{
        id => Mod,
        start => {Mod, start_link, Args},
        restart => permanent,
        shutdown => shutdown_for(Type),
        type => Type,
        modules => [Mod]
    }.

%% A child that is itself a supervisor gets `infinity` so that its subtree
%% has time to shut down; workers keep the 5 second timeout.
shutdown_for(supervisor) -> infinity;
shutdown_for(worker) -> 5000.

View File

@ -0,0 +1,27 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_proto_v1).
-behaviour(emqx_bpapi).
-export([
introduced_in/0,
evict_session_channel/4
]).
-include_lib("emqx/include/bpapi.hrl").
%% emqx_bpapi callback: the release in which this protocol version appeared.
introduced_in() ->
"5.0.22".
%% Call emqx_eviction_agent:evict_session_channel/3 on `Node` via rpc:call/4.
%% NOTE(review): the spec lists only supervisor:startchild_err() (plus badrpc).
%% If the remote function can also return a success value such as {ok, pid()},
%% that case is missing from the spec -- confirm against emqx_eviction_agent.
-spec evict_session_channel(
node(),
emqx_types:clientid(),
emqx_types:conninfo(),
emqx_types:clientinfo()
) -> supervisor:startchild_err() | emqx_rpc:badrpc().
evict_session_channel(Node, ClientId, ConnInfo, ClientInfo) ->
rpc:call(Node, emqx_eviction_agent, evict_session_channel, [ClientId, ConnInfo, ClientInfo]).

View File

@ -0,0 +1,403 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("emqx/include/asserts.hrl").
-import(
emqx_eviction_agent_test_helpers,
[emqtt_connect/0, emqtt_connect/1, emqtt_connect/2]
).
%% Assert that the value of `Code` matches the regular expression `Printed`.
%% Note the argument order: `Code` is the re:run/2 subject, `Printed` the pattern.
-define(assertPrinted(Printed, Code),
?assertMatch(
{match, _},
re:run(Code, Printed)
)
).
%% Common Test callback: the test case list is produced by the shared helper.
all() ->
emqx_common_test_helpers:all(?MODULE).
%% Start the application under test once for the whole suite.
init_per_suite(Config) ->
emqx_common_test_helpers:start_apps([emqx_eviction_agent]),
Config.
%% Stop the application started in init_per_suite/1.
end_per_suite(_Config) ->
emqx_common_test_helpers:stop_apps([emqx_eviction_agent]).
%% Per-case setup: make sure eviction is off (result ignored, it may already
%% be disabled), start snabbkaffe tracing, then start peer nodes if the case
%% needs them.
init_per_testcase(Case, Config) ->
_ = emqx_eviction_agent:disable(test_eviction),
ok = snabbkaffe:start_trace(),
start_slave(Case, Config).
%% Only t_explicit_session_takeover needs a cluster: start two peer nodes and
%% store the [{Node, Port}] list under `evacuate_nodes`. Other cases get the
%% config back unchanged.
start_slave(t_explicit_session_takeover, Config) ->
ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster(
[{evacuate_test1, 2883}, {evacuate_test2, 3883}],
[emqx_eviction_agent]
),
[{evacuate_nodes, ClusterNodes} | Config];
start_slave(_Case, Config) ->
Config.
%% Per-case teardown: disable eviction (result not checked), stop snabbkaffe,
%% then stop any peer nodes started for the case.
end_per_testcase(TestCase, Config) ->
emqx_eviction_agent:disable(test_eviction),
ok = snabbkaffe:stop(),
stop_slave(TestCase, Config).
%% Stop the peer nodes recorded under `evacuate_nodes` by start_slave/2;
%% a no-op for every other case.
stop_slave(t_explicit_session_takeover, Config) ->
emqx_eviction_agent_test_helpers:stop_cluster(
?config(evacuate_nodes, Config),
[emqx_eviction_agent]
);
stop_slave(_Case, _Config) ->
ok.
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% Walks through the enable/disable state machine of the eviction agent:
%% - while disabled, clients can connect;
%% - once enabled for kind `test_eviction`, enabling/disabling with another
%%   kind (`bar`) is rejected with `eviction_agent_busy`, re-enabling with the
%%   same kind succeeds, and new connections are refused with
%%   `use_another_server`;
%% - disabling twice yields `{error, disabled}` the second time;
%% - after disabling, clients can connect again.
t_enable_disable(_Config) ->
%% Trap exits so that a refused/closed emqtt client does not kill the test process.
erlang:process_flag(trap_exit, true),
?assertMatch(
disabled,
emqx_eviction_agent:status()
),
{ok, C0} = emqtt_connect(),
ok = emqtt:disconnect(C0),
ok = emqx_eviction_agent:enable(test_eviction, undefined),
?assertMatch(
{error, eviction_agent_busy},
emqx_eviction_agent:enable(bar, undefined)
),
?assertMatch(
ok,
emqx_eviction_agent:enable(test_eviction, <<"srv">>)
),
?assertMatch(
{enabled, #{}},
emqx_eviction_agent:status()
),
?assertMatch(
{error, {use_another_server, #{}}},
emqtt_connect()
),
?assertMatch(
{error, eviction_agent_busy},
emqx_eviction_agent:disable(bar)
),
?assertMatch(
ok,
emqx_eviction_agent:disable(test_eviction)
),
?assertMatch(
{error, disabled},
emqx_eviction_agent:disable(test_eviction)
),
?assertMatch(
disabled,
emqx_eviction_agent:status()
),
{ok, C1} = emqtt_connect(),
ok = emqtt:disconnect(C1).
%% evict_connections/1 is rejected while the agent is disabled; once enabled,
%% the status reports one connection, and zero after evicting it.
t_evict_connections_status(_Config) ->
erlang:process_flag(trap_exit, true),
{ok, _C} = emqtt_connect(),
{error, disabled} = emqx_eviction_agent:evict_connections(1),
ok = emqx_eviction_agent:enable(test_eviction, undefined),
?assertMatch(
{enabled, #{connections := 1, sessions := _}},
emqx_eviction_agent:status()
),
ok = emqx_eviction_agent:evict_connections(1),
%% Presumably the eviction completes asynchronously; give it time before
%% re-reading the status.
ct:sleep(100),
?assertMatch(
{enabled, #{connections := 0, sessions := _}},
emqx_eviction_agent:status()
),
ok = emqx_eviction_agent:disable(test_eviction).
%% End-to-end session evacuation on a two-node cluster:
%% 1. connect a persistent-session client to Node1 and subscribe to `t1`;
%% 2. evict the connection, then evict the session first to Node1 itself and
%%    then to Node2;
%% 3. publish a message after each evacuation;
%% 4. reconnect the original client and check that both messages arrive.
t_explicit_session_takeover(Config) ->
_ = erlang:process_flag(trap_exit, true),
ok = restart_emqx(),
[{Node1, Port1}, {Node2, _Port2}] = ?config(evacuate_nodes, Config),
{ok, C0} = emqtt_connect([
{clientid, <<"client_with_session">>},
{clean_start, false},
{port, Port1}
]),
{ok, _, _} = emqtt:subscribe(C0, <<"t1">>),
ok = rpc:call(Node1, emqx_eviction_agent, enable, [test_eviction, undefined]),
?assertEqual(
1,
rpc:call(Node1, emqx_eviction_agent, connection_count, [])
),
[ChanPid] = rpc:call(Node1, emqx_cm, lookup_channels, [<<"client_with_session">>]),
%% Evict the connection: the client process must exit with
%% RC_USE_ANOTHER_SERVER and the traced connected-client counter must drop
%% for this channel pid.
?assertWaitEvent(
begin
ok = rpc:call(Node1, emqx_eviction_agent, evict_connections, [1]),
receive
{'EXIT', C0, {disconnected, ?RC_USE_ANOTHER_SERVER, _}} -> ok
after 1000 ->
?assert(false, "Connection not evicted")
end
end,
#{?snk_kind := emqx_cm_connected_client_count_dec, chan_pid := ChanPid},
2000
),
?assertEqual(
0,
rpc:call(Node1, emqx_eviction_agent, connection_count, [])
),
?assertEqual(
1,
rpc:call(Node1, emqx_eviction_agent, session_count, [])
),
%% First, evacuate to the same node
?assertWaitEvent(
rpc:call(Node1, emqx_eviction_agent, evict_sessions, [1, Node1]),
#{?snk_kind := emqx_channel_takeover_end, clientid := <<"client_with_session">>},
1000
),
%% Eviction must be off for the publisher to be able to connect.
ok = rpc:call(Node1, emqx_eviction_agent, disable, [test_eviction]),
{ok, C1} = emqtt_connect([{port, Port1}]),
emqtt:publish(C1, <<"t1">>, <<"MessageToEvictedSession1">>),
ok = emqtt:disconnect(C1),
ok = rpc:call(Node1, emqx_eviction_agent, enable, [test_eviction, undefined]),
%% Evacuate to another node
?assertWaitEvent(
rpc:call(Node1, emqx_eviction_agent, evict_sessions, [1, Node2]),
#{?snk_kind := emqx_channel_takeover_end, clientid := <<"client_with_session">>},
1000
),
?assertEqual(
0,
rpc:call(Node1, emqx_eviction_agent, session_count, [])
),
?assertEqual(
1,
rpc:call(Node2, emqx_eviction_agent, session_count, [])
),
ok = rpc:call(Node1, emqx_eviction_agent, disable, [test_eviction]),
%% Session is on Node2, but we connect to Node1
{ok, C2} = emqtt_connect([{port, Port1}]),
emqtt:publish(C2, <<"t1">>, <<"MessageToEvictedSession2">>),
ok = emqtt:disconnect(C2),
ct:sleep(100),
%% Session is on Node2, but we connect the subscribed client to Node1
%% It should take over the session for the third time and receive
%% previously published messages
{ok, C3} = emqtt_connect([
{clientid, <<"client_with_session">>},
{clean_start, false},
{port, Port1}
]),
ok = assert_receive_publish(
[
#{payload => <<"MessageToEvictedSession1">>, topic => <<"t1">>},
#{payload => <<"MessageToEvictedSession2">>, topic => <<"t1">>}
]
),
ok = emqtt:disconnect(C3).
%% The enabled state must not survive a restart of the agent process:
%% after terminating and restarting the child, the status is `disabled`.
t_disable_on_restart(_Config) ->
ok = emqx_eviction_agent:enable(test_eviction, undefined),
ok = supervisor:terminate_child(emqx_eviction_agent_sup, emqx_eviction_agent),
{ok, _} = supervisor:restart_child(emqx_eviction_agent_sup, emqx_eviction_agent),
?assertEqual(
disabled,
emqx_eviction_agent:status()
).
%% After a session has been evacuated to the same node, it must still be
%% visible through the management API and CLI, and kicking it must remove it.
t_session_serialization(_Config) ->
    _ = erlang:process_flag(trap_exit, true),
    ok = restart_emqx(),

    {ok, C0} = emqtt_connect(<<"client_with_session">>, false),
    {ok, _, _} = emqtt:subscribe(C0, <<"t1">>),
    ok = emqtt:disconnect(C0),

    ok = emqx_eviction_agent:enable(test_eviction, undefined),

    ?assertEqual(
        1,
        emqx_eviction_agent:session_count()
    ),

    %% Evacuate to the same node
    ?assertWaitEvent(
        emqx_eviction_agent:evict_sessions(1, node()),
        #{?snk_kind := emqx_channel_takeover_end, clientid := <<"client_with_session">>},
        1000
    ),

    ok = emqx_eviction_agent:disable(test_eviction),

    ?assertEqual(
        1,
        emqx_eviction_agent:session_count()
    ),

    ?assertMatch(
        #{data := [#{clientid := <<"client_with_session">>}]},
        emqx_mgmt_api:cluster_query(
            emqx_channel_info,
            #{},
            [],
            fun emqx_mgmt_api_clients:qs2ms/2,
            fun emqx_mgmt_api_clients:format_channel_info/2
        )
    ),

    %% The emqx_ctl mock makes print/usage return the formatted text instead
    %% of printing it. Unload it as soon as the CLI assertions are done (even
    %% if they fail) so it cannot leak into later test cases or suites that
    %% rely on the real emqx_ctl.
    mock_print(),
    try
        ?assertPrinted(
            "client_with_session",
            emqx_mgmt_cli:clients(["list"])
        ),
        ?assertPrinted(
            "client_with_session",
            emqx_mgmt_cli:clients(["show", "client_with_session"])
        )
    after
        meck:unload(emqx_ctl)
    end,

    ?assertWaitEvent(
        emqx_cm:kick_session(<<"client_with_session">>),
        #{?snk_kind := emqx_cm_clean_down, client_id := <<"client_with_session">>},
        1000
    ),

    ?assertEqual(
        0,
        emqx_eviction_agent:session_count()
    ).
%% A client with a will message is disconnected by sending the channel process
%% a `disconnect` message with RC_USE_ANOTHER_SERVER; a second client
%% subscribed to the will topic must then receive the will message.
t_will_msg(_Config) ->
erlang:process_flag(trap_exit, true),
WillMsg = <<"will_msg">>,
WillTopic = <<"will_topic">>,
ClientId = <<"client_with_will">>,
%% The connect result is deliberately ignored; the channel lookup below
%% fails if the client did not connect.
_ = emqtt_connect([
{clean_start, false},
{clientid, ClientId},
{will_payload, WillMsg},
{will_topic, WillTopic}
]),
{ok, C} = emqtt_connect(),
{ok, _, _} = emqtt:subscribe(C, WillTopic),
[ChanPid] = emqx_cm:lookup_channels(ClientId),
ChanPid !
{disconnect, ?RC_USE_ANOTHER_SERVER, use_another_server, #{
'Server-Reference' => <<>>
}},
receive
{publish, #{
payload := WillMsg,
topic := WillTopic
}} ->
ok
after 1000 ->
ct:fail("Will message not received")
end,
ok = emqtt:disconnect(C).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
% sn_connect_and_subscribe(ClientId, Topic) ->
% emqx_eviction_agent_test_helpers:sn_connect_and_subscribe(ClientId, Topic).
%% Wait for each expected publish, in list order, in the caller's mailbox.
%% Every entry gets up to one second; a missing message fails the assertion.
assert_receive_publish([#{payload := Payload, topic := Topic} | Remaining]) ->
    receive
        {publish, #{payload := Payload, topic := Topic}} -> ok
    after 1000 ->
        ?assert(false, "Message `" ++ binary_to_list(Payload) ++ "` is lost")
    end,
    assert_receive_publish(Remaining);
assert_receive_publish([]) ->
    ok.
%% Connect a default client, publish one message to `Topic`, then disconnect.
%% The result of the publish call is not checked.
connect_and_publish(Topic, Message) ->
{ok, C} = emqtt_connect(),
emqtt:publish(C, Topic, Message),
ok = emqtt:disconnect(C).
%% Restart emqx and then emqx_eviction_agent, in that order.
%% Stop/start results are ignored on purpose; always returns ok.
restart_emqx() ->
    Restart = fun(App) ->
        _ = application:stop(App),
        _ = application:start(App)
    end,
    lists:foreach(Restart, [emqx, emqx_eviction_agent]),
    ok.
%% Replace emqx_ctl print/usage with versions that return the formatted text
%% (via emqx_ctl:format*/..), so tests can match on CLI output.
%% Any previous emqx_ctl mock is unloaded first; errors from that unload are
%% swallowed by `catch`. This function does not unload the new mock.
mock_print() ->
catch meck:unload(emqx_ctl),
meck:new(emqx_ctl, [non_strict, passthrough]),
meck:expect(emqx_ctl, print, fun(Arg) -> emqx_ctl:format(Arg, []) end),
meck:expect(emqx_ctl, print, fun(Msg, Arg) -> emqx_ctl:format(Msg, Arg) end),
meck:expect(emqx_ctl, usage, fun(Usages) -> emqx_ctl:format_usage(Usages) end),
meck:expect(emqx_ctl, usage, fun(Cmd, Descr) -> emqx_ctl:format_usage(Cmd, Descr) end).

View File

@ -0,0 +1,69 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_api_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-import(
emqx_mgmt_api_test_util,
[
request_api/2,
uri/1
]
).
%% Common Test callback: the test case list is produced by the shared helper.
all() ->
emqx_common_test_helpers:all(?MODULE).
%% Set up the management API test environment with emqx_eviction_agent.
init_per_suite(Config) ->
emqx_mgmt_api_test_util:init_suite([emqx_eviction_agent]),
Config.
%% Tear down what init_per_suite/1 set up.
end_per_suite(Config) ->
emqx_mgmt_api_test_util:end_suite([emqx_eviction_agent]),
Config.
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% GET node_eviction/status reports "disabled" initially, "enabled" together
%% with a stats object while eviction is on, and "disabled" again afterwards.
t_status(_Config) ->
?assertMatch(
{ok, #{<<"status">> := <<"disabled">>}},
api_get(["node_eviction", "status"])
),
ok = emqx_eviction_agent:enable(apitest, undefined),
?assertMatch(
{ok, #{
<<"status">> := <<"enabled">>,
<<"stats">> := #{}
}},
api_get(["node_eviction", "status"])
),
ok = emqx_eviction_agent:disable(apitest),
?assertMatch(
{ok, #{<<"status">> := <<"disabled">>}},
api_get(["node_eviction", "status"])
).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% Issue a GET request for `Path` and decode a successful response body as
%% JSON into maps. Request errors are returned unchanged.
api_get(Path) ->
    Url = uri(Path),
    case request_api(get, Url) of
        {error, _} = Error ->
            Error;
        {ok, ResponseBody} ->
            Body = list_to_binary(ResponseBody),
            {ok, jiffy:decode(Body, [return_maps])}
    end.

View File

@ -0,0 +1,251 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_channel_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-include_lib("emqx/include/emqx_mqtt.hrl").
-define(CLIENT_ID, <<"client_with_session">>).
-import(
emqx_eviction_agent_test_helpers,
[emqtt_connect/0, emqtt_connect/2]
).
%% Common Test callback: the test case list is produced by the shared helper.
all() ->
emqx_common_test_helpers:all(?MODULE).
%% Start emqx_conf and the eviction agent, then switch RPC port discovery
%% to `manual` for the duration of the suite.
init_per_suite(Config) ->
emqx_common_test_helpers:start_apps([emqx_conf, emqx_eviction_agent]),
{ok, _} = emqx:update_config([rpc, port_discovery], manual),
Config.
%% Stop the applications in reverse start order.
end_per_suite(_Config) ->
emqx_common_test_helpers:stop_apps([emqx_eviction_agent, emqx_conf]).
%% t_persistence needs the persistent session store: enable it in the config,
%% start its supervisor, initialise the backend and verify it is active.
%% Other cases need no setup.
init_per_testcase(t_persistence, Config) ->
emqx_config:put([persistent_session_store, enabled], true),
{ok, _} = emqx_persistent_session_sup:start_link(),
emqx_persistent_session:init_db_backend(),
?assert(emqx_persistent_session:is_store_enabled()),
Config;
init_per_testcase(_TestCase, Config) ->
Config.
%% Undo the t_persistence setup: turn the persistent session store off again
%% and verify it is inactive. Other cases need no teardown.
end_per_testcase(t_persistence, Config) ->
emqx_config:put([persistent_session_store, enabled], false),
emqx_persistent_session:init_db_backend(),
?assertNot(emqx_persistent_session:is_store_enabled()),
Config;
end_per_testcase(_TestCase, _Config) ->
ok.
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% Starting an eviction channel for a client id that was never connected in
%% this test must fail with `no_session`.
t_start_no_session(_Config) ->
Opts = #{
clientinfo => #{
clientid => ?CLIENT_ID,
zone => internal
},
conninfo => #{
clientid => ?CLIENT_ID,
receive_maximum => 32,
expiry_interval => 10000
}
},
?assertMatch(
{error, {no_session, _}},
emqx_eviction_agent_channel:start_supervised(Opts)
).
%% With `expiry_interval => 0` the channel must refuse to start and report
%% `should_be_expired`.
t_start_no_expire(_Config) ->
erlang:process_flag(trap_exit, true),
_ = emqtt_connect(?CLIENT_ID, false),
Opts = #{
clientinfo => #{
clientid => ?CLIENT_ID,
zone => internal
},
conninfo => #{
clientid => ?CLIENT_ID,
receive_maximum => 32,
expiry_interval => 0
}
},
?assertMatch(
{error, {should_be_expired, _}},
emqx_eviction_agent_channel:start_supervised(Opts)
).
%% With `expiry_interval => ?UINT_MAX` the channel must start successfully.
t_start_infinite_expire(_Config) ->
erlang:process_flag(trap_exit, true),
_ = emqtt_connect(?CLIENT_ID, false),
Opts = #{
clientinfo => #{
clientid => ?CLIENT_ID,
zone => internal
},
conninfo => #{
clientid => ?CLIENT_ID,
receive_maximum => 32,
expiry_interval => ?UINT_MAX
}
},
?assertMatch(
{ok, _},
emqx_eviction_agent_channel:start_supervised(Opts)
).
%% A `kick` call to a running eviction channel returns ok.
t_kick(_Config) ->
erlang:process_flag(trap_exit, true),
_ = emqtt_connect(?CLIENT_ID, false),
Opts = evict_session_opts(?CLIENT_ID),
{ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
?assertEqual(
ok,
emqx_eviction_agent_channel:call(Pid, kick)
).
%% A `discard` call to a running eviction channel returns ok.
t_discard(_Config) ->
erlang:process_flag(trap_exit, true),
_ = emqtt_connect(?CLIENT_ID, false),
Opts = evict_session_opts(?CLIENT_ID),
{ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
?assertEqual(
ok,
emqx_eviction_agent_channel:call(Pid, discard)
).
%% stop/1 on a running eviction channel returns ok.
t_stop(_Config) ->
erlang:process_flag(trap_exit, true),
_ = emqtt_connect(?CLIENT_ID, false),
Opts = evict_session_opts(?CLIENT_ID),
{ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
?assertEqual(
ok,
emqx_eviction_agent_channel:stop(Pid)
).
%% The channel must tolerate requests it does not handle: an unknown cast and
%% an unknown raw message are sent first, then calls are checked to still
%% answer ([] for list_acl_cache, ok for quota, `ignored` for anything else).
t_ignored_calls(_Config) ->
erlang:process_flag(trap_exit, true),
_ = emqtt_connect(?CLIENT_ID, false),
Opts = evict_session_opts(?CLIENT_ID),
{ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
ok = emqx_eviction_agent_channel:cast(Pid, unknown),
Pid ! unknown,
?assertEqual(
[],
emqx_eviction_agent_channel:call(Pid, list_acl_cache)
),
?assertEqual(
ok,
emqx_eviction_agent_channel:call(Pid, {quota, quota})
),
?assertEqual(
ignored,
emqx_eviction_agent_channel:call(Pid, unknown)
).
%% A channel started with `expiry_interval => 1` must no longer be alive
%% 1.5 seconds later.
t_expire(_Config) ->
erlang:process_flag(trap_exit, true),
_ = emqtt_connect(?CLIENT_ID, false),
#{conninfo := ConnInfo} = Opts0 = evict_session_opts(?CLIENT_ID),
Opts1 = Opts0#{conninfo => ConnInfo#{expiry_interval => 1}},
{ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts1),
ct:sleep(1500),
?assertNot(is_process_alive(Pid)).
%% The connected-client count is 1 while the real client is connected and
%% drops to 0 once an eviction channel has been started for the same client id.
t_get_connected_client_count(_Config) ->
erlang:process_flag(trap_exit, true),
_ = emqtt_connect(?CLIENT_ID, false),
?assertEqual(
1,
emqx_cm:get_connected_client_count()
),
Opts = evict_session_opts(?CLIENT_ID),
{ok, _} = emqx_eviction_agent_channel:start_supervised(Opts),
?assertEqual(
0,
emqx_cm:get_connected_client_count()
).
%% With the persistent session store enabled (see init_per_testcase/2), a
%% message published while the session is held by an eviction channel must be
%% delivered after that channel is kicked and the client reconnects.
t_persistence(_Config) ->
erlang:process_flag(trap_exit, true),
Topic = <<"t1">>,
Message = <<"message_to_persist">>,
{ok, C0} = emqtt_connect(?CLIENT_ID, false),
{ok, _, _} = emqtt:subscribe(C0, Topic, 0),
Opts = evict_session_opts(?CLIENT_ID),
{ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
{ok, C1} = emqtt_connect(),
{ok, _} = emqtt:publish(C1, Topic, Message, 1),
ok = emqtt:disconnect(C1),
%% Kill channel so that the session is only persisted
ok = emqx_eviction_agent_channel:call(Pid, kick),
%% Should restore session from persistent storage and receive messages
{ok, C2} = emqtt_connect(?CLIENT_ID, false),
receive
{publish, #{
payload := Message,
topic := Topic
}} ->
ok
after 1000 ->
ct:fail("message not received")
end,
ok = emqtt:disconnect(C2).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% Start options for an eviction channel: the `conninfo` and `clientinfo`
%% entries of the channel info currently registered for `ClientId`.
evict_session_opts(ClientId) ->
    ChanInfo = emqx_cm:get_chan_info(ClientId),
    maps:with([conninfo, clientinfo], ChanInfo).

View File

@ -0,0 +1,39 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_cli_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
%% Common Test callback: the test case list is produced by the shared helper.
all() ->
emqx_common_test_helpers:all(?MODULE).
%% Start the application under test once for the whole suite.
init_per_suite(Config) ->
emqx_common_test_helpers:start_apps([emqx_eviction_agent]),
Config.
%% Disable the `foo` eviction that t_status/1 leaves enabled (result ignored),
%% then stop the application.
end_per_suite(Config) ->
_ = emqx_eviction_agent:disable(foo),
emqx_common_test_helpers:stop_apps([emqx_eviction_agent]),
Config.
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% Smoke test for the CLI: every invocation must return ok. Eviction is left
%% enabled at the end; end_per_suite/1 disables it.
t_status(_Config) ->
%% usage
ok = emqx_eviction_agent_cli:cli(["foobar"]),
%% status
ok = emqx_eviction_agent_cli:cli(["status"]),
ok = emqx_eviction_agent:enable(foo, undefined),
%% status
ok = emqx_eviction_agent_cli:cli(["status"]).

View File

@ -0,0 +1,141 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_test_helpers).
-export([
emqtt_connect/0,
emqtt_connect/1,
emqtt_connect/2,
emqtt_connect_many/2,
stop_many/1,
emqtt_try_connect/1,
start_cluster/2,
start_cluster/3,
stop_cluster/2,
case_specific_node_name/2,
case_specific_node_name/3,
concat_atoms/1
]).
%% Connect with the default client id <<"client1">> and clean_start = true.
emqtt_connect() ->
emqtt_connect(<<"client1">>, true).
%% Connect with the given client id and clean_start flag.
emqtt_connect(ClientId, CleanStart) ->
emqtt_connect([{clientid, ClientId}, {clean_start, CleanStart}]).
%% Start an emqtt client with `Opts` followed by the suite defaults (MQTT v5
%% and a 600 s Session-Expiry-Interval), then connect it.
%% Returns {ok, Client} on success or the {error, _} tuple from emqtt:connect/1.
emqtt_connect(Opts) ->
    DefaultOpts = [
        {proto_ver, v5},
        {properties, #{'Session-Expiry-Interval' => 600}}
    ],
    {ok, Client} = emqtt:start_link(Opts ++ DefaultOpts),
    case emqtt:connect(Client) of
        {error, _} = Error -> Error;
        {ok, _} -> {ok, Client}
    end.
%% Connect `Count` persistent-session clients (client-1 .. client-Count) to
%% `Port` and return their pids in that order. Crashes if any connect fails.
emqtt_connect_many(Port, Count) ->
    [
        begin
            ClientId = <<"client-", (integer_to_binary(N))/binary>>,
            {ok, Client} = emqtt_connect([
                {clientid, ClientId}, {clean_start, false}, {port, Port}
            ]),
            Client
        end
     || N <- lists:seq(1, Count)
    ].
%% Best-effort disconnect of every client (failures are swallowed), followed
%% by a short pause.
stop_many(Clients) ->
    _ = [(catch emqtt:disconnect(Client)) || Client <- Clients],
    ct:sleep(100).
%% Check whether a connection with `Opts` is possible: on success the client
%% is disconnected again and ok is returned, otherwise the connect error.
emqtt_try_connect(Opts) ->
    Result = emqtt_connect(Opts),
    case Result of
        {error, _} ->
            Result;
        {ok, Client} ->
            emqtt:disconnect(Client),
            ok
    end.
%% Same as start_cluster/3 with no extra application environment.
start_cluster(NamesWithPorts, Apps) ->
start_cluster(NamesWithPorts, Apps, []).
%% Start a cluster of core nodes for tests.
%%
%% NamesWithPorts: [{ShortName, TcpPort}], one entry per node.
%% Apps:           applications to start on every node.
%% Env:            extra application environment entries for every node.
%%
%% Returns [{Node, TcpPort}]. snabbkaffe traces from every started node are
%% forwarded to the calling node.
start_cluster(NamesWithPorts, Apps, Env) ->
    Specs = lists:map(
        fun({ShortName, Port}) ->
            {core, ShortName, #{listener_ports => [{tcp, Port}]}}
        end,
        NamesWithPorts
    ),
    Opts0 = [
        %% A single `env` entry: with two entries under the same key, a
        %% first-match lookup would silently drop the caller-supplied Env.
        {env, [{emqx, boot_modules, [broker, listeners]}] ++ Env},
        {apps, Apps},
        {conf,
            [{[listeners, Proto, default, enabled], false} || Proto <- [ssl, ws, wss]] ++
                [{[rpc, mode], async}]}
    ],
    Cluster = emqx_common_test_helpers:emqx_cluster(
        Specs,
        Opts0
    ),
    NodesWithPorts = [
        {
            emqx_common_test_helpers:start_slave(Name, Opts),
            proplists:get_value(Name, NamesWithPorts)
        }
     || {Name, Opts} <- Cluster
    ],
    ok = lists:foreach(
        fun({Node, _Port}) ->
            snabbkaffe:forward_trace(Node)
        end,
        NodesWithPorts
    ),
    NodesWithPorts.
%% Stop `Apps` on every node of the cluster, then stop the node itself.
%% Results of the remote calls are ignored.
%% NOTE(review): stop_apps is invoked remotely with an empty argument list;
%% confirm emqx_common_test_helpers exports stop_apps/0, otherwise this call
%% only yields an ignored badrpc.
stop_cluster(NodesWithPorts, Apps) ->
    StopNode = fun({Node, _Port}) ->
        _ = [rpc:call(Node, application, stop, [App]) || App <- Apps],
        %% This sleep is just to make logs cleaner
        ct:sleep(100),
        _ = rpc:call(Node, emqx_common_test_helpers, stop_apps, []),
        emqx_common_test_helpers:stop_slave(Node)
    end,
    lists:foreach(StopNode, NodesWithPorts).
%% Node name atom of the form Module__Case.
case_specific_node_name(Module, Case) ->
concat_atoms([Module, '__', Case]).
%% Node name atom of the form Module__Case__Node.
case_specific_node_name(Module, Case, Node) ->
concat_atoms([Module, '__', Case, '__', Node]).
%% Concatenate the textual form of the given atoms into a single atom.
concat_atoms(Atoms) ->
    Parts = [atom_to_binary(Atom) || Atom <- Atoms],
    binary_to_atom(iolist_to_binary(Parts)).

View File

@ -0,0 +1,9 @@
emqx_node_rebalance
=====
An OTP library
Build
-----
$ rebar3 compile

View File

@ -0,0 +1,3 @@
##--------------------------------------------------------------------
## EMQX Node Rebalance Plugin
##--------------------------------------------------------------------

View File

@ -0,0 +1,490 @@
emqx_node_rebalance_api {
## API Request Fields
load_rebalance_status {
desc {
en: "Get rebalance status of the current node"
zh: "获取当前节点的rebalance状态"
}
label {
en: "Get rebalance status"
zh: "获取rebalance状态"
}
}
load_rebalance_global_status {
desc {
en: "Get status of all rebalance/evacuation processes across the cluster"
zh: "获取集群中所有rebalance/evacuation进程的状态"
}
label {
en: "Get global rebalance status"
zh: "获取全局rebalance状态"
}
}
load_rebalance_availability_check {
desc {
en: "Check if the node is being evacuated or rebalanced"
zh: "检查节点是否正在被evacuate或rebalance"
}
label {
en: "Availability check"
zh: "可用性检查"
}
}
load_rebalance_start {
desc {
en: "Start rebalance process"
zh: "启动rebalance进程"
}
label {
en: "Start rebalance"
zh: "启动rebalance"
}
}
load_rebalance_stop {
desc {
en: "Stop rebalance process"
zh: "停止rebalance进程"
}
label {
en: "Stop rebalance"
zh: "停止rebalance"
}
}
load_rebalance_evacuation_start {
desc {
en: "Start evacuation process"
zh: "启动evacuation进程"
}
label {
en: "Start evacuation"
zh: "启动evacuation"
}
}
load_rebalance_evacuation_stop {
desc {
en: "Stop evacuation process"
zh: "停止evacuation进程"
}
label {
en: "Stop evacuation"
zh: "停止evacuation"
}
}
param_node {
desc {
en: "Node name"
zh: "节点名称"
}
label {
en: "Node name"
zh: "节点名称"
}
}
wait_health_check {
desc {
en: "Time to wait before starting the rebalance process, in seconds"
zh: "启动rebalance进程前等待的时间，单位为秒"
}
label {
en: "Wait health check"
zh: "等待健康检查"
}
}
conn_evict_rate {
desc {
en: "The rate of evicting connections, in connections per second"
zh: "逐出连接的速率,以每秒连接数表示"
}
label {
en: "Connection eviction rate"
zh: "连接驱逐率"
}
}
sess_evict_rate {
desc {
en: "The rate of evicting sessions, in sessions per second"
zh: "逐出会话的速率,以每秒会话为单位"
}
label {
en: "Session eviction rate"
zh: "会话驱逐率"
}
}
abs_conn_threshold {
desc {
en: "Maximum desired difference between the number of connections on the node and the average number of connections on the recipient nodes"
zh: "节点上的连接数与接收节点上的平均连接数之间的最大期望差值"
}
label {
en: "Absolute connection threshold"
zh: "绝对连接阈值"
}
}
rel_conn_threshold {
desc {
en: "Maximum desired fraction between the number of connections on the node and the average number of connections on the recipient nodes"
zh: "节点上的连接数与接收节点上的平均连接数之间的最大期望分数"
}
label {
en: "Relative connection threshold"
zh: "相对连接阈值"
}
}
abs_sess_threshold {
desc {
en: "Maximum desired difference between the number of sessions on the node and the average number of sessions on the recipient nodes"
zh: "节点上的会话数与接收节点上的平均会话数之间的最大期望差异"
}
label {
en: "Absolute session threshold"
zh: "绝对会话阈值"
}
}
rel_sess_threshold {
desc {
en: "Maximum desired fraction between the number of sessions on the node and the average number of sessions on the recipient nodes"
zh: "节点上的会话数与接收节点上的平均会话数之间的最大期望分数"
}
label {
en: "Relative session threshold"
zh: "相对会话阈值"
}
}
wait_takeover {
desc {
en: "Time to wait before starting session evacuation process, in seconds"
zh: "开始会话疏散过程之前等待的时间,以秒为单位"
}
label {
en: "Wait takeover"
zh: "等待接管"
}
}
redirect_to {
desc {
en: "Server reference to redirect clients to (MQTTv5 Server redirection)"
zh: "将客户端重定向到的服务器参考（MQTTv5 服务器重定向）"
}
label {
en: "Redirect to"
zh: "重定向至"
}
}
migrate_to {
desc {
en: "Nodes to migrate sessions to"
zh: "将会话迁移到的节点"
}
label {
en: "Migrate to"
zh: "迁移到"
}
}
rebalance_nodes {
desc {
en: "Nodes to participate in rebalance"
zh: "参与rebalance的节点"
}
label {
en: "Rebalance nodes"
zh: "重新平衡节点"
}
}
## API Response Fields
local_status_enabled {
desc {
en: "Whether the node is being evacuated"
zh: "节点是否正在撤离"
}
label {
en: "Local evacuation status"
zh: "本节点疏散状态"
}
}
local_status_process {
desc {
en: "The process that is being performed on the node: evacuation or rebalance"
zh: "正在节点上执行的过程:疏散或重新平衡"
}
label {
en: "Node process"
zh: "节点进程"
}
}
local_status_state {
desc {
en: "The state of the process that is being performed on the node"
zh: "正在节点上执行的进程的状态"
}
label {
en: "Rebalance/evacuation current state"
zh: "重新平衡/疏散当前状态"
}
}
local_status_coordinator_node {
desc {
en: "The node that is coordinating rebalance process"
zh: "协调再平衡过程的节点"
}
label {
en: "Coordinator node"
zh: "协调节点"
}
}
local_status_connection_eviction_rate {
desc {
en: "The rate of evicting connections, in connections per second"
zh: "逐出连接的速率,以每秒连接数表示"
}
label {
en: "Connection eviction rate"
zh: "连接驱逐率"
}
}
local_status_session_eviction_rate {
desc {
en: "The rate of evicting sessions, in sessions per second"
zh: "逐出会话的速率,以每秒会话为单位"
}
label {
en: "Session eviction rate"
zh: "会话驱逐率"
}
}
local_status_connection_goal {
desc {
en: "The number of connections that the node should have after the rebalance/evacuation process"
zh: "节点在重新平衡/疏散过程后应该拥有的连接数"
}
label {
en: "Connection goal"
zh: "连接目标"
}
}
local_status_session_goal {
desc {
en: "The number of sessions that the node should have after the evacuation process"
zh: "疏散过程后节点应有的会话数"
}
label {
en: "Session goal"
zh: "会话目标"
}
}
local_status_disconnected_session_goal {
desc {
en: "The number of disconnected sessions that the node should have after the rebalance process"
zh: "重新平衡过程后节点应具有的断开连接的会话数"
}
label {
en: "Disconnected session goal"
zh: "断开连接的会话目标"
}
}
local_status_session_recipients {
desc {
en: "List of nodes to which sessions are being evacuated"
zh: "会话被疏散到的节点列表"
}
label {
en: "Session recipients"
zh: "会话接收节点"
}
}
local_status_recipients {
desc {
en: "List of nodes to which connections/sessions are being evacuated during rebalance"
zh: "在重新平衡期间连接/会话被疏散到的节点列表"
}
label {
en: "Recipients"
zh: "接收节点"
}
}
local_status_stats {
desc {
en: "Statistics of the evacuation/rebalance process"
zh: "疏散/再平衡过程的统计"
}
label {
en: "Statistics"
zh: "统计数据"
}
}
status_stats_initial_connected {
desc {
en: "The number of connections on the node before the evacuation/rebalance process"
zh: "疏散/重新平衡过程之前节点上的连接数"
}
label {
en: "Initial connected"
zh: "初始连接"
}
}
status_stats_current_connected {
desc {
en: "Current number of connections on the node"
zh: "节点上的当前连接数"
}
label {
en: "Current connections"
zh: "当前连接"
}
}
status_stats_initial_sessions {
desc {
en: "The number of sessions on the node before the evacuation/rebalance process"
zh: "疏散/重新平衡过程之前节点上的会话数"
}
label {
en: "Initial sessions"
zh: "初始会话"
}
}
status_stats_current_sessions {
desc {
en: "Current number of sessions on the node"
zh: "节点上的当前会话数"
}
label {
en: "Current sessions"
zh: "当前会话"
}
}
status_stats_current_disconnected_sessions {
desc {
en: "Current number of disconnected sessions on the node"
zh: "节点上当前断开连接的会话数"
}
label {
en: "Current disconnected sessions"
zh: "当前断开连接的会话"
}
}
coordinator_status_donors {
desc {
en: "List of nodes from which connections/sessions are being evacuated"
zh: "正在疏散连接/会话的节点列表"
}
label {
en: "Donors"
zh: "供体节点"
}
}
coordinator_status_donor_conn_avg {
desc {
en: "Average number of connections per donor node"
zh: "每个供体节点的平均连接数"
}
label {
en: "Donor connections average"
zh: "供体节点平均连接数"
}
}
coordinator_status_donor_sess_avg {
desc {
en: "Average number of sessions per donor node"
zh: "每个供体节点的平均会话数"
}
label {
en: "Donor sessions average"
zh: "供体节点平均会话数"
}
}
coordinator_status_node {
desc {
en: "The node that is coordinating the evacuation/rebalance process"
zh: "协调疏散/再平衡过程的节点"
}
label {
en: "Coordinator node"
zh: "协调节点"
}
}
evacuation_status_node {
desc {
en: "The node that is being evacuated"
zh: "正在撤离的节点"
}
label {
en: "Evacuated node"
zh: "疏散节点"
}
}
global_status_evacuations {
desc {
en: "List of nodes that are being evacuated"
zh: "正在撤离的节点列表"
}
label {
en: "Evacuations"
zh: "疏散"
}
}
global_status_rebalances {
desc {
en: "List of nodes that coordinate a rebalance"
zh: "协调再平衡的节点列表"
}
label {
en: "Rebalances"
zh: "再平衡"
}
}
empty_response {
desc {
en: "The response is empty"
zh: "响应为空"
}
label {
en: "Empty response"
zh: "空响应"
}
}
}

View File

@ -0,0 +1,33 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
%% Default eviction rates (presumably connections/sessions per second -- confirm
%% against the code that consumes them).
-define(DEFAULT_CONN_EVICT_RATE, 500).
-define(DEFAULT_SESS_EVICT_RATE, 500).
%% sec
-define(DEFAULT_WAIT_HEALTH_CHECK, 60).
%% sec
-define(DEFAULT_WAIT_TAKEOVER, 60).
%% Default absolute (count) and relative (ratio) thresholds for connections
%% and sessions.
-define(DEFAULT_ABS_CONN_THRESHOLD, 1000).
-define(DEFAULT_ABS_SESS_THRESHOLD, 1000).
-define(DEFAULT_REL_CONN_THRESHOLD, 1.1).
-define(DEFAULT_REL_SESS_THRESHOLD, 1.1).
%% NOTE(review): presumably milliseconds between eviction rounds -- confirm.
-define(EVICT_INTERVAL, 1000).
%% Name of the evacuation marker file (location is decided by the user of this macro).
-define(EVACUATION_FILENAME, <<".evacuation">>).

View File

@ -0,0 +1,2 @@
{deps, [{emqx, {path, "../../apps/emqx"}}]}.
{project_plugins, [erlfmt]}.

View File

@ -0,0 +1,22 @@
%% Application resource file for emqx_node_rebalance.
{application, emqx_node_rebalance, [
{description, "EMQX Node Rebalance"},
{vsn, "5.0.0"},
{registered, [
emqx_node_rebalance_sup,
emqx_node_rebalance,
emqx_node_rebalance_agent,
emqx_node_rebalance_evacuation
]},
%% NOTE(review): only kernel and stdlib are listed. If this application calls
%% into emqx (or other applications) at runtime, they presumably need to be
%% listed here so that they are started first -- confirm.
{applications, [
kernel,
stdlib
]},
{mod, {emqx_node_rebalance_app, []}},
{env, []},
{modules, []},
{maintainers, ["EMQX Team <contact@emqx.io>"]},
{links, [
{"Homepage", "https://emqx.io/"},
{"Github", "https://github.com/emqx"}
]}
]}.

View File

@ -0,0 +1,3 @@
%% -*- mode: erlang -*-
%% Unless you know what you are doing, DO NOT edit manually!!
%% Both the upgrade and the downgrade list match any version (".*") and carry
%% an empty instruction list.
{VSN, [{<<".*">>, []}], [{<<".*">>, []}]}.

View File

@ -0,0 +1,438 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance).
-include("emqx_node_rebalance.hrl").
-include_lib("emqx/include/logger.hrl").
-include_lib("emqx/include/types.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-export([
start/1,
status/0,
status/1,
stop/0
]).
-export([start_link/0]).
-behaviour(gen_statem).
-export([
init/1,
callback_mode/0,
handle_event/4,
code_change/4
]).
-export([
is_node_available/0,
available_nodes/1,
connection_count/0,
session_count/0,
disconnected_session_count/0
]).
-export_type([
start_opts/0,
start_error/0
]).
%%--------------------------------------------------------------------
%% APIs
%%--------------------------------------------------------------------
%% Options accepted by start/1. Every key is optional; start/1 merges the
%% given map over default_opts/0.
%%   conn_evict_rate / sess_evict_rate  - amount passed to each eviction call
%%   wait_health_check / wait_takeover  - delays in seconds (converted by seconds/1)
%%   abs_*_threshold / rel_*_threshold  - balance thresholds, see within_thresholds/3
%%   nodes                              - nodes participating in the rebalance
-type start_opts() :: #{
conn_evict_rate => pos_integer(),
sess_evict_rate => pos_integer(),
wait_health_check => pos_integer(),
wait_takeover => pos_integer(),
abs_conn_threshold => pos_integer(),
rel_conn_threshold => number(),
abs_sess_threshold => pos_integer(),
rel_sess_threshold => number(),
nodes => [node()]
}.
%% Error reasons of start/1.
%% NOTE(review): enable_rebalance/1 can also return `nothing_to_balance', which
%% is not covered by this type.
-type start_error() :: already_started | [{node(), term()}].
%% Ask the local coordinator to start a rebalance. Options missing from
%% `StartOpts' are taken from default_opts/0.
-spec start(start_opts()) -> ok_or_error(start_error()).
start(StartOpts) ->
    Defaults = default_opts(),
    gen_statem:call(?MODULE, {start, maps:merge(Defaults, StartOpts)}).
%% Ask the local coordinator to stop the running rebalance.
%% The coordinator replies `{error, not_started}' when it is in the `disabled' state.
-spec stop() -> ok_or_error(not_started).
stop() ->
    Request = stop,
    gen_statem:call(?MODULE, Request).
%% Query the status of the locally registered coordinator.
-spec status() -> disabled | {enabled, map()}.
status() ->
    Request = status,
    gen_statem:call(?MODULE, Request).
%% Query the status of the coordinator identified by `Pid'.
-spec status(pid()) -> disabled | {enabled, map()}.
status(Pid) ->
    Request = status,
    gen_statem:call(Pid, Request).
%% Start the coordinator state machine, registered locally under the module name.
-spec start_link() -> startlink_ret().
start_link() ->
    ServerName = {local, ?MODULE},
    gen_statem:start_link(ServerName, ?MODULE, [], []).
%% Return those of `Nodes' whose availability check replied with an atom;
%% any other reply (e.g. an RPC failure term) is dropped.
-spec available_nodes(list(node())) -> list(node()).
available_nodes(Nodes) when is_list(Nodes) ->
    {Replies, _BadNodes} = emqx_node_rebalance_proto_v1:available_nodes(Nodes),
    [Reply || Reply <- Replies, is_atom(Reply)].
%%--------------------------------------------------------------------
%% gen_statem callbacks
%%--------------------------------------------------------------------
%% All events are dispatched through the single handle_event/4 callback.
callback_mode() ->
    handle_event_function.
%% states: disabled, wait_health_check, evicting_conns, wait_takeover, evicting_sessions
%% The coordinator boots in the `disabled' state with empty data.
init([]) ->
    ?tp(debug, emqx_node_rebalance_started, #{}),
    InitialData = #{},
    {ok, disabled, InitialData}.
%% gen_statem event handler. State flow of a successful run:
%%   disabled -> wait_health_check -> evicting_conns -> wait_takeover
%%            -> evicting_sessions -> disabled
%% Transitions between the phases are driven by state timeouts.

%% start
%% Only accepted in the `disabled' state: enables the rebalance and schedules
%% connection eviction after the health-check wait.
handle_event(
{call, From},
{start, #{wait_health_check := WaitHealthCheck} = Opts},
disabled,
#{} = Data
) ->
case enable_rebalance(Data#{opts => Opts}) of
{ok, NewData} ->
?SLOG(warning, #{msg => "node_rebalance_enabled", opts => Opts}),
{next_state, wait_health_check, NewData, [
{state_timeout, seconds(WaitHealthCheck), evict_conns},
{reply, From, ok}
]};
{error, Reason} ->
?SLOG(warning, #{
msg => "node_rebalance_enable_failed",
reason => Reason
}),
{keep_state_and_data, [{reply, From, {error, Reason}}]}
end;
%% A start request in any other state means a rebalance is already running.
handle_event({call, From}, {start, _Opts}, _State, #{}) ->
{keep_state_and_data, [{reply, From, {error, already_started}}]};
%% stop
handle_event({call, From}, stop, disabled, #{}) ->
{keep_state_and_data, [{reply, From, {error, not_started}}]};
%% Stopping from any active state disables the agents on the donors and
%% returns to `disabled'.
handle_event({call, From}, stop, _State, Data) ->
ok = disable_rebalance(Data),
?SLOG(warning, #{msg => "node_rebalance_stopped"}),
{next_state, disabled, deinit(Data), [{reply, From, ok}]};
%% status
handle_event({call, From}, status, disabled, #{}) ->
{keep_state_and_data, [{reply, From, disabled}]};
%% In active states the reply is the state data extended with the current
%% state name and the coordinator node.
handle_event({call, From}, status, State, Data) ->
Stats = get_stats(State, Data),
{keep_state_and_data, [
{reply, From,
{enabled, Stats#{
state => State,
coordinator_node => node()
}}}
]};
%% conn eviction
%% Health-check wait is over: enter `evicting_conns' and trigger the first
%% eviction step immediately (timeout 0).
handle_event(
state_timeout,
evict_conns,
wait_health_check,
Data
) ->
?SLOG(warning, #{msg => "node_rebalance_wait_health_check_over"}),
{next_state, evicting_conns, Data, [{state_timeout, 0, evict_conns}]};
%% One connection eviction step; repeated every `evict_interval' until
%% evict_conns/1 reports that the thresholds are met.
handle_event(
state_timeout,
evict_conns,
evicting_conns,
#{
opts := #{
wait_takeover := WaitTakeover,
evict_interval := EvictInterval
}
} = Data
) ->
case evict_conns(Data) of
ok ->
?SLOG(warning, #{msg => "node_rebalance_evict_conns_over"}),
{next_state, wait_takeover, Data, [
{state_timeout, seconds(WaitTakeover), evict_sessions}
]};
{continue, NewData} ->
{keep_state, NewData, [{state_timeout, EvictInterval, evict_conns}]}
end;
%% Takeover wait is over: enter `evicting_sessions' and trigger the first
%% session eviction step immediately.
handle_event(
state_timeout,
evict_sessions,
wait_takeover,
Data
) ->
?SLOG(warning, #{msg => "node_rebalance_wait_takeover_over"}),
{next_state, evicting_sessions, Data, [{state_timeout, 0, evict_sessions}]};
%% One session eviction step; when the thresholds are met the rebalance is
%% finished: agents are disabled and the machine returns to `disabled'.
handle_event(
state_timeout,
evict_sessions,
evicting_sessions,
#{opts := #{evict_interval := EvictInterval}} = Data
) ->
case evict_sessions(Data) of
ok ->
?tp(debug, emqx_node_rebalance_evict_sess_over, #{}),
?SLOG(warning, #{msg => "node_rebalance_evict_sessions_over"}),
ok = disable_rebalance(Data),
?SLOG(warning, #{msg => "node_rebalance_finished_successfully"}),
{next_state, disabled, deinit(Data)};
{continue, NewData} ->
{keep_state, NewData, [{state_timeout, EvictInterval, evict_sessions}]}
end;
%% Unknown calls are answered with `ignored'; unknown info/cast messages are
%% logged and dropped.
handle_event({call, From}, Msg, _State, _Data) ->
?SLOG(warning, #{msg => "node_rebalance_unknown_call", call => Msg}),
{keep_state_and_data, [{reply, From, ignored}]};
handle_event(info, Msg, _State, _Data) ->
?SLOG(warning, #{msg => "node_rebalance_unknown_info", info => Msg}),
keep_state_and_data;
handle_event(cast, Msg, _State, _Data) ->
?SLOG(warning, #{msg => "node_rebalance_unknown_cast", cast => Msg}),
keep_state_and_data.
%% No state or data migration is needed on code upgrade.
code_change(_OldVsn, StateName, StateData, _Extra) ->
    {ok, StateName, StateData}.
%%--------------------------------------------------------------------
%% internal funs
%%--------------------------------------------------------------------
%% Prepare a rebalance run.
%%
%% Fetches connection and session counts from the participating nodes and
%% splits the nodes by connection count: nodes at or above the average become
%% donors, the rest become recipients. If need_rebalance/5 finds the imbalance
%% above the thresholds, the rebalance agent is enabled on every donor.
%%
%% Returns `{ok, Data}' with `donors', `recipients' and the initial counts
%% added, or `{error, nothing_to_balance}'.
%% Failures of the underlying multicall/3 are raised as errors.
%%
%% An empty node list is reported as `nothing_to_balance': without this
%% clause avg/1 would be called with [] and crash the coordinator.
enable_rebalance(#{opts := #{nodes := []}}) ->
    {error, nothing_to_balance};
enable_rebalance(#{opts := Opts} = Data) ->
    Nodes = maps:get(nodes, Opts),
    ConnCounts = multicall(Nodes, connection_counts, []),
    SessCounts = multicall(Nodes, session_counts, []),
    {_, Counts} = lists:unzip(ConnCounts),
    Avg = avg(Counts),
    {DonorCounts, RecipientCounts} = lists:partition(
        fun({_Node, Count}) ->
            Count >= Avg
        end,
        ConnCounts
    ),
    ?SLOG(warning, #{
        msg => "node_rebalance_enabling",
        conn_counts => ConnCounts,
        donor_counts => DonorCounts,
        recipient_counts => RecipientCounts
    }),
    {DonorNodes, _} = lists:unzip(DonorCounts),
    {RecipientNodes, _} = lists:unzip(RecipientCounts),
    case need_rebalance(DonorNodes, RecipientNodes, ConnCounts, SessCounts, Opts) of
        false ->
            {error, nothing_to_balance};
        true ->
            %% The coordinator pid is handed to the agents, which link to it.
            _ = multicall(DonorNodes, enable_rebalance_agent, [self()]),
            {ok, Data#{
                donors => DonorNodes,
                recipients => RecipientNodes,
                initial_conn_counts => maps:from_list(ConnCounts),
                initial_sess_counts => maps:from_list(SessCounts)
            }}
    end.
%% Disable the rebalance agent on every donor node on behalf of this coordinator.
disable_rebalance(#{donors := DonorNodes}) ->
    CoordinatorPid = self(),
    _ = multicall(DonorNodes, disable_rebalance_agent, [CoordinatorPid]),
    ok.
%% Perform one connection eviction step.
%%
%% Compares the average connection count of donors and recipients. Returns
%% `ok' when the donors are within the thresholds; otherwise asks every donor
%% that is above the recipients' average to evict `conn_evict_rate'
%% connections and returns `{continue, Data}' with refreshed statistics.
evict_conns(#{donors := DonorNodes, recipients := RecipientNodes, opts := Opts} = Data) ->
    DonorNodeCounts = multicall(DonorNodes, connection_counts, []),
    RecipientNodeCounts = multicall(RecipientNodes, connection_counts, []),
    DonorAvg = avg([Count || {_Node, Count} <- DonorNodeCounts]),
    RecipientAvg = avg([Count || {_Node, Count} <- RecipientNodeCounts]),
    case within_thresholds(DonorAvg, RecipientAvg, thresholds(conn, Opts)) of
        true ->
            ok;
        false ->
            ConnEvictRate = maps:get(conn_evict_rate, Opts),
            NodesToEvict = nodes_to_evict(RecipientAvg, DonorNodeCounts),
            ?SLOG(warning, #{
                msg => "node_rebalance_evict_conns",
                nodes => NodesToEvict,
                counts => ConnEvictRate
            }),
            _ = multicall(NodesToEvict, evict_connections, [ConnEvictRate]),
            {continue, Data#{
                donor_conn_avg => DonorAvg,
                recipient_conn_avg => RecipientAvg,
                donor_conn_counts => maps:from_list(DonorNodeCounts),
                recipient_conn_counts => maps:from_list(RecipientNodeCounts)
            }}
    end.
%% Perform one session eviction step.
%%
%% Compares the average number of disconnected sessions on donors and
%% recipients. Returns `ok' when the donors are within the thresholds;
%% otherwise asks every donor above the recipients' average to evict
%% `sess_evict_rate' disconnected sessions towards the recipients and returns
%% `{continue, Data}' with refreshed statistics.
evict_sessions(#{donors := DonorNodes, recipients := RecipientNodes, opts := Opts} = Data) ->
    DonorNodeCounts = multicall(DonorNodes, disconnected_session_counts, []),
    RecipientNodeCounts = multicall(RecipientNodes, disconnected_session_counts, []),
    DonorAvg = avg([Count || {_Node, Count} <- DonorNodeCounts]),
    RecipientAvg = avg([Count || {_Node, Count} <- RecipientNodeCounts]),
    case within_thresholds(DonorAvg, RecipientAvg, thresholds(sess, Opts)) of
        true ->
            ok;
        false ->
            SessEvictRate = maps:get(sess_evict_rate, Opts),
            NodesToEvict = nodes_to_evict(RecipientAvg, DonorNodeCounts),
            ?SLOG(warning, #{
                msg => "node_rebalance_evict_sessions",
                nodes => NodesToEvict,
                counts => SessEvictRate
            }),
            EvictArgs = [SessEvictRate, RecipientNodes, disconnected],
            _ = multicall(NodesToEvict, evict_sessions, EvictArgs),
            {continue, Data#{
                donor_sess_avg => DonorAvg,
                recipient_sess_avg => RecipientAvg,
                donor_sess_counts => maps:from_list(DonorNodeCounts),
                recipient_sess_counts => maps:from_list(RecipientNodeCounts)
            }}
    end.
%% Decide whether a rebalance is worthwhile.
%% Without donors or without recipients there is nothing to do. Otherwise a
%% rebalance is needed when either the connection or the session averages of
%% the donors are outside the thresholds relative to the recipients.
need_rebalance([], _RecipientNodes, _ConnCounts, _SessCounts, _Opts) ->
    false;
need_rebalance(_DonorNodes, [], _ConnCounts, _SessCounts, _Opts) ->
    false;
need_rebalance(DonorNodes, RecipientNodes, ConnCounts, SessCounts, Opts) ->
    DonorConnAvg = avg_for_nodes(DonorNodes, ConnCounts),
    RecipientConnAvg = avg_for_nodes(RecipientNodes, ConnCounts),
    DonorSessAvg = avg_for_nodes(DonorNodes, SessCounts),
    RecipientSessAvg = avg_for_nodes(RecipientNodes, SessCounts),
    %% "not (A andalso B)" is the De Morgan form of "(not A) orelse (not B)".
    Balanced =
        within_thresholds(DonorConnAvg, RecipientConnAvg, thresholds(conn, Opts)) andalso
            within_thresholds(DonorSessAvg, RecipientSessAvg, thresholds(sess, Opts)),
    Result = not Balanced,
    ?tp(
        debug,
        emqx_node_rebalance_need_rebalance,
        #{
            donors => DonorNodes,
            recipients => RecipientNodes,
            conn_counts => ConnCounts,
            sess_counts => SessCounts,
            opts => Opts,
            result => Result
        }
    ),
    Result.
%% Average of the counts that belong to `Nodes', taken from a `{Node, Count}' list.
avg_for_nodes(Nodes, Counts) ->
    CountsByNode = maps:from_list(Counts),
    Selected = maps:with(Nodes, CountsByNode),
    avg(maps:values(Selected)).
%% `Value' is within the thresholds of `GoalValue' when it exceeds the goal
%% by no more than the absolute threshold, or by no more than the relative factor.
within_thresholds(Value, GoalValue, {AbsThres, RelThres}) ->
    WithinAbsolute = Value =< GoalValue + AbsThres,
    WithinAbsolute orelse Value =< GoalValue * RelThres.
%% Pick the `{Absolute, Relative}' threshold pair for connections (`conn')
%% or sessions (`sess') out of the options map.
thresholds(conn, #{abs_conn_threshold := AbsThres, rel_conn_threshold := RelThres}) ->
    {AbsThres, RelThres};
thresholds(sess, #{abs_sess_threshold := AbsThres, rel_sess_threshold := RelThres}) ->
    {AbsThres, RelThres}.
%% Nodes whose count is strictly above `Goal', in their original order.
nodes_to_evict(Goal, NodeCounts) ->
    lists:foldr(
        fun
            ({Node, Count}, Acc) when Count > Goal -> [Node | Acc];
            ({_Node, _Count}, Acc) -> Acc
        end,
        [],
        NodeCounts
    ).
%% Statistics reported by the status call: nothing while disabled,
%% the whole state data otherwise.
get_stats(State, Data) ->
    case State of
        disabled -> #{};
        _ -> Data
    end.
%% Arithmetic mean of a non-empty list of numbers (always a float).
%% An empty list is not accepted and raises function_clause.
avg([_ | _] = Values) ->
    Total = lists:sum(Values),
    Total / length(Values).
%% Call function `F' of the rebalance RPC protocol module on `Nodes' with the
%% extra arguments `A'.
%% Returns `[{Node, Result}]' when every node answered `ok' or `{ok, Result}'.
%% Raises `error({bad_nodes, _})' when the call reports bad nodes or any node
%% answered with something else.
multicall(Nodes, F, A) ->
    {Results, BadNodes} = apply(emqx_node_rebalance_proto_v1, F, [Nodes | A]),
    case BadNodes of
        [] ->
            Replies = lists:zip(Nodes, Results),
            case lists:partition(fun is_ok/1, Replies) of
                {Succeeded, []} ->
                    [{Node, ok_result(Reply)} || {Node, Reply} <- Succeeded];
                {_, Failed} ->
                    error({bad_nodes, Failed})
            end;
        [_ | _] ->
            error({bad_nodes, BadNodes})
    end.
%% True for a `{Node, Reply}' pair whose reply is `ok' or `{ok, _}'.
is_ok({_Node, Reply}) ->
    case Reply of
        {ok, _} -> true;
        ok -> true;
        _ -> false
    end;
is_ok(_Other) ->
    false.
%% Strip the `ok' tag from a successful reply; a bare `ok' stays `ok'.
ok_result(ok) ->
    ok;
ok_result({ok, Value}) ->
    Value.
%% Connection count as reported by the local eviction agent, tagged with `ok'.
connection_count() ->
    Count = emqx_eviction_agent:connection_count(),
    {ok, Count}.
%% Session count as reported by the local eviction agent, tagged with `ok'.
session_count() ->
    Count = emqx_eviction_agent:session_count(),
    {ok, Count}.
%% Count of disconnected sessions as reported by the local eviction agent,
%% tagged with `ok'.
disconnected_session_count() ->
    Count = emqx_eviction_agent:session_count(disconnected),
    {ok, Count}.
%% Default start options: the header defaults plus all currently running nodes.
default_opts() ->
    #{
        nodes => all_nodes(),
        evict_interval => ?EVICT_INTERVAL,
        wait_health_check => ?DEFAULT_WAIT_HEALTH_CHECK,
        wait_takeover => ?DEFAULT_WAIT_TAKEOVER,
        conn_evict_rate => ?DEFAULT_CONN_EVICT_RATE,
        sess_evict_rate => ?DEFAULT_SESS_EVICT_RATE,
        abs_conn_threshold => ?DEFAULT_ABS_CONN_THRESHOLD,
        abs_sess_threshold => ?DEFAULT_ABS_SESS_THRESHOLD,
        rel_conn_threshold => ?DEFAULT_REL_CONN_THRESHOLD,
        rel_sess_threshold => ?DEFAULT_REL_SESS_THRESHOLD
    }.
%% Drop everything that belongs to a finished or stopped rebalance run from
%% the state data, so that nothing stale is carried into the `disabled' state.
%% `donors' and `recipients' are per-run values as well (they are set by
%% enable_rebalance/1) and are therefore removed together with the statistics.
deinit(Data) ->
    Keys = [
        donors,
        recipients,
        recipient_conn_avg,
        recipient_sess_avg,
        donor_conn_avg,
        donor_sess_avg,
        recipient_conn_counts,
        recipient_sess_counts,
        donor_conn_counts,
        donor_sess_counts,
        initial_conn_counts,
        initial_sess_counts,
        opts
    ],
    maps:without(Keys, Data).
%% Availability probe. Returns the local node name when the rebalance agent
%% process is registered and the eviction agent is disabled; crashes with
%% badmatch otherwise, so a non-atom reply means "not available"
%% (see available_nodes/1).
is_node_available() ->
    AgentPid = whereis(emqx_node_rebalance_agent),
    true = is_pid(AgentPid),
    disabled = emqx_eviction_agent:status(),
    node().
%% Nodes reported as running by mria.
all_nodes() ->
    RunningNodes = mria_mnesia:running_nodes(),
    RunningNodes.
%% Convert seconds (possibly fractional) to a whole number of milliseconds.
seconds(Sec) ->
    Millis = timer:seconds(Sec),
    round(Millis).

View File

@ -0,0 +1,131 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_agent).
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("emqx/include/logger.hrl").
-include_lib("emqx/include/types.hrl").
-include_lib("stdlib/include/qlc.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-export([
start_link/0,
enable/1,
disable/1,
status/0
]).
-export([
init/1,
handle_call/3,
handle_info/2,
handle_cast/2,
code_change/3
]).
-define(ENABLE_KIND, emqx_node_rebalance).
%%--------------------------------------------------------------------
%% APIs
%%--------------------------------------------------------------------
%% Agent status: `{enabled, CoordinatorPid}' while a coordinator holds the
%% agent, `disabled' otherwise.
-type status() :: {enabled, pid()} | disabled.
%% Start the agent server, registered locally under the module name.
-spec start_link() -> startlink_ret().
start_link() ->
    ServerName = {local, ?MODULE},
    gen_server:start_link(ServerName, ?MODULE, [], []).
%% Enable the local agent on behalf of the coordinator `CoordinatorPid'.
-spec enable(pid()) -> ok_or_error(already_enabled | eviction_agent_busy).
enable(CoordinatorPid) ->
    Request = {enable, CoordinatorPid},
    gen_server:call(?MODULE, Request).
%% Disable the local agent; only the coordinator that enabled it may do so.
-spec disable(pid()) -> ok_or_error(already_disabled | invalid_coordinator).
disable(CoordinatorPid) ->
    Request = {disable, CoordinatorPid},
    gen_server:call(?MODULE, Request).
%% Query whether the local agent is currently enabled, and by which coordinator.
-spec status() -> status().
status() ->
    Request = status,
    gen_server:call(?MODULE, Request).
%%--------------------------------------------------------------------
%% gen_server callbacks
%%--------------------------------------------------------------------
%% The agent starts disabled: an empty state map means "no coordinator".
init([]) ->
    InitialState = #{},
    {ok, InitialState}.
%% enable: refused while a coordinator is already registered. Otherwise the
%% agent links itself to the coordinator and to the eviction agent, then
%% enables the eviction agent; if that one is busy, the links are removed again.
handle_call({enable, _CoordinatorPid}, _From, #{coordinator_pid := _Pid} = St) ->
    {reply, {error, already_enabled}, St};
handle_call({enable, CoordinatorPid}, _From, St) ->
    true = link(CoordinatorPid),
    EvictionAgentPid = whereis(emqx_eviction_agent),
    true = link(EvictionAgentPid),
    case emqx_eviction_agent:enable(?ENABLE_KIND, undefined) of
        ok ->
            NewSt = #{
                coordinator_pid => CoordinatorPid,
                eviction_agent_pid => EvictionAgentPid
            },
            {reply, ok, NewSt};
        {error, eviction_agent_busy} ->
            true = unlink(EvictionAgentPid),
            true = unlink(CoordinatorPid),
            {reply, {error, eviction_agent_busy}, St}
    end;
%% disable: only honoured for the coordinator that enabled the agent.
handle_call(
    {disable, CoordinatorPid},
    _From,
    #{coordinator_pid := CoordinatorPid, eviction_agent_pid := EvictionAgentPid} = St
) ->
    _ = emqx_eviction_agent:disable(?ENABLE_KIND),
    true = unlink(EvictionAgentPid),
    true = unlink(CoordinatorPid),
    {reply, ok, maps:without([coordinator_pid, eviction_agent_pid], St)};
handle_call({disable, _CoordinatorPid}, _From, #{coordinator_pid := _OtherPid} = St) ->
    {reply, {error, invalid_coordinator}, St};
handle_call({disable, _CoordinatorPid}, _From, #{} = St) ->
    {reply, {error, already_disabled}, St};
%% status
handle_call(status, _From, #{coordinator_pid := Pid} = St) ->
    {reply, {enabled, Pid}, St};
handle_call(status, _From, St) ->
    {reply, disabled, St};
%% Anything else is logged and answered with `ignored'.
handle_call(Msg, _From, St) ->
    ?SLOG(warning, #{
        msg => "unknown_call",
        call => Msg,
        state => St
    }),
    {reply, ignored, St}.
%% No info messages are expected; log and ignore them.
handle_info(Msg, St) ->
    LogData = #{
        msg => "unknown_info",
        info => Msg,
        state => St
    },
    ?SLOG(warning, LogData),
    {noreply, St}.
%% No casts are expected; log and ignore them.
handle_cast(Msg, St) ->
    LogData = #{
        msg => "unknown_cast",
        cast => Msg,
        state => St
    },
    ?SLOG(warning, LogData),
    {noreply, St}.
%% No state migration is needed on code upgrade.
code_change(_OldVsn, St, _Extra) ->
    {ok, St}.

View File

@ -0,0 +1,738 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_api).
-behaviour(minirest_api).
-include_lib("typerefl/include/types.hrl").
-include_lib("hocon/include/hoconsc.hrl").
-include_lib("emqx/include/logger.hrl").
%% Swagger specs from hocon schema
-export([
api_spec/0,
paths/0,
schema/1,
namespace/0
]).
-export([
fields/1,
roots/0
]).
%% API callbacks
-export([
'/load_rebalance/status'/2,
'/load_rebalance/global_status'/2,
'/load_rebalance/availability_check'/2,
'/load_rebalance/:node/start'/2,
'/load_rebalance/:node/stop'/2,
'/load_rebalance/:node/evacuation/start'/2,
'/load_rebalance/:node/evacuation/stop'/2
]).
%% Schema examples
-export([
rebalance_example/0,
rebalance_evacuation_example/0,
translate/2
]).
-import(hoconsc, [mk/2, ref/1, ref/2]).
-import(emqx_dashboard_swagger, [error_codes/2]).
-define(BAD_REQUEST, 'BAD_REQUEST').
-define(NODE_UNAVAILABLE, 'NODE_UNAVAILABLE').
-define(NODE_EVACUATING, 'NODE_EVACUATING').
-define(RPC_ERROR, 'RPC_ERROR').
%%--------------------------------------------------------------------
%% API Spec
%%--------------------------------------------------------------------
%% Schema namespace of this API module.
namespace() ->
    "load_rebalance".
%% Swagger spec generated from this module's schema, with request schema
%% checking switched on.
api_spec() ->
    SpecOpts = #{check_schema => true},
    emqx_dashboard_swagger:spec(?MODULE, SpecOpts).
%% All REST paths served by this module: three read-only endpoints followed
%% by the per-node start/stop endpoints for rebalance and evacuation.
paths() ->
    ReadOnly = [
        "/load_rebalance/status",
        "/load_rebalance/global_status",
        "/load_rebalance/availability_check"
    ],
    PerNode = [
        "/load_rebalance/:node/start",
        "/load_rebalance/:node/stop",
        "/load_rebalance/:node/evacuation/start",
        "/load_rebalance/:node/evacuation/stop"
    ],
    ReadOnly ++ PerNode.
%% Swagger schema of every path listed in paths/0. The 'operationId' of each
%% entry is the name of the handler function exported by this module.

%% GET: rebalance/evacuation status of the local node.
schema("/load_rebalance/status") ->
#{
'operationId' => '/load_rebalance/status',
get => #{
tags => [<<"load_rebalance">>],
summary => <<"Get rebalance status">>,
description => ?DESC("load_rebalance_status"),
responses => #{
200 => local_status_response_schema()
}
}
};
%% GET: status of all evacuations and rebalances.
%% NOTE(review): the 200 response is declared with the generic response_schema/0
%% although fields(global_status) exists -- confirm whether that is intended.
schema("/load_rebalance/global_status") ->
#{
'operationId' => '/load_rebalance/global_status',
get => #{
tags => [<<"load_rebalance">>],
summary => <<"Get global rebalance status">>,
description => ?DESC("load_rebalance_global_status"),
responses => #{
200 => response_schema()
}
}
};
%% GET: 200 when the node is available, 503 while it is being evacuated.
schema("/load_rebalance/availability_check") ->
#{
'operationId' => '/load_rebalance/availability_check',
get => #{
tags => [<<"load_rebalance">>],
summary => <<"Node rebalance availability check">>,
description => ?DESC("load_rebalance_availability_check"),
responses => #{
200 => response_schema(),
503 => error_codes([?NODE_EVACUATING], <<"Node Evacuating">>)
}
}
};
%% POST: start a rebalance coordinated by the node given in the path.
schema("/load_rebalance/:node/start") ->
#{
'operationId' => '/load_rebalance/:node/start',
post => #{
tags => [<<"load_rebalance">>],
summary => <<"Start rebalancing with the node as coordinator">>,
description => ?DESC("load_rebalance_start"),
parameters => [param_node()],
'requestBody' =>
emqx_dashboard_swagger:schema_with_examples(
ref(rebalance_start),
rebalance_example()
),
responses => #{
200 => response_schema(),
400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>)
}
}
};
%% POST: stop the rebalance coordinated by the node given in the path.
schema("/load_rebalance/:node/stop") ->
#{
'operationId' => '/load_rebalance/:node/stop',
post => #{
tags => [<<"load_rebalance">>],
summary => <<"Stop rebalancing coordinated by the node">>,
description => ?DESC("load_rebalance_stop"),
parameters => [param_node()],
responses => #{
200 => response_schema(),
400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>)
}
}
};
%% POST: start evacuation of the node given in the path.
schema("/load_rebalance/:node/evacuation/start") ->
#{
'operationId' => '/load_rebalance/:node/evacuation/start',
post => #{
tags => [<<"load_rebalance">>],
summary => <<"Start evacuation on a node">>,
description => ?DESC("load_rebalance_evacuation_start"),
parameters => [param_node()],
'requestBody' =>
emqx_dashboard_swagger:schema_with_examples(
ref(rebalance_evacuation_start),
rebalance_evacuation_example()
),
responses => #{
200 => response_schema(),
400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>)
}
}
};
%% POST: stop evacuation of the node given in the path.
schema("/load_rebalance/:node/evacuation/stop") ->
#{
'operationId' => '/load_rebalance/:node/evacuation/stop',
post => #{
tags => [<<"load_rebalance">>],
summary => <<"Stop evacuation on a node">>,
description => ?DESC("load_rebalance_evacuation_stop"),
parameters => [param_node()],
responses => #{
200 => response_schema(),
400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>)
}
}
}.
%%--------------------------------------------------------------------
%% Handlers
%%--------------------------------------------------------------------
%% GET /load_rebalance/status: report the local rebalance/evacuation status.
'/load_rebalance/status'(get, #{}) ->
    Body =
        case emqx_node_rebalance_status:local_status() of
            disabled ->
                #{status => disabled};
            {Process, Stats} when Process =:= rebalance; Process =:= evacuation ->
                format_status(Process, Stats)
        end,
    {200, Body}.
%% GET /load_rebalance/global_status: list all evacuations and rebalances,
%% each entry extended with the node it belongs to.
'/load_rebalance/global_status'(get, #{}) ->
    GlobalStatus = emqx_node_rebalance_status:global_status(),
    #{evacuations := Evacuations, rebalances := Rebalances} = GlobalStatus,
    Body = #{
        evacuations => format_as_map_list(Evacuations),
        rebalances => format_as_map_list(Rebalances)
    },
    {200, Body}.
%% GET /load_rebalance/availability_check: 200 while the local eviction agent
%% is disabled, 503 while it is enabled.
'/load_rebalance/availability_check'(get, #{}) ->
    AgentStatus = emqx_eviction_agent:status(),
    case AgentStatus of
        {enabled, _Stats} ->
            error_response(503, ?NODE_EVACUATING, <<"Node Evacuating">>);
        disabled ->
            {200, #{}}
    end.
%% POST /load_rebalance/:node/start: validate the request body and the listed
%% nodes, then start a rebalance on the node given in the path.
'/load_rebalance/:node/start'(post, #{bindings := #{node := NodeBin}, body := Params0}) ->
    StartOn = fun(Node) ->
        Translated = translate(rebalance_start, Params0),
        DoStart = fun(Params) ->
            Result = emqx_node_rebalance_api_proto_v1:node_rebalance_start(Node, Params),
            wrap_rpc(Node, Result)
        end,
        with_nodes_at_key(nodes, Translated, DoStart)
    end,
    with_node(NodeBin, StartOn).
%% POST /load_rebalance/:node/stop: stop the rebalance on the node given in the path.
'/load_rebalance/:node/stop'(post, #{bindings := #{node := NodeBin}}) ->
    StopOn = fun(Node) ->
        Result = emqx_node_rebalance_api_proto_v1:node_rebalance_stop(Node),
        wrap_rpc(Node, Result)
    end,
    with_node(NodeBin, StopOn).
%% POST /load_rebalance/:node/evacuation/start: validate the request body and
%% the `migrate_to' nodes, then start evacuation on the node given in the path.
'/load_rebalance/:node/evacuation/start'(post, #{
    bindings := #{node := NodeBin}, body := Params0
}) ->
    StartOn = fun(Node) ->
        Translated = translate(rebalance_evacuation_start, Params0),
        DoStart = fun(Params) ->
            Result = emqx_node_rebalance_api_proto_v1:node_rebalance_evacuation_start(
                Node, Params
            ),
            wrap_rpc(Node, Result)
        end,
        with_nodes_at_key(migrate_to, Translated, DoStart)
    end,
    with_node(NodeBin, StartOn).
%% POST /load_rebalance/:node/evacuation/stop: stop evacuation on the node
%% given in the path.
'/load_rebalance/:node/evacuation/stop'(post, #{bindings := #{node := NodeBin}}) ->
    StopOn = fun(Node) ->
        Result = emqx_node_rebalance_api_proto_v1:node_rebalance_evacuation_stop(Node),
        wrap_rpc(Node, Result)
    end,
    with_node(NodeBin, StopOn).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% Translate the result of an RPC to `Node' into an HTTP response:
%% `ok' -> 200, `{error, _}' -> 400 BAD_REQUEST, `{badrpc, _}' -> 503 RPC_ERROR.
wrap_rpc(Node, RPCResult) ->
    case RPCResult of
        ok ->
            {200, #{}};
        {error, Reason} ->
            Message = io_lib:format("error on node ~p: ~p", [Node, Reason]),
            error_response(400, ?BAD_REQUEST, Message);
        {badrpc, Reason} ->
            Message = io_lib:format("RPC error on node ~p: ~p", [Node, Reason]),
            error_response(503, ?RPC_ERROR, Message)
    end.
%% Mark a stats map as belonging to an enabled `Process' (rebalance or evacuation).
format_status(Process, Stats) ->
    Marker = #{process => Process, status => enabled},
    maps:merge(Stats, Marker).
%% Validate the list of binary node names stored under `Key' in `Params'.
%% Returns `{ok, Params}' with the names replaced by node atoms,
%% `{error, {invalid, Names}}' when some names are not existing atoms, or
%% `{error, {unavailable, Nodes}}' when some nodes are not available.
%% Params without `Key' are passed through unchanged.
validate_nodes(Key, Params) when is_map_key(Key, Params) ->
    Parsed = [{BinNode, parse_node(BinNode)} || BinNode <- maps:get(Key, Params)],
    %% Both lists are kept in reverse input order.
    ValidNodes = lists:reverse([Node || {_BinNode, {ok, Node}} <- Parsed]),
    InvalidNodes = lists:reverse([BinNode || {BinNode, {error, _}} <- Parsed]),
    case InvalidNodes of
        [_ | _] ->
            {error, {invalid, InvalidNodes}};
        [] ->
            case emqx_node_rebalance_evacuation:available_nodes(ValidNodes) of
                ValidNodes -> {ok, Params#{Key => ValidNodes}};
                OtherNodes -> {error, {unavailable, ValidNodes -- OtherNodes}}
            end
    end;
validate_nodes(_Key, Params) ->
    {ok, Params}.
%% Run `Fun' with the node atom parsed from `BinNode', or answer 400 when
%% the name is not a known atom.
with_node(BinNode, Fun) ->
    case parse_node(BinNode) of
        {error, _} ->
            Message = [<<"Invalid node: ">>, BinNode],
            error_response(400, ?BAD_REQUEST, Message);
        {ok, Node} ->
            Fun(Node)
    end.
%% Run `Fun' with `Params' after the node list under `Key' has been
%% validated; answer 400 when nodes are invalid or unavailable.
with_nodes_at_key(Key, Params, Fun) ->
    case validate_nodes(Key, Params) of
        {ok, ValidatedParams} ->
            Fun(ValidatedParams);
        {error, {unavailable, Nodes}} ->
            Message = io_lib:format("Nodes unavailable: ~p", [Nodes]),
            error_response(400, ?NODE_UNAVAILABLE, Message);
        {error, {invalid, Nodes}} ->
            Message = io_lib:format("Invalid nodes: ~p", [Nodes]),
            error_response(400, ?BAD_REQUEST, Message)
    end.
%% Convert a binary node name to an atom without creating new atoms.
%% Unknown names yield `{error, {unknown, Bin}}'.
parse_node(Bin) when is_binary(Bin) ->
    try binary_to_existing_atom(Bin) of
        Node -> {ok, Node}
    catch
        error:badarg -> {error, {unknown, Bin}}
    end.
%% Turn a `[{Node, InfoMap}]' list into a list of maps that carry the node
%% under the `node' key.
format_as_map_list(List) ->
    lists:foldr(
        fun({Node, Info}, Acc) -> [Info#{node => Node} | Acc] end,
        [],
        List
    ).
%% Build an error response: the error code atom and the iolist message are
%% both converted to binaries.
error_response(HttpCode, Code, Message) ->
    CodeBin = atom_to_binary(Code),
    MessageBin = iolist_to_binary(Message),
    {HttpCode, #{code => CodeBin, message => MessageBin}}.
%% Remove the entries whose key is listed in `Keys' from a `{Key, Value}' list.
without(Keys, Props) ->
    IsKept = fun({Key, _Value}) -> lists:member(Key, Keys) =:= false end,
    lists:filter(IsKept, Props).
%%------------------------------------------------------------------------------
%% Schema
%%------------------------------------------------------------------------------
%% Check `Conf' against the fields of schema `Ref' of this module and return
%% the checked configuration with atom keys.
translate(Ref, Conf) ->
    RawConf = #{atom_to_binary(Ref) => Conf},
    Checked = hocon_tconf:check_plain(?MODULE, RawConf, #{atom_key => true}, [Ref]),
    #{Ref := TranslatedConf} = Checked,
    TranslatedConf.
%% Schema of the mandatory `node' path parameter.
param_node() ->
    Meta = #{
        in => path,
        desc => ?DESC(param_node),
        required => true
    },
    {node, mk(binary(), Meta)}.
%% Hocon schema fields of the request and response structures of this API.

%% Request body of "/load_rebalance/:node/start". All fields are optional.
fields(rebalance_start) ->
[
{"wait_health_check",
mk(
emqx_schema:duration_s(),
#{
desc => ?DESC(wait_health_check),
required => false
}
)},
{"conn_evict_rate",
mk(
pos_integer(),
#{
desc => ?DESC(conn_evict_rate),
required => false
}
)},
{"sess_evict_rate",
mk(
pos_integer(),
#{
desc => ?DESC(sess_evict_rate),
required => false
}
)},
{"abs_conn_threshold",
mk(
pos_integer(),
#{
desc => ?DESC(abs_conn_threshold),
required => false
}
)},
%% Relative thresholds must be strictly greater than 1.0.
{"rel_conn_threshold",
mk(
number(),
#{
desc => ?DESC(rel_conn_threshold),
required => false,
validator => [fun(Value) -> Value > 1.0 end]
}
)},
{"abs_sess_threshold",
mk(
pos_integer(),
#{
desc => ?DESC(abs_sess_threshold),
required => false
}
)},
{"rel_sess_threshold",
mk(
number(),
#{
desc => ?DESC(rel_sess_threshold),
required => false,
validator => [fun(Value) -> Value > 1.0 end]
}
)},
{"wait_takeover",
mk(
emqx_schema:duration_s(),
#{
desc => ?DESC(wait_takeover),
required => false
}
)},
%% When given, the node list must not be empty.
{"nodes",
mk(
list(binary()),
#{
desc => ?DESC(rebalance_nodes),
required => false,
validator => [fun(Values) -> length(Values) > 0 end]
}
)}
];
%% Request body of "/load_rebalance/:node/evacuation/start". All fields are optional.
fields(rebalance_evacuation_start) ->
[
{"conn_evict_rate",
mk(
pos_integer(),
#{
desc => ?DESC(conn_evict_rate),
required => false
}
)},
{"sess_evict_rate",
mk(
pos_integer(),
#{
desc => ?DESC(sess_evict_rate),
required => false
}
)},
{"redirect_to",
mk(
binary(),
#{
desc => ?DESC(redirect_to),
required => false
}
)},
%% NOTE(review): typed as pos_integer() here but as emqx_schema:duration_s()
%% in rebalance_start -- confirm whether the difference is intended.
{"wait_takeover",
mk(
pos_integer(),
#{
desc => ?DESC(wait_takeover),
required => false
}
)},
%% When given, the node list must not be empty.
{"migrate_to",
mk(
list(binary()),
#{
desc => ?DESC(migrate_to),
required => false,
validator => [fun(Values) -> length(Values) > 0 end]
}
)}
];
%% Local status response when no process is running.
%% NOTE(review): reuses the `local_status_enabled' description key -- confirm
%% that this is intended.
fields(local_status_disabled) ->
[
{"status",
mk(
disabled,
#{
desc => ?DESC(local_status_enabled),
required => true
}
)}
];
%% Local status response while a rebalance or evacuation is running.
fields(local_status_enabled) ->
[
{"status",
mk(
enabled,
#{
desc => ?DESC(local_status_enabled),
required => true
}
)},
{"process",
mk(
hoconsc:union([rebalance, evacuation]),
#{
desc => ?DESC(local_status_process),
required => true
}
)},
{"state",
mk(
atom(),
#{
desc => ?DESC(local_status_state),
required => true
}
)},
{"coordinator_node",
mk(
binary(),
#{
desc => ?DESC(local_status_coordinator_node),
required => false
}
)},
{"connection_eviction_rate",
mk(
pos_integer(),
#{
desc => ?DESC(local_status_connection_eviction_rate),
required => false
}
)},
{"session_eviction_rate",
mk(
pos_integer(),
#{
desc => ?DESC(local_status_session_eviction_rate),
required => false
}
)},
{"connection_goal",
mk(
non_neg_integer(),
#{
desc => ?DESC(local_status_connection_goal),
required => false
}
)},
{"session_goal",
mk(
non_neg_integer(),
#{
desc => ?DESC(local_status_session_goal),
required => false
}
)},
{"disconnected_session_goal",
mk(
non_neg_integer(),
#{
desc => ?DESC(local_status_disconnected_session_goal),
required => false
}
)},
{"session_recipients",
mk(
list(binary()),
#{
desc => ?DESC(local_status_session_recipients),
required => false
}
)},
{"recipients",
mk(
list(binary()),
#{
desc => ?DESC(local_status_recipients),
required => false
}
)},
{"stats",
mk(
ref(status_stats),
#{
desc => ?DESC(local_status_stats),
required => false
}
)}
];
%% Connection/session counters nested under "stats" of the local status.
fields(status_stats) ->
[
{"initial_connected",
mk(
non_neg_integer(),
#{
desc => ?DESC(status_stats_initial_connected),
required => true
}
)},
{"current_connected",
mk(
non_neg_integer(),
#{
desc => ?DESC(status_stats_current_connected),
required => true
}
)},
{"initial_sessions",
mk(
non_neg_integer(),
#{
desc => ?DESC(status_stats_initial_sessions),
required => true
}
)},
{"current_sessions",
mk(
non_neg_integer(),
#{
desc => ?DESC(status_stats_current_sessions),
required => true
}
)},
{"current_disconnected_sessions",
mk(
non_neg_integer(),
#{
desc => ?DESC(status_stats_current_disconnected_sessions),
required => false
}
)}
];
%% Entry of the "rebalances" list of the global status: the local status
%% fields minus the ones listed below, plus coordinator-specific fields.
fields(global_coordinator_status) ->
without(
["status", "process", "session_goal", "session_recipients", "stats"],
fields(local_status_enabled)
) ++
[
{"donors",
mk(
list(binary()),
#{
desc => ?DESC(coordinator_status_donors),
required => false
}
)},
%% NOTE(review): the coordinator computes averages with `/', i.e. as floats,
%% while the two fields below are declared non_neg_integer() -- confirm.
{"donor_conn_avg",
mk(
non_neg_integer(),
#{
desc => ?DESC(coordinator_status_donor_conn_avg),
required => false
}
)},
{"donor_sess_avg",
mk(
non_neg_integer(),
#{
desc => ?DESC(coordinator_status_donor_sess_avg),
required => false
}
)},
{"node",
mk(
binary(),
#{
desc => ?DESC(coordinator_status_node),
required => true
}
)}
];
%% Entry of the "evacuations" list of the global status: the local status
%% fields without "status" and "process", plus the node name.
fields(global_evacuation_status) ->
without(["status", "process"], fields(local_status_enabled)) ++
[
{"node",
mk(
binary(),
#{
desc => ?DESC(evacuation_status_node),
required => true
}
)}
];
%% Global status: lists of evacuation and rebalance entries.
fields(global_status) ->
[
{"evacuations",
mk(
hoconsc:array(ref(global_evacuation_status)),
#{
desc => ?DESC(global_status_evacuations),
required => false
}
)},
{"rebalances",
mk(
hoconsc:array(ref(global_coordinator_status)),
#{
desc => ?DESC(global_status_rebalances),
required => false
}
)}
].
%% Example request body for starting a rebalance, used in the swagger spec.
rebalance_example() ->
    #{
        nodes => [<<"othernode@127.0.0.1">>],
        wait_health_check => 10,
        wait_takeover => 10,
        conn_evict_rate => 10,
        sess_evict_rate => 20,
        abs_conn_threshold => 10,
        abs_sess_threshold => 10,
        rel_conn_threshold => 1.5,
        rel_sess_threshold => 1.5
    }.
%% Example request body for starting an evacuation, used in the swagger spec.
rebalance_evacuation_example() ->
    #{
        migrate_to => [<<"othernode@127.0.0.1">>],
        redirect_to => <<"othernode:1883">>,
        wait_takeover => 10,
        conn_evict_rate => 100,
        sess_evict_rate => 100
    }.
%% The local status response is either the disabled or the enabled structure.
local_status_response_schema() ->
    Disabled = ref(local_status_disabled),
    Enabled = ref(local_status_enabled),
    hoconsc:union([Disabled, Enabled]).
%% Generic map response, documented with the `empty_response' description.
response_schema() ->
    Meta = #{desc => ?DESC(empty_response)},
    mk(map(), Meta).
%% This schema module defines no configuration roots.
roots() ->
    [].

View File

@ -0,0 +1,22 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_app).
-behaviour(application).
-emqx_plugin(?MODULE).
-export([
start/2,
stop/1
]).
%% Application start: boot the supervision tree, then register the CLI command.
start(_StartType, _StartArgs) ->
    {ok, SupPid} = emqx_node_rebalance_sup:start_link(),
    ok = emqx_node_rebalance_cli:load(),
    {ok, SupPid}.
%% Application stop: unregister the CLI command. The result of the
%% unregistration is returned as is.
stop(_AppState) ->
    Result = emqx_node_rebalance_cli:unload(),
    Result.

View File

@ -0,0 +1,305 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_cli).
%% APIs
-export([
load/0,
unload/0,
cli/1
]).
%% Registers the `rebalance' command with emqx_ctl, dispatching to cli/1 of
%% this module.
load() ->
emqx_ctl:register_command(rebalance, {?MODULE, cli}, []).
%% Unregisters the `rebalance' command from emqx_ctl.
unload() ->
emqx_ctl:unregister_command(rebalance).
%% Handler of the `rebalance' CLI command.
%%
%% Supported argument lists:
%%   ["start" | Args]      - start evacuation (when `--evacuation' is present)
%%                           or rebalance; prints the outcome, returns boolean()
%%   ["node-status"]       - print the status of the local node
%%   ["node-status", Node] - print the status of the given node
%%   ["status"]            - print all evacuations/rebalances in the cluster
%%   ["stop"]              - stop the running evacuation or rebalance
%% Anything else prints the usage text.
cli(["start" | StartArgs]) ->
    case start_args(StartArgs) of
        {evacuation, Opts} ->
            start_evacuation(Opts);
        {rebalance, Opts} ->
            case emqx_node_rebalance:start(Opts) of
                ok ->
                    emqx_ctl:print("Rebalance started~n"),
                    true;
                {error, Reason} ->
                    emqx_ctl:print("Rebalance start error: ~p~n", [Reason]),
                    false
            end;
        {error, Error} ->
            emqx_ctl:print("Rebalance start error: ~s~n", [Error]),
            false
    end;
cli(["node-status", NodeStr]) ->
    case emqx_misc:safe_to_existing_atom(NodeStr, utf8) of
        {ok, Node} ->
            node_status(emqx_node_rebalance_status:local_status(Node));
        {error, _} ->
            emqx_ctl:print("Node status error: invalid node~n"),
            false
    end;
cli(["node-status"]) ->
    node_status(emqx_node_rebalance_status:local_status());
cli(["status"]) ->
    #{
        evacuations := Evacuations,
        rebalances := Rebalances
    } = emqx_node_rebalance_status:global_status(),
    print_node_statuses(
        "Node ~p: evacuation~n~s",
        fun emqx_node_rebalance_status:format_local_status/1,
        Evacuations
    ),
    print_node_statuses(
        "Node ~p: rebalance coordinator~n~s",
        fun emqx_node_rebalance_status:format_coordinator_status/1,
        Rebalances
    );
cli(["stop"]) ->
    case emqx_node_rebalance_evacuation:status() of
        {enabled, _} ->
            ok = emqx_node_rebalance_evacuation:stop(),
            emqx_ctl:print("Rebalance(evacuation) stopped~n"),
            true;
        disabled ->
            case emqx_node_rebalance:status() of
                {enabled, _} ->
                    ok = emqx_node_rebalance:stop(),
                    emqx_ctl:print("Rebalance stopped~n"),
                    true;
                disabled ->
                    emqx_ctl:print("Rebalance is already disabled~n"),
                    false
            end
    end;
cli(_) ->
    emqx_ctl:usage(
        [
            {
                "rebalance start --evacuation \\\n"
                " [--redirect-to \"Host1:Port1 Host2:Port2 ...\"] \\\n"
                " [--conn-evict-rate CountPerSec] \\\n"
                " [--migrate-to \"node1@host1 node2@host2 ...\"] \\\n"
                " [--wait-takeover Secs] \\\n"
                " [--sess-evict-rate CountPerSec]",
                "Start current node evacuation with optional server redirect to the specified servers"
            },
            {
                "rebalance start \\\n"
                " [--nodes \"node1@host1 node2@host2\"] \\\n"
                " [--wait-health-check Secs] \\\n"
                " [--conn-evict-rate ConnPerSec] \\\n"
                " [--abs-conn-threshold Count] \\\n"
                " [--rel-conn-threshold Fraction] \\\n"
                " [--wait-takeover Secs] \\\n"
                " [--sess-evict-rate CountPerSec] \\\n"
                " [--abs-sess-threshold Count] \\\n"
                " [--rel-sess-threshold Fraction]",
                "Start rebalance on the specified nodes using the current node as the coordinator"
            },
            {"rebalance node-status", "Get current node rebalance status"},
            {"rebalance node-status \"node1@host1\"", "Get remote node rebalance status"},
            {"rebalance status",
                "Get statuses of all current rebalance/evacuation processes across the cluster"},
            {"rebalance stop", "Stop node rebalance"}
        ]
    ).

%% Starts evacuation unless one is already running. A failed start (the
%% start call may return `{error, Reason}' even when the evacuation status is
%% `disabled') is reported to the user instead of crashing the command.
start_evacuation(Opts) ->
    case emqx_node_rebalance_evacuation:status() of
        disabled ->
            case emqx_node_rebalance_evacuation:start(Opts) of
                ok ->
                    emqx_ctl:print("Rebalance(evacuation) started~n"),
                    true;
                {error, Reason} ->
                    emqx_ctl:print("Rebalance(evacuation) start error: ~p~n", [Reason]),
                    false
            end;
        {enabled, _} ->
            emqx_ctl:print("Rebalance is already enabled~n"),
            false
    end.

%% Prints a separator line followed by one formatted entry per
%% `{Node, Status}' pair. `Format' receives the node and the formatted status.
print_node_statuses(Format, FormatStatus, NodeStatuses) ->
    lists:foreach(
        fun({Node, Status}) ->
            emqx_ctl:print(
                "--------------------------------------------------------------------~n"
            ),
            emqx_ctl:print(Format, [Node, FormatStatus(Status)])
        end,
        NodeStatuses
    ).
%% Prints a node status as returned by emqx_node_rebalance_status:local_status/0,1.
%% Any term other than `{evacuation | rebalance, Status}' or `disabled' is
%% printed as a detection error.
node_status({Process, Status}) when Process =:= evacuation; Process =:= rebalance ->
    Formatted = emqx_node_rebalance_status:format_local_status(Status),
    emqx_ctl:print("Rebalance type: ~p~n~s~n", [Process, Formatted]);
node_status(disabled) ->
    emqx_ctl:print("Rebalance disabled~n");
node_status(Other) ->
    emqx_ctl:print("Error detecting rebalance status: ~p~n", [Other]).
%% Parses and validates `rebalance start' arguments.
%% Returns `{evacuation, Opts}' when `--evacuation' was given,
%% `{rebalance, Opts}' otherwise, or `{error, Message}'.
start_args(Args) ->
    case collect_args(Args, #{}) of
        {error, _} = Error ->
            Error;
        {ok, #{"--evacuation" := true} = Collected} ->
            tag_validated(evacuation, validate_evacuation(maps:to_list(Collected), #{}));
        {ok, #{} = Collected} ->
            tag_validated(rebalance, validate_rebalance(maps:to_list(Collected), #{}))
    end.

%% Replaces the `ok' tag of a successful validation with the process kind.
tag_validated(Kind, {ok, Validated}) ->
    {Kind, Validated};
tag_validated(_Kind, {error, _} = Error) ->
    Error.
%% Folds the raw argument list into a map keyed by option name.
%% `--evacuation' is a bare flag stored as `true'; every other known option
%% consumes the following argument as its (still unparsed) string value.
%% Returns `{error, Message}' at the first unknown or value-less option.
collect_args([], Map) ->
    {ok, Map};
collect_args(["--evacuation" | Args], Map) ->
    collect_args(Args, Map#{"--evacuation" => true});
collect_args([Option, Value | Args] = Remaining, Map) ->
    case lists:member(Option, value_options()) of
        true ->
            collect_args(Args, Map#{Option => Value});
        false ->
            unknown_arguments(Remaining)
    end;
collect_args(Remaining, _Map) ->
    unknown_arguments(Remaining).

%% Options that take exactly one value.
value_options() ->
    [
        %% evacuation
        "--redirect-to",
        "--migrate-to",
        %% rebalance
        "--nodes",
        "--wait-health-check",
        "--abs-conn-threshold",
        "--rel-conn-threshold",
        "--abs-sess-threshold",
        "--rel-sess-threshold",
        %% common
        "--conn-evict-rate",
        "--wait-takeover",
        "--sess-evict-rate"
    ].

unknown_arguments(Remaining) ->
    {error, io_lib:format("unknown arguments: ~p", [Remaining])}.
%% Validates collected evacuation options (a list of `{OptionName, String}'
%% pairs) and converts them into the evacuation start options map.
%% Returns `{ok, Opts}' or `{error, Message}' for the first invalid option.
validate_evacuation([], Map) ->
    {ok, Map};
validate_evacuation([{"--evacuation", _} | Rest], Map) ->
    validate_evacuation(Rest, Map);
validate_evacuation([{"--redirect-to", ServerReference} | Rest], Map) ->
    validate_evacuation(Rest, Map#{server_reference => list_to_binary(ServerReference)});
validate_evacuation([{"--conn-evict-rate", _} | _] = Opts, Map) ->
    validate_pos_int(conn_evict_rate, Opts, Map, fun validate_evacuation/2);
validate_evacuation([{"--sess-evict-rate", _} | _] = Opts, Map) ->
    validate_pos_int(sess_evict_rate, Opts, Map, fun validate_evacuation/2);
validate_evacuation([{"--wait-takeover", _} | _] = Opts, Map) ->
    validate_pos_int(wait_takeover, Opts, Map, fun validate_evacuation/2);
validate_evacuation([{"--migrate-to", MigrateTo} | Rest], Map) ->
    %% Node names may be separated by commas and/or spaces.
    case strings_to_atoms(string:lexemes(MigrateTo, ", ")) of
        {_, [_ | _] = Invalid} ->
            {error, io_lib:format("invalid --migrate-to, invalid nodes: ~p", [Invalid])};
        {Nodes, []} ->
            Available = emqx_node_rebalance_evacuation:available_nodes(Nodes),
            %% Compare as sets: the availability result comes from an RPC
            %% multicall and must not be required to preserve request order.
            case {Available, Nodes -- Available} of
                {[], _} ->
                    {error, "invalid --migrate-to, no nodes"};
                {_, []} ->
                    validate_evacuation(Rest, Map#{migrate_to => Nodes});
                {_, Unavailable} ->
                    {error,
                        io_lib:format(
                            "invalid --migrate-to, unavailable nodes: ~p",
                            [Unavailable]
                        )}
            end
    end;
validate_evacuation(Rest, _Map) ->
    {error, io_lib:format("unknown evacuation arguments: ~p", [Rest])}.
%% Validates collected rebalance options (a list of `{OptionName, String}'
%% pairs) and converts them into the rebalance start options map.
%% Returns `{ok, Opts}' or `{error, Message}' for the first invalid option.
validate_rebalance([], Map) ->
    {ok, Map};
validate_rebalance([{"--wait-health-check", _} | _] = Opts, Map) ->
    validate_pos_int(wait_health_check, Opts, Map, fun validate_rebalance/2);
validate_rebalance([{"--conn-evict-rate", _} | _] = Opts, Map) ->
    validate_pos_int(conn_evict_rate, Opts, Map, fun validate_rebalance/2);
validate_rebalance([{"--sess-evict-rate", _} | _] = Opts, Map) ->
    validate_pos_int(sess_evict_rate, Opts, Map, fun validate_rebalance/2);
validate_rebalance([{"--abs-conn-threshold", _} | _] = Opts, Map) ->
    validate_pos_int(abs_conn_threshold, Opts, Map, fun validate_rebalance/2);
validate_rebalance([{"--rel-conn-threshold", _} | _] = Opts, Map) ->
    validate_fraction(rel_conn_threshold, Opts, Map, fun validate_rebalance/2);
validate_rebalance([{"--abs-sess-threshold", _} | _] = Opts, Map) ->
    validate_pos_int(abs_sess_threshold, Opts, Map, fun validate_rebalance/2);
validate_rebalance([{"--rel-sess-threshold", _} | _] = Opts, Map) ->
    validate_fraction(rel_sess_threshold, Opts, Map, fun validate_rebalance/2);
validate_rebalance([{"--wait-takeover", _} | _] = Opts, Map) ->
    validate_pos_int(wait_takeover, Opts, Map, fun validate_rebalance/2);
validate_rebalance([{"--nodes", NodeStr} | Rest], Map) ->
    %% Node names may be separated by commas and/or spaces.
    case strings_to_atoms(string:lexemes(NodeStr, ", ")) of
        {_, [_ | _] = Invalid} ->
            {error, io_lib:format("invalid --nodes, invalid nodes: ~p", [Invalid])};
        {Nodes, []} ->
            Available = emqx_node_rebalance:available_nodes(Nodes),
            %% Compare as sets so that a differently ordered availability
            %% result is not misreported as "unavailable nodes: []".
            case {Available, Nodes -- Available} of
                {[], _} ->
                    {error, "invalid --nodes, no nodes"};
                {_, []} ->
                    validate_rebalance(Rest, Map#{nodes => Nodes});
                {_, Unavailable} ->
                    {error,
                        io_lib:format(
                            "invalid --nodes, unavailable nodes: ~p",
                            [Unavailable]
                        )}
            end
    end;
validate_rebalance(Rest, _Map) ->
    {error, io_lib:format("unknown rebalance arguments: ~p", [Rest])}.
%% Validates that the head option holds a number strictly greater than 1.0,
%% stores it (always as a float) under `Name' and continues with `Next'.
%% Both float ("1.5") and integer ("2") notations are accepted; trailing
%% garbage after the number is rejected.
validate_fraction(Name, [{OptionName, Value} | Rest], Map, Next) ->
    case parse_number(Value) of
        {ok, Num} when Num > 1.0 ->
            Next(Rest, Map#{Name => Num});
        _ ->
            {error, "invalid " ++ OptionName ++ " value"}
    end.

%% Parses a whole string as a float, falling back to integer notation
%% (string:to_float/1 alone rejects input without a decimal point).
parse_number(Value) ->
    case string:to_float(Value) of
        {Float, ""} ->
            {ok, Float};
        _ ->
            case string:to_integer(Value) of
                {Int, ""} -> {ok, float(Int)};
                _ -> error
            end
    end.
%% Validates that the head option holds a strictly positive integer with no
%% trailing characters, stores it under `Name' and continues with `Next'.
validate_pos_int(Name, [{OptionName, Value} | Rest], Map, Next) ->
    case string:to_integer(Value) of
        {Parsed, []} when is_integer(Parsed), Parsed > 0 ->
            Next(Rest, Map#{Name => Parsed});
        _Invalid ->
            {error, lists:append(["invalid ", OptionName, " value"])}
    end.
%% Splits `Strings' into `{Atoms, Invalid}': strings that convert to an
%% already existing atom, and the strings that do not. Input order is kept
%% in both lists.
strings_to_atoms(Strings) ->
    Classify = fun(Str, {AtomsAcc, InvalidAcc}) ->
        case emqx_misc:safe_to_existing_atom(Str, utf8) of
            {ok, Atom} -> {[Atom | AtomsAcc], InvalidAcc};
            {error, _} -> {AtomsAcc, [Str | InvalidAcc]}
        end
    end,
    {AtomsRev, InvalidRev} = lists:foldl(Classify, {[], []}, Strings),
    {lists:reverse(AtomsRev), lists:reverse(InvalidRev)}.

View File

@ -0,0 +1,308 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_evacuation).
-include("emqx_node_rebalance.hrl").
-include_lib("emqx/include/logger.hrl").
-include_lib("emqx/include/types.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-export([
start/1,
status/0,
stop/0
]).
-export([start_link/0]).
-behaviour(gen_statem).
-export([
init/1,
callback_mode/0,
handle_event/4,
code_change/4
]).
-export([
is_node_available/0,
available_nodes/1
]).
-export_type([
start_opts/0,
start_error/0
]).
-ifdef(TEST).
-export([migrate_to/1]).
-endif.
%%--------------------------------------------------------------------
%% APIs
%%--------------------------------------------------------------------
-define(EVICT_INTERVAL_NO_NODES, 30000).
-type migrate_to() :: [node()] | undefined.
-type start_opts() :: #{
server_reference => emqx_eviction_agent:server_reference(),
conn_evict_rate => pos_integer(),
sess_evict_rate => pos_integer(),
wait_takeover => pos_integer(),
migrate_to => migrate_to()
}.
-type start_error() :: already_started | eviction_agent_busy.
-type stats() :: #{
initial_conns := non_neg_integer(),
initial_sessions := non_neg_integer(),
current_conns := non_neg_integer(),
current_sessions := non_neg_integer(),
conn_evict_rate := pos_integer(),
sess_evict_rate := pos_integer(),
server_reference := emqx_eviction_agent:server_reference(),
migrate_to := migrate_to()
}.
-type status() :: {enabled, stats()} | disabled.
%% Starts evacuation. Options missing from `StartOpts' are filled in from
%% default_opts/0; the request is served by the registered state machine.
-spec start(start_opts()) -> ok_or_error(start_error()).
start(StartOpts) ->
    gen_statem:call(?MODULE, {start, maps:merge(default_opts(), StartOpts)}).
%% Stops evacuation via a synchronous call to the registered state machine.
%% The state machine replies `{error, not_started}' when it is `disabled'.
-spec stop() -> ok_or_error(not_started).
stop() ->
gen_statem:call(?MODULE, stop).
%% Returns `disabled' or `{enabled, Stats}' via a synchronous call to the
%% registered state machine.
-spec status() -> status().
status() ->
gen_statem:call(?MODULE, status).
%% Starts the evacuation state machine, registered locally under the module name.
-spec start_link() -> startlink_ret().
start_link() ->
gen_statem:start_link({local, ?MODULE}, ?MODULE, [], []).
%% Asks every node in `Nodes' whether it is available and returns those that
%% answered with their node name. Non-atom replies (failed calls) are dropped.
-spec available_nodes(list(node())) -> list(node()).
available_nodes(Nodes) when is_list(Nodes) ->
    {Replies, _BadNodes} = emqx_node_rebalance_evacuation_proto_v1:available_nodes(Nodes),
    [Reply || Reply <- Replies, is_atom(Reply)].
%%--------------------------------------------------------------------
%% gen_statem callbacks
%%--------------------------------------------------------------------
%% All events for every state are dispatched to handle_event/4.
callback_mode() -> handle_event_function.
%% states: disabled, evicting_conns, waiting_takeover, evicting_sessions, prohibiting
%%
%% Starts in `disabled' unless persisted evacuation options are found, in
%% which case evacuation resumes from `evicting_conns'. If the eviction agent
%% cannot be enabled, the persisted state is cleared and the machine stays
%% `disabled'.
init([]) ->
    case emqx_node_rebalance_evacuation_persist:read(default_opts()) of
        none ->
            {ok, disabled, #{}};
        {ok, #{server_reference := ServerReference} = Opts} ->
            ?SLOG(warning, #{msg => "restoring_evacuation_state", opts => Opts}),
            restore_evacuation(emqx_eviction_agent:enable(?MODULE, ServerReference), Opts)
    end.

%% Continues init/1 according to the result of enabling the eviction agent.
restore_evacuation(ok, Opts) ->
    Data = init_data(#{}, Opts),
    ok = warn_enabled(),
    {ok, evicting_conns, Data, [{state_timeout, 0, evict_conns}]};
restore_evacuation({error, eviction_agent_busy}, _Opts) ->
    emqx_node_rebalance_evacuation_persist:clear(),
    {ok, disabled, #{}}.
%% gen_statem event handler for all states.
%%
%% State flow implemented by the clauses below:
%%   disabled -> evicting_conns -> waiting_takeover -> evicting_sessions -> prohibiting
%% A `stop' call from any non-disabled state returns to `disabled'.
%% Clause order matters: specific clauses precede the catch-all ones at the end.

%% start
%% Only accepted in `disabled'; enables the eviction agent, persists the
%% options and schedules the first connection eviction immediately.
handle_event(
{call, From},
{start, #{server_reference := ServerReference} = Opts},
disabled,
#{} = Data
) ->
case emqx_eviction_agent:enable(?MODULE, ServerReference) of
ok ->
NewData = init_data(Data, Opts),
ok = emqx_node_rebalance_evacuation_persist:save(Opts),
?SLOG(warning, #{
msg => "node_evacuation_started",
opts => Opts
}),
{next_state, evicting_conns, NewData, [
{state_timeout, 0, evict_conns},
{reply, From, ok}
]};
{error, eviction_agent_busy} ->
{keep_state_and_data, [{reply, From, {error, eviction_agent_busy}}]}
end;
%% Any other start request (i.e. one not matched by the clause above) is rejected.
handle_event({call, From}, {start, _Opts}, _State, #{}) ->
{keep_state_and_data, [{reply, From, {error, already_started}}]};
%% stop
handle_event({call, From}, stop, disabled, #{}) ->
{keep_state_and_data, [{reply, From, {error, not_started}}]};
%% Clears the persisted options, disables the eviction agent and drops the
%% evacuation-related keys from the state data.
handle_event({call, From}, stop, _State, Data) ->
ok = emqx_node_rebalance_evacuation_persist:clear(),
_ = emqx_eviction_agent:disable(?MODULE),
?SLOG(warning, #{msg => "node_evacuation_stopped"}),
{next_state, disabled, deinit(Data), [{reply, From, ok}]};
%% status
handle_event({call, From}, status, disabled, #{}) ->
{keep_state_and_data, [{reply, From, disabled}]};
%% Replies with selected counters/options plus the current state name and the
%% currently available migration targets (resolved at call time).
handle_event({call, From}, status, State, #{migrate_to := MigrateTo} = Data) ->
Stats = maps:with(
[
initial_conns,
current_conns,
initial_sessions,
current_sessions,
server_reference,
conn_evict_rate,
sess_evict_rate
],
Data
),
{keep_state_and_data, [
{reply, From, {enabled, Stats#{state => State, migrate_to => migrate_to(MigrateTo)}}}
]};
%% conn eviction
%% Evicts up to `ConnEvictRate' connections per tick while any remain; once
%% none are left, waits `WaitTakeover' seconds before evicting sessions.
handle_event(
state_timeout,
evict_conns,
evicting_conns,
#{
conn_evict_rate := ConnEvictRate,
wait_takeover := WaitTakeover
} = Data
) ->
case emqx_eviction_agent:status() of
{enabled, #{connections := Conns}} when Conns > 0 ->
ok = emqx_eviction_agent:evict_connections(ConnEvictRate),
?tp(debug, node_evacuation_evict_conn, #{conn_evict_rate => ConnEvictRate}),
?SLOG(
warning,
#{
msg => "node_evacuation_evict_conns",
count => Conns,
conn_evict_rate => ConnEvictRate
}
),
NewData = Data#{current_conns => Conns},
{keep_state, NewData, [{state_timeout, ?EVICT_INTERVAL, evict_conns}]};
{enabled, #{connections := 0}} ->
NewData = Data#{current_conns => 0},
?SLOG(warning, #{msg => "node_evacuation_evict_conns_done"}),
{next_state, waiting_takeover, NewData, [
{state_timeout, timer:seconds(WaitTakeover), evict_sessions}
]}
end;
%% The takeover wait has elapsed: move on to session eviction right away.
handle_event(
state_timeout,
evict_sessions,
waiting_takeover,
Data
) ->
?SLOG(warning, #{msg => "node_evacuation_waiting_takeover_done"}),
{next_state, evicting_sessions, Data, [{state_timeout, 0, evict_sessions}]};
%% session eviction
%% Evicts sessions to the available target nodes; when no target is available
%% the attempt is retried after ?EVICT_INTERVAL_NO_NODES. Once no sessions
%% remain the machine enters `prohibiting'.
handle_event(
state_timeout,
evict_sessions,
evicting_sessions,
#{
sess_evict_rate := SessEvictRate,
migrate_to := MigrateTo,
current_sessions := CurrSessCount
} = Data
) ->
case emqx_eviction_agent:status() of
{enabled, #{sessions := SessCount}} when SessCount > 0 ->
case migrate_to(MigrateTo) of
[] ->
%% NOTE(review): logs the previously recorded count (CurrSessCount),
%% not the freshly read SessCount - confirm this is intended.
?SLOG(warning, #{
msg => "no_nodes_to_evacuate_sessions", session_count => CurrSessCount
}),
{keep_state_and_data, [
{state_timeout, ?EVICT_INTERVAL_NO_NODES, evict_sessions}
]};
Nodes ->
ok = emqx_eviction_agent:evict_sessions(SessEvictRate, Nodes),
?SLOG(
warning,
#{
msg => "node_evacuation_evict_sessions",
session_count => SessCount,
session_evict_rate => SessEvictRate,
target_nodes => Nodes
}
),
NewData = Data#{current_sessions => SessCount},
{keep_state, NewData, [{state_timeout, ?EVICT_INTERVAL, evict_sessions}]}
end;
{enabled, #{sessions := 0}} ->
?tp(debug, node_evacuation_evict_sess_over, #{}),
?SLOG(warning, #{msg => "node_evacuation_evict_sessions_over"}),
NewData = Data#{current_sessions => 0},
{next_state, prohibiting, NewData}
end;
%% Catch-all clauses: unknown calls are answered with `ignored'; unknown
%% info and cast messages are logged and dropped.
handle_event({call, From}, Msg, State, Data) ->
?SLOG(warning, #{msg => "unknown_call", call => Msg, state => State, data => Data}),
{keep_state_and_data, [{reply, From, ignored}]};
handle_event(info, Msg, State, Data) ->
?SLOG(warning, #{msg => "unknown_info", info => Msg, state => State, data => Data}),
keep_state_and_data;
handle_event(cast, Msg, State, Data) ->
?SLOG(warning, #{msg => "unknown_cast", cast => Msg, state => State, data => Data}),
keep_state_and_data.
%% Code upgrade callback: state and data are carried over unchanged.
code_change(_Vsn, State, Data, _Extra) ->
{ok, State, Data}.
%%--------------------------------------------------------------------
%% internal funs
%%--------------------------------------------------------------------
%% Default evacuation options. The rate and takeover defaults come from
%% macros (presumably defined in emqx_node_rebalance.hrl - confirm).
%% `migrate_to => undefined' is resolved by migrate_to/1 to all other running nodes.
default_opts() ->
#{
server_reference => undefined,
conn_evict_rate => ?DEFAULT_CONN_EVICT_RATE,
sess_evict_rate => ?DEFAULT_SESS_EVICT_RATE,
wait_takeover => ?DEFAULT_WAIT_TAKEOVER,
migrate_to => undefined
}.
%% Builds the state data for a starting evacuation: `Opts' merged over
%% `Data0', plus initial/current connection and session counters read from
%% the eviction agent (which must already be enabled).
init_data(Data0, Opts) ->
    WithOpts = maps:merge(Data0, Opts),
    {enabled, #{connections := Conns, sessions := Sessions}} = emqx_eviction_agent:status(),
    Counters = #{
        initial_conns => Conns,
        current_conns => Conns,
        initial_sessions => Sessions,
        current_sessions => Sessions
    },
    maps:merge(WithOpts, Counters).
%% Removes the counters and all option keys (the keys of default_opts/0)
%% from the state data.
deinit(Data) ->
    CounterKeys = [initial_conns, current_conns, initial_sessions, current_sessions],
    OptionKeys = maps:keys(default_opts()),
    maps:without(OptionKeys, maps:without(CounterKeys, Data)).
%% Logs a warning and also writes a notice to standard error saying that
%% evacuation is enabled. Returns the result of io:format/3.
warn_enabled() ->
?SLOG(warning, #{msg => "node_evacuation_enabled"}),
io:format(
standard_error, "Node evacuation is enabled. The node will not receive connections.~n", []
).
%% Resolves the configured migration targets to the nodes that are currently
%% available. `undefined' means every other running node.
migrate_to(undefined) ->
    available_nodes(all_nodes());
migrate_to(Nodes) when is_list(Nodes) ->
    available_nodes(Nodes).
%% Returns node() when the local eviction agent is `disabled'; otherwise
%% crashes with badmatch. Invoked remotely through
%% emqx_node_rebalance_evacuation_proto_v1:available_nodes/1, whose non-atom
%% results are filtered out by available_nodes/1.
is_node_available() ->
disabled = emqx_eviction_agent:status(),
node().
%% All running nodes reported by mria_mnesia, excluding the local node.
all_nodes() ->
mria_mnesia:running_nodes() -- [node()].

View File

@ -0,0 +1,120 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_evacuation_persist).
-export([
save/1,
clear/0,
read/1
]).
-ifdef(TEST).
-export([evacuation_filepath/0]).
-endif.
-include("emqx_node_rebalance.hrl").
-include_lib("emqx/include/types.hrl").
%%--------------------------------------------------------------------
%% APIs
%%--------------------------------------------------------------------
%% do not persist `migrate_to`:
%% * after restart there is nothing to migrate
%% * this value may be invalid after node was offline
-type persisted_start_opts() :: #{
server_reference => emqx_eviction_agent:server_reference(),
conn_evict_rate => pos_integer(),
sess_evict_rate => pos_integer(),
wait_takeover => pos_integer()
}.
-type start_opts() :: #{
server_reference => emqx_eviction_agent:server_reference(),
conn_evict_rate => pos_integer(),
sess_evict_rate => pos_integer(),
wait_takeover => pos_integer(),
migrate_to => emqx_node_rebalance_evacuation:migrate_to()
}.
%% Persists the evacuation options (only the keys of persist_keys/0) as
%% pretty-printed JSON in the evacuation file, creating the directory if
%% needed. Fails with function_clause if the options do not pass the guard.
-spec save(persisted_start_opts()) -> ok_or_error(term()).
save(
    #{
        server_reference := ServerReference,
        conn_evict_rate := ConnEvictRate,
        sess_evict_rate := SessEvictRate,
        wait_takeover := WaitTakeover
    } = Data
) when
    (is_binary(ServerReference) orelse ServerReference =:= undefined),
    is_integer(ConnEvictRate),
    ConnEvictRate > 0,
    is_integer(SessEvictRate),
    SessEvictRate > 0,
    is_integer(WaitTakeover),
    WaitTakeover >= 0
->
    Filepath = evacuation_filepath(),
    case filelib:ensure_dir(Filepath) of
        {error, _} = Error ->
            Error;
        ok ->
            Persisted = prepare_for_encode(maps:with(persist_keys(), Data)),
            file:write_file(Filepath, emqx_json:encode(Persisted, [pretty]))
    end.
%% Removes the persisted evacuation file.
%% Idempotent: a file that is already absent counts as success, so callers
%% asserting `ok = clear()' do not crash when there is nothing to delete.
%% Any other file error is returned as `{error, Reason}'.
-spec clear() -> ok | {error, file:posix() | badarg}.
clear() ->
    case file:delete(evacuation_filepath()) of
        ok -> ok;
        {error, enoent} -> ok;
        {error, _} = Error -> Error
    end.
%% Reads persisted evacuation options.
%% Returns `none' when the file cannot be read. When the file exists but does
%% not decode to a JSON object, `DefaultOpts' are returned as-is; otherwise
%% each default is overridden by the persisted value, if present.
-spec read(start_opts()) -> {ok, start_opts()} | none.
read(DefaultOpts) ->
    case file:read_file(evacuation_filepath()) of
        {error, _} ->
            none;
        {ok, Data} ->
            {ok, decode_opts(Data, DefaultOpts)}
    end.

%% Decodes the file contents, falling back to the defaults on anything that
%% is not a JSON object.
decode_opts(Data, DefaultOpts) ->
    case emqx_json:safe_decode(Data, [return_maps]) of
        {ok, Map} when is_map(Map) ->
            map_to_opts(DefaultOpts, Map);
        _NotAMap ->
            DefaultOpts
    end.
%%--------------------------------------------------------------------
%% Internal funcs
%%--------------------------------------------------------------------
%% Option keys written to the evacuation file by save/1.
%% `migrate_to' is deliberately absent from this list.
persist_keys() ->
[
server_reference,
conn_evict_rate,
sess_evict_rate,
wait_takeover
].
%% Replaces an `undefined' server reference with `null' before JSON encoding;
%% any other input is returned unchanged.
prepare_for_encode(Data) ->
    case Data of
        #{server_reference := undefined} ->
            Data#{server_reference => null};
        _Other ->
            Data
    end.
%% Inverse of prepare_for_encode/1: turns a decoded `null' server reference
%% back into `undefined'; any other input is returned unchanged.
format_after_decode(Data) ->
    case Data of
        #{server_reference := null} ->
            Data#{server_reference => undefined};
        _Other ->
            Data
    end.
%% Builds the options map from a decoded JSON object: for every key of
%% `DefaultOpts' the value stored under the key's binary name in `Map' is
%% used, falling back to the default. Keys of `Map' that are not in
%% `DefaultOpts' are ignored.
map_to_opts(DefaultOpts, Map) ->
    PickValue = fun(Key, DefaultVal) ->
        maps:get(atom_to_binary(Key), Map, DefaultVal)
    end,
    format_after_decode(maps:map(PickValue, DefaultOpts)).
%% Path of the evacuation file: ?EVACUATION_FILENAME inside the EMQX data directory.
evacuation_filepath() ->
filename:join([emqx:data_dir(), ?EVACUATION_FILENAME]).

View File

@ -0,0 +1,238 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_status).
-export([
local_status/0,
local_status/1,
global_status/0,
format_local_status/1,
format_coordinator_status/1
]).
%% For RPC
-export([
evacuation_status/0,
rebalance_status/0
]).
%%--------------------------------------------------------------------
%% APIs
%%--------------------------------------------------------------------
%% Status of the local node. A running evacuation takes precedence;
%% otherwise the rebalance agent is consulted and, if it is enabled, the
%% status is fetched from its coordinator.
-spec local_status() -> disabled | {evacuation, map()} | {rebalance, map()}.
local_status() ->
    case emqx_node_rebalance_evacuation:status() of
        {enabled, Status} ->
            {evacuation, evacuation(Status)};
        disabled ->
            local_rebalance_status(emqx_node_rebalance_agent:status())
    end.

%% Rebalance part of local_status/0, driven by the rebalance agent status.
local_rebalance_status(disabled) ->
    disabled;
local_rebalance_status({enabled, CoordinatorPid}) ->
    case emqx_node_rebalance:status(CoordinatorPid) of
        {enabled, Status} ->
            local_rebalance(Status, node());
        disabled ->
            disabled
    end.
%% Status of the given node, obtained through emqx_node_rebalance_status_proto_v1.
-spec local_status(node()) -> disabled | {evacuation, map()} | {rebalance, map()}.
local_status(Node) ->
emqx_node_rebalance_status_proto_v1:local_status(Node).
%% Renders a local status map as iodata, listing the fields in the order
%% given by local_status_field_format_order/0.
-spec format_local_status(map()) -> iodata().
format_local_status(Status) ->
format_status(Status, local_status_field_format_order()).
%% Collects rebalance coordinator and evacuation statuses from all running
%% nodes. Only replies of the form `{Node, {enabled, Status}}' are kept;
%% everything else (disabled nodes, failed calls) is silently skipped.
-spec global_status() -> #{rebalances := [{node(), map()}], evacuations := [{node(), map()}]}.
global_status() ->
    Nodes = mria_mnesia:running_nodes(),
    {RebalanceReplies, _} = emqx_node_rebalance_status_proto_v1:rebalance_status(Nodes),
    Rebalances = [
        {Node, coordinator_rebalance(Status)}
     || {Node, {enabled, Status}} <- RebalanceReplies
    ],
    {EvacuationReplies, _} = emqx_node_rebalance_status_proto_v1:evacuation_status(Nodes),
    Evacuations = [
        {Node, evacuation(Status)}
     || {Node, {enabled, Status}} <- EvacuationReplies
    ],
    #{evacuations => Evacuations, rebalances => Rebalances}.
%% Renders a coordinator status map as iodata, listing the fields in the
%% order given by coordinator_status_field_format_order/0.
-spec format_coordinator_status(map()) -> iodata().
format_coordinator_status(Status) ->
format_status(Status, coordinator_status_field_format_order()).
%%--------------------------------------------------------------------
%% Internal functions
%%--------------------------------------------------------------------
%% Converts the evacuation state machine status into the public status map.
%% Both goals are fixed at 0; every source key is mandatory (badkey otherwise).
evacuation(Status) ->
    Get = fun(Key) -> maps:get(Key, Status) end,
    Summary = #{
        state => Get(state),
        connection_eviction_rate => Get(conn_evict_rate),
        session_eviction_rate => Get(sess_evict_rate),
        connection_goal => 0,
        session_goal => 0,
        session_recipients => Get(migrate_to)
    },
    Summary#{
        stats => #{
            initial_connected => Get(initial_conns),
            current_connected => Get(current_conns),
            initial_sessions => Get(initial_sessions),
            current_sessions => Get(current_sessions)
        }
    }.
%% A node reports a rebalance status only when it is one of the donors;
%% for any other node the result is `disabled'.
local_rebalance(#{donors := Donors} = Stats, Node) ->
    local_rebalance(lists:member(Node, Donors), Stats, Node).

local_rebalance(true, Stats, Node) ->
    {rebalance, donor_rebalance(Stats, Node)};
local_rebalance(false, _Stats, _Node) ->
    disabled.
%% Builds the status map of a donor node from the coordinator status.
%% Initial counters come from the coordinator, current counters are read from
%% the local eviction agent. The goal fields are included only when the
%% coordinator status carries the corresponding recipient averages.
donor_rebalance(Status, Node) ->
    Opts = maps:get(opts, Status),
    InitialConnCounts = maps:get(initial_conn_counts, Status),
    InitialSessCounts = maps:get(initial_sess_counts, Status),
    CurrentStats = #{
        initial_connected => maps:get(Node, InitialConnCounts),
        initial_sessions => maps:get(Node, InitialSessCounts),
        current_connected => emqx_eviction_agent:connection_count(),
        current_sessions => emqx_eviction_agent:session_count(),
        current_disconnected_sessions => emqx_eviction_agent:session_count(disconnected)
    },
    Mandatory = #{
        state => maps:get(state, Status),
        coordinator_node => maps:get(coordinator_node, Status),
        connection_eviction_rate => maps:get(conn_evict_rate, Opts),
        session_eviction_rate => maps:get(sess_evict_rate, Opts),
        recipients => maps:get(recipients, Status),
        stats => CurrentStats
    },
    %% {ResultField, SourceKey} pairs copied only if the source key is present.
    Optional = [
        {connection_goal, recipient_conn_avg},
        {disconnected_session_goal, recipient_sess_avg}
    ],
    lists:foldl(
        fun({Field, SourceKey}, Acc) ->
            case maps:find(SourceKey, Status) of
                {ok, Value} -> Acc#{Field => Value};
                error -> Acc
            end
        end,
        Mandatory,
        Optional
    ).
%% Builds the coordinator status map. The goal and donor average fields are
%% included only when the corresponding keys are present in `Status'.
coordinator_rebalance(Status) ->
    Opts = maps:get(opts, Status),
    Mandatory = #{
        state => maps:get(state, Status),
        coordinator_node => maps:get(coordinator_node, Status),
        connection_eviction_rate => maps:get(conn_evict_rate, Opts),
        session_eviction_rate => maps:get(sess_evict_rate, Opts),
        recipients => maps:get(recipients, Status),
        donors => maps:get(donors, Status)
    },
    %% {ResultField, SourceKey} pairs copied only if the source key is present.
    Optional = [
        {connection_goal, recipient_conn_avg},
        {disconnected_session_goal, recipient_sess_avg},
        {donor_conn_avg, donor_conn_avg},
        {donor_sess_avg, donor_sess_avg}
    ],
    lists:foldl(
        fun({Field, SourceKey}, Acc) ->
            case maps:find(SourceKey, Status) of
                {ok, Value} -> Acc#{Field => Value};
                error -> Acc
            end
        end,
        Mandatory,
        Optional
    ).
%% Order in which local status fields are rendered by format_local_status/1.
%% Fields absent from a given status map are skipped by format_status/2.
local_status_field_format_order() ->
[
state,
coordinator_node,
connection_eviction_rate,
session_eviction_rate,
connection_goal,
session_goal,
disconnected_session_goal,
session_recipients,
recipients,
stats
].
%% Order in which coordinator status fields are rendered by
%% format_coordinator_status/1. Fields absent from a given status map are
%% skipped by format_status/2.
coordinator_status_field_format_order() ->
[
state,
coordinator_node,
donors,
recipients,
connection_eviction_rate,
session_eviction_rate,
connection_goal,
disconnected_session_goal,
donor_conn_avg,
donor_sess_avg
].
%% Renders the fields of `Status' named in `FieldOrder', in that order.
%% Fields missing from `Status' are skipped; fields of `Status' that are not
%% listed in `FieldOrder' are not rendered.
format_status(Status, FieldOrder) ->
    Present = [
        {FieldName, maps:get(FieldName, Status)}
     || FieldName <- FieldOrder, maps:is_key(FieldName, Status)
    ],
    [format_local_status_field(Field) || Field <- Present].
%% Renders a single `{Field, Value}' status entry as one output line
%% (the `stats' entry expands to a multi-line block).
format_local_status_field({stats, Stats}) ->
    format_local_stats(Stats);
format_local_status_field({Field, Value}) ->
    io_lib:format(field_format(Field), [Value]).

%% Format string of every single-line status field.
field_format(state) ->
    "Rebalance state: ~p~n";
field_format(coordinator_node) ->
    "Coordinator node: ~p~n";
field_format(connection_eviction_rate) ->
    "Connection eviction rate: ~p connections/second~n";
field_format(session_eviction_rate) ->
    "Session eviction rate: ~p sessions/second~n";
field_format(connection_goal) ->
    "Connection goal: ~p~n";
field_format(session_goal) ->
    "Session goal: ~p~n";
field_format(disconnected_session_goal) ->
    "Disconnected session goal: ~p~n";
field_format(session_recipients) ->
    "Session recipient nodes: ~p~n";
field_format(recipients) ->
    "Recipient nodes: ~p~n";
field_format(donors) ->
    "Donor nodes: ~p~n";
field_format(donor_conn_avg) ->
    "Current average donor node connection count: ~p~n";
field_format(donor_sess_avg) ->
    "Current average donor node disconnected session count: ~p~n".
%% Renders the statistics map as a header line followed by one
%% "name: value" line per entry, in maps:to_list/1 order.
format_local_stats(Stats) ->
    Lines = [
        io_lib:format(" ~p: ~p~n", [Name, Value])
     || {Name, Value} <- maps:to_list(Stats)
    ],
    ["Channel statistics:\n" | Lines].
%% RPC target: the local evacuation status paired with the local node name.
evacuation_status() ->
{node(), emqx_node_rebalance_evacuation:status()}.
%% RPC target: the local rebalance status paired with the local node name.
rebalance_status() ->
{node(), emqx_node_rebalance:status()}.

View File

@ -0,0 +1,35 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_sup).
-behaviour(supervisor).
-export([start_link/0]).
-export([init/1]).
%% Starts the supervisor, registered locally under the module name.
start_link() ->
supervisor:start_link({local, ?MODULE}, ?MODULE, []).
%% Supervisor callback: three permanent workers under a one_for_one
%% strategy, tolerating at most 10 restarts per 3600 seconds.
init([]) ->
    SupFlags = #{strategy => one_for_one, intensity => 10, period => 3600},
    Workers = [
        emqx_node_rebalance_evacuation,
        emqx_node_rebalance_agent,
        emqx_node_rebalance
    ],
    {ok, {SupFlags, [child_spec(Mod, []) || Mod <- Workers]}}.
%% Child spec of a permanent worker started with `Mod:start_link(Args...)',
%% using the module name as the child id and a 5000 ms shutdown timeout.
child_spec(Mod, Args) ->
#{
id => Mod,
start => {Mod, start_link, Args},
restart => permanent,
shutdown => 5000,
type => worker,
modules => [Mod]
}.

View File

@ -0,0 +1,43 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_api_proto_v1).
-behaviour(emqx_bpapi).
-export([
introduced_in/0,
node_rebalance_evacuation_start/2,
node_rebalance_evacuation_stop/1,
node_rebalance_start/2,
node_rebalance_stop/1
]).
-include_lib("emqx/include/bpapi.hrl").
-include_lib("emqx/include/types.hrl").
%% emqx_bpapi callback: returns the version string "5.0.22".
introduced_in() ->
"5.0.22".
%% Calls emqx_node_rebalance_evacuation:start(Opts) on `Node' via rpc:call/4.
-spec node_rebalance_evacuation_start(node(), emqx_node_rebalance_evacuation:start_opts()) ->
emqx_rpc:badrpc() | ok_or_error(emqx_node_rebalance_evacuation:start_error()).
node_rebalance_evacuation_start(Node, #{} = Opts) ->
rpc:call(Node, emqx_node_rebalance_evacuation, start, [Opts]).
%% Calls emqx_node_rebalance_evacuation:stop() on `Node' via rpc:call/4.
-spec node_rebalance_evacuation_stop(node()) ->
emqx_rpc:badrpc() | ok_or_error(not_started).
node_rebalance_evacuation_stop(Node) ->
rpc:call(Node, emqx_node_rebalance_evacuation, stop, []).
%% Calls emqx_node_rebalance:start(Opts) on `Node' via rpc:call/4.
-spec node_rebalance_start(node(), emqx_node_rebalance:start_opts()) ->
emqx_rpc:badrpc() | ok_or_error(emqx_node_rebalance:start_error()).
node_rebalance_start(Node, Opts) ->
rpc:call(Node, emqx_node_rebalance, start, [Opts]).
%% Calls emqx_node_rebalance:stop() on `Node' via rpc:call/4.
-spec node_rebalance_stop(node()) ->
emqx_rpc:badrpc() | ok_or_error(not_started).
node_rebalance_stop(Node) ->
rpc:call(Node, emqx_node_rebalance, stop, []).

View File

@ -0,0 +1,22 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_evacuation_proto_v1).
-behaviour(emqx_bpapi).
-export([
introduced_in/0,
available_nodes/1
]).
-include_lib("emqx/include/bpapi.hrl").
%% emqx_bpapi callback: returns the version string "5.0.22".
introduced_in() ->
"5.0.22".
%% Calls emqx_node_rebalance_evacuation:is_node_available() on every node in
%% `Nodes' via rpc:multicall/4.
-spec available_nodes([node()]) -> emqx_rpc:multicall_result(node()).
available_nodes(Nodes) ->
rpc:multicall(Nodes, emqx_node_rebalance_evacuation, is_node_available, []).

View File

@ -0,0 +1,62 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_proto_v1).
-behaviour(emqx_bpapi).
-export([
introduced_in/0,
available_nodes/1,
evict_connections/2,
evict_sessions/4,
connection_counts/1,
session_counts/1,
enable_rebalance_agent/2,
disable_rebalance_agent/2,
disconnected_session_counts/1
]).
-include_lib("emqx/include/bpapi.hrl").
-include_lib("emqx/include/types.hrl").
%% emqx_bpapi callback: returns the version string associated with this
%% protocol module (presumably the release that introduced it).
introduced_in() ->
    "5.0.22".
%% Runs emqx_node_rebalance:is_node_available/0 on every node in `Nodes'
%% through rpc:multicall/4 and returns the multicall result unchanged.
-spec available_nodes([node()]) -> emqx_rpc:multicall_result(node()).
available_nodes(Nodes) ->
    rpc:multicall(
        Nodes,
        emqx_node_rebalance,
        is_node_available,
        []
    ).
%% Runs emqx_eviction_agent:evict_connections/1 with `Count' on every node in
%% `Nodes' through rpc:multicall/4 and returns the multicall result unchanged.
-spec evict_connections([node()], non_neg_integer()) ->
    emqx_rpc:multicall_result(ok_or_error(disabled)).
evict_connections(Nodes, Count) ->
    rpc:multicall(
        Nodes,
        emqx_eviction_agent,
        evict_connections,
        [Count]
    ).
%% Runs emqx_eviction_agent:evict_sessions/3 with `Count', `RecipientNodes' and
%% `ConnState' on every node in `Nodes' through rpc:multicall/4 and returns the
%% multicall result unchanged.
-spec evict_sessions([node()], non_neg_integer(), [node()], emqx_channel:conn_state()) ->
    emqx_rpc:multicall_result(ok_or_error(disabled)).
evict_sessions(Nodes, Count, RecipientNodes, ConnState) ->
    rpc:multicall(
        Nodes,
        emqx_eviction_agent,
        evict_sessions,
        [Count, RecipientNodes, ConnState]
    ).
%% Runs emqx_node_rebalance:connection_count/0 on every node in `Nodes'
%% through rpc:multicall/4 and returns the multicall result unchanged.
-spec connection_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}).
connection_counts(Nodes) ->
    rpc:multicall(
        Nodes,
        emqx_node_rebalance,
        connection_count,
        []
    ).
%% Runs emqx_node_rebalance:session_count/0 on every node in `Nodes' through
%% rpc:multicall/4 and returns the multicall result unchanged.
-spec session_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}).
session_counts(Nodes) ->
    rpc:multicall(
        Nodes,
        emqx_node_rebalance,
        session_count,
        []
    ).
%% Runs emqx_node_rebalance_agent:enable/1 with `OwnerPid' on every node in
%% `Nodes' through rpc:multicall/4 and returns the multicall result unchanged.
-spec enable_rebalance_agent([node()], pid()) ->
    emqx_rpc:multicall_result(ok_or_error(already_enabled | eviction_agent_busy)).
enable_rebalance_agent(Nodes, OwnerPid) ->
    rpc:multicall(
        Nodes,
        emqx_node_rebalance_agent,
        enable,
        [OwnerPid]
    ).
%% Runs emqx_node_rebalance_agent:disable/1 with `OwnerPid' on every node in
%% `Nodes' through rpc:multicall/4 and returns the multicall result unchanged.
-spec disable_rebalance_agent([node()], pid()) ->
    emqx_rpc:multicall_result(ok_or_error(already_disabled | invalid_coordinator)).
disable_rebalance_agent(Nodes, OwnerPid) ->
    rpc:multicall(
        Nodes,
        emqx_node_rebalance_agent,
        disable,
        [OwnerPid]
    ).
%% Runs emqx_node_rebalance:disconnected_session_count/0 on every node in
%% `Nodes' through rpc:multicall/4 and returns the multicall result unchanged.
-spec disconnected_session_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}).
disconnected_session_counts(Nodes) ->
    rpc:multicall(
        Nodes,
        emqx_node_rebalance,
        disconnected_session_count,
        []
    ).

View File

@ -0,0 +1,36 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_status_proto_v1).
-behaviour(emqx_bpapi).
-export([
introduced_in/0,
local_status/1,
rebalance_status/1,
evacuation_status/1
]).
-include_lib("emqx/include/bpapi.hrl").
-include_lib("emqx/include/types.hrl").
%% emqx_bpapi callback: returns the version string associated with this
%% protocol module (presumably the release that introduced it).
introduced_in() ->
    "5.0.22".
%% Invokes emqx_node_rebalance_status:local_status/0 on `Node' through
%% rpc:call/4 and returns the call's result unchanged.
-spec local_status(node()) ->
    emqx_rpc:badrpc() | disabled | {evacuation, map()} | {rebalance, map()}.
local_status(Node) ->
    rpc:call(
        Node,
        emqx_node_rebalance_status,
        local_status,
        []
    ).
%% Runs emqx_node_rebalance_status:rebalance_status/0 on every node in `Nodes'
%% through rpc:multicall/4 and returns the multicall result unchanged.
-spec rebalance_status([node()]) ->
    emqx_rpc:multicall_result({node(), map()}).
rebalance_status(Nodes) ->
    rpc:multicall(
        Nodes,
        emqx_node_rebalance_status,
        rebalance_status,
        []
    ).
%% Runs emqx_node_rebalance_status:evacuation_status/0 on every node in `Nodes'
%% through rpc:multicall/4 and returns the multicall result unchanged.
-spec evacuation_status([node()]) ->
    emqx_rpc:multicall_result({node(), map()}).
evacuation_status(Nodes) ->
    rpc:multicall(
        Nodes,
        emqx_node_rebalance_status,
        evacuation_status,
        []
    ).

View File

@ -0,0 +1,229 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("emqx/include/emqx.hrl").
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("emqx/include/asserts.hrl").
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-import(
emqx_eviction_agent_test_helpers,
[emqtt_connect_many/1, emqtt_connect_many/2, stop_many/1, case_specific_node_name/3]
).
-define(START_APPS, [emqx_eviction_agent, emqx_node_rebalance]).
%% Common Test callback: the test case list is produced by
%% emqx_common_test_helpers:all/1 for this module.
all() ->
    Suite = ?MODULE,
    emqx_common_test_helpers:all(Suite).
%% Suite setup: calls emqx_common_test_helpers:start_apps/1 with an empty
%% application list and passes `Config' through unchanged.
init_per_suite(Config) ->
    NoExtraApps = [],
    ok = emqx_common_test_helpers:start_apps(NoExtraApps),
    Config.
%% Suite teardown: mirror of init_per_suite/1, calling
%% emqx_common_test_helpers:stop_apps/1 with an empty application list.
end_per_suite(_Config) ->
    NoExtraApps = [],
    ok = emqx_common_test_helpers:stop_apps(NoExtraApps),
    ok.
%% Per-case setup: starts a two-node cluster (a donor paired with 2883 and a
%% recipient paired with 3883) with case-specific node names, starts a
%% snabbkaffe trace, and stores the started nodes under `cluster_nodes'.
init_per_testcase(Case, Config) ->
    DonorSpec =
        {emqx_eviction_agent_test_helpers:case_specific_node_name(?MODULE, Case, '_donor'), 2883},
    RecipientSpec =
        {
            emqx_eviction_agent_test_helpers:case_specific_node_name(?MODULE, Case, '_recipient'),
            3883
        },
    ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster(
        [DonorSpec, RecipientSpec],
        ?START_APPS
    ),
    ok = snabbkaffe:start_trace(),
    [{cluster_nodes, ClusterNodes} | Config].
%% Per-case teardown: stops snabbkaffe first, then stops the cluster that
%% init_per_testcase/2 stored under `cluster_nodes'.
end_per_testcase(_Case, Config) ->
    ok = snabbkaffe:stop(),
    ClusterNodes = ?config(cluster_nodes, Config),
    ok = emqx_eviction_agent_test_helpers:stop_cluster(ClusterNodes, ?START_APPS).
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% Connects 500 clients to the donor node, starts a rebalance from it and waits
%% for the `emqx_node_rebalance_evict_sess_over' trace event. Afterwards the
%% donor's connection and disconnected-session counts may exceed the
%% recipient's by at most 50 (the absolute thresholds given in the options).
t_rebalance(Config) ->
    process_flag(trap_exit, true),
    [{DonorNode, DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
    Clients = emqx_eviction_agent_test_helpers:emqtt_connect_many(DonorPort, 500),
    RebalanceOpts = #{
        conn_evict_rate => 10,
        sess_evict_rate => 10,
        evict_interval => 10,
        abs_conn_threshold => 50,
        abs_sess_threshold => 50,
        rel_conn_threshold => 1.0,
        rel_sess_threshold => 1.0,
        wait_health_check => 0.01,
        wait_takeover => 0.01,
        nodes => [DonorNode, RecipientNode]
    },
    ?assertWaitEvent(
        ok = rpc:call(DonorNode, emqx_node_rebalance, start, [RebalanceOpts]),
        #{?snk_kind := emqx_node_rebalance_evict_sess_over},
        10000
    ),
    %% Query the eviction agent on a given node.
    AgentCall = fun(TargetNode, Fn, FnArgs) ->
        rpc:call(TargetNode, emqx_eviction_agent, Fn, FnArgs)
    end,
    DonorConnCount = AgentCall(DonorNode, connection_count, []),
    DonorSessCount = AgentCall(DonorNode, session_count, []),
    DonorDSessCount = AgentCall(DonorNode, session_count, [disconnected]),
    RecipientConnCount = AgentCall(RecipientNode, connection_count, []),
    RecipientSessCount = AgentCall(RecipientNode, session_count, []),
    RecipientDSessCount = AgentCall(RecipientNode, session_count, [disconnected]),
    ct:pal(
        "Donor: conn=~p, sess=~p, dsess=~p",
        [DonorConnCount, DonorSessCount, DonorDSessCount]
    ),
    ct:pal(
        "Recipient: conn=~p, sess=~p, dsess=~p",
        [RecipientConnCount, RecipientSessCount, RecipientDSessCount]
    ),
    ?assert(DonorConnCount - 50 =< RecipientConnCount),
    ?assert(DonorDSessCount - 50 =< RecipientDSessCount),
    ok = emqx_eviction_agent_test_helpers:stop_many(Clients).
%% Starts a rebalance on the donor node and then stops the recipient node,
%% expecting the `emqx_node_rebalance_started' trace event within the given
%% timeout (1000). The donor must then report rebalance status `disabled'.
t_rebalance_node_crash(Config) ->
    process_flag(trap_exit, true),
    [{DonorNode, DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
    Clients = emqx_eviction_agent_test_helpers:emqtt_connect_many(DonorPort, 500),
    RebalanceOpts = #{
        conn_evict_rate => 10,
        sess_evict_rate => 10,
        evict_interval => 10,
        abs_conn_threshold => 50,
        abs_sess_threshold => 50,
        rel_conn_threshold => 1.0,
        rel_sess_threshold => 1.0,
        wait_health_check => 0.01,
        wait_takeover => 0.01,
        nodes => [DonorNode, RecipientNode]
    },
    ?assertWaitEvent(
        begin
            ok = rpc:call(DonorNode, emqx_node_rebalance, start, [RebalanceOpts]),
            emqx_common_test_helpers:stop_slave(RecipientNode)
        end,
        #{?snk_kind := emqx_node_rebalance_started},
        1000
    ),
    DonorStatus = rpc:call(DonorNode, emqx_node_rebalance, status, []),
    ?assertEqual(disabled, DonorStatus),
    ok = emqx_eviction_agent_test_helpers:stop_many(Clients).
%% Starting a rebalance must return `{error, nothing_to_balance}' both with no
%% client connections and with 50 clients connected to the donor node.
t_no_need_to_rebalance(Config) ->
    process_flag(trap_exit, true),
    [{DonorNode, DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
    RebalanceOpts = #{
        conn_evict_rate => 10,
        sess_evict_rate => 10,
        evict_interval => 10,
        abs_conn_threshold => 50,
        abs_sess_threshold => 50,
        rel_conn_threshold => 1.0,
        rel_sess_threshold => 1.0,
        wait_health_check => 0.01,
        wait_takeover => 0.01,
        nodes => [DonorNode, RecipientNode]
    },
    StartRebalance = fun() ->
        rpc:call(DonorNode, emqx_node_rebalance, start, [RebalanceOpts])
    end,
    %% No connections at all.
    ?assertEqual({error, nothing_to_balance}, StartRebalance()),
    %% 50 connections on the donor.
    Clients = emqx_eviction_agent_test_helpers:emqtt_connect_many(DonorPort, 50),
    ?assertEqual({error, nothing_to_balance}, StartRebalance()),
    ok = emqx_eviction_agent_test_helpers:stop_many(Clients).
%% Sends an unexpected info message, cast and call to the donor's
%% emqx_node_rebalance process, once before and once after a rebalance has been
%% started; the call must be answered with `ignored' both times.
t_unknown_mesages(Config) ->
    process_flag(trap_exit, true),
    [{DonorNode, DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
    Clients = emqx_eviction_agent_test_helpers:emqtt_connect_many(DonorPort, 500),
    RebalanceOpts = #{
        wait_health_check => 100,
        abs_conn_threshold => 50,
        nodes => [DonorNode, RecipientNode]
    },
    Pid = rpc:call(DonorNode, erlang, whereis, [emqx_node_rebalance]),
    PokeWithUnknown = fun() ->
        Pid ! unknown,
        ok = gen_server:cast(Pid, unknown),
        ?assertEqual(
            ignored,
            gen_server:call(Pid, unknown)
        )
    end,
    PokeWithUnknown(),
    ok = rpc:call(DonorNode, emqx_node_rebalance, start, [RebalanceOpts]),
    PokeWithUnknown(),
    ok = emqx_eviction_agent_test_helpers:stop_many(Clients).
%% emqx_node_rebalance:available_nodes/1 must leave out a node whose eviction
%% agent is already enabled.
t_available_nodes(Config) ->
    [{DonorNode, _DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
    %% Enable the eviction agent on RecipientNode so that the node is "occupied"
    %% and not available for rebalance.
    ok = rpc:call(RecipientNode, emqx_eviction_agent, enable, [test_rebalance, undefined]),
    %% Only DonorNode should be available for rebalance, since RecipientNode is "occupied".
    Candidates = [DonorNode, RecipientNode],
    Available = rpc:call(DonorNode, emqx_node_rebalance, available_nodes, [Candidates]),
    ?assertEqual([DonorNode], Available).

View File

@ -0,0 +1,214 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_agent_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("emqx/include/emqx.hrl").
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-import(
emqx_eviction_agent_test_helpers,
[case_specific_node_name/2]
).
%% Common Test callback: the suite consists of the `local' and `cluster' groups.
all() ->
    [{group, Group} || Group <- [local, cluster]].
%% Common Test callback: `local' cases run against the current node, `cluster'
%% cases get a separate node started in init_per_testcase/2.
groups() ->
    LocalCases = [
        t_enable_disable,
        t_enable_egent_busy,
        t_unknown_messages
    ],
    ClusterCases = [
        t_rebalance_agent_coordinator_fail,
        t_rebalance_agent_fail
    ],
    [
        {local, [], LocalCases},
        {cluster, [], ClusterCases}
    ].
%% Suite setup: starts emqx_eviction_agent and emqx_node_rebalance through
%% emqx_common_test_helpers:start_apps/1 and passes `Config' through unchanged.
init_per_suite(Config) ->
    Apps = [emqx_eviction_agent, emqx_node_rebalance],
    ok = emqx_common_test_helpers:start_apps(Apps),
    Config.
%% Suite teardown: stops the applications started by init_per_suite/1.
end_per_suite(_Config) ->
    Apps = [emqx_eviction_agent, emqx_node_rebalance],
    ok = emqx_common_test_helpers:stop_apps(Apps),
    ok.
%% Group setup: records in `Config' whether the group's cases need a cluster
%% (`{cluster, true}' for `cluster', `{cluster, false}' for `local').
%% Any other group name fails with function_clause.
init_per_group(Group, Config) when Group =:= local; Group =:= cluster ->
    [{cluster, Group =:= cluster} | Config].
%% Group teardown: nothing to clean up.
end_per_group(_Group, _Config) ->
    ok.
%% Per-case setup: for the `cluster' group, starts one extra node with a
%% case-specific name (paired with 2883) and stores it under `cluster_nodes';
%% for the `local' group `Config' is returned untouched.
init_per_testcase(Case, Config) ->
    case ?config(cluster, Config) of
        false ->
            Config;
        true ->
            NodeName = emqx_eviction_agent_test_helpers:case_specific_node_name(?MODULE, Case),
            ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster(
                [{NodeName, 2883}],
                [emqx_eviction_agent, emqx_node_rebalance]
            ),
            [{cluster_nodes, ClusterNodes} | Config]
    end.
%% Per-case teardown: stops the node(s) started by init_per_testcase/2 for the
%% `cluster' group; nothing to do for the `local' group.
end_per_testcase(_Case, Config) ->
    case ?config(cluster, Config) of
        false ->
            ok;
        true ->
            ClusterNodes = ?config(cluster_nodes, Config),
            emqx_eviction_agent_test_helpers:stop_cluster(
                ClusterNodes,
                [emqx_eviction_agent, emqx_node_rebalance]
            )
    end.
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% Local tests
%% Walks the local rebalance agent through its enable/disable cycle:
%% disabled -> enabled by this process (second enable is rejected) ->
%% disable by a foreign pid is rejected -> disabled by this process
%% (second disable is rejected).
t_enable_disable(_Config) ->
    Coordinator = self(),
    ?assertEqual(disabled, emqx_node_rebalance_agent:status()),
    ?assertEqual(ok, emqx_node_rebalance_agent:enable(Coordinator)),
    ?assertEqual({error, already_enabled}, emqx_node_rebalance_agent:enable(Coordinator)),
    ?assertEqual({enabled, Coordinator}, emqx_node_rebalance_agent:status()),
    %% A process other than the one that enabled the agent.
    Stranger = spawn_link(fun() -> ok end),
    ?assertEqual({error, invalid_coordinator}, emqx_node_rebalance_agent:disable(Stranger)),
    ?assertEqual(ok, emqx_node_rebalance_agent:disable(Coordinator)),
    ?assertEqual({error, already_disabled}, emqx_node_rebalance_agent:disable(Coordinator)),
    ?assertEqual(disabled, emqx_node_rebalance_agent:status()).
%% With the eviction agent already enabled under another kind
%% (`rebalance_test'), enabling the rebalance agent must return
%% `{error, eviction_agent_busy}'.
t_enable_egent_busy(_Config) ->
    ok = emqx_eviction_agent:enable(rebalance_test, undefined),
    EnableResult = emqx_node_rebalance_agent:enable(self()),
    ?assertEqual({error, eviction_agent_busy}, EnableResult),
    ok = emqx_eviction_agent:disable(rebalance_test).
%% The local rebalance agent must accept an unexpected cast and info message,
%% and answer an unexpected call with `ignored'.
t_unknown_messages(_Config) ->
    AgentPid = whereis(emqx_node_rebalance_agent),
    ok = gen_server:cast(AgentPid, unknown),
    AgentPid ! unknown,
    ignored = gen_server:call(AgentPid, unknown).
%% Cluster tests
% The following tests verify that emqx_node_rebalance_agent correctly links
% coordinator process with emqx_eviction_agent-s.
%% Enables the remote rebalance agent on behalf of a locally spawned
%% coordinator process, links the test process to the remote eviction agent,
%% kills the coordinator and expects the eviction agent's EXIT within 1000 ms.
t_rebalance_agent_coordinator_fail(Config) ->
    process_flag(trap_exit, true),
    [{Node, _}] = ?config(cluster_nodes, Config),
    %% The coordinator just blocks until told to finish (or until killed).
    CoordinatorLoop = fun() ->
        receive
            done -> ok
        end
    end,
    CoordinatorPid = spawn_link(CoordinatorLoop),
    ?assertEqual(disabled, rpc:call(Node, emqx_eviction_agent, status, [])),
    ?assertEqual(ok, rpc:call(Node, emqx_node_rebalance_agent, enable, [CoordinatorPid])),
    ?assertMatch({enabled, _}, rpc:call(Node, emqx_eviction_agent, status, [])),
    EvictionAgentPid = rpc:call(Node, erlang, whereis, [emqx_eviction_agent]),
    %% Link so that the eviction agent's termination arrives here as an 'EXIT' message.
    true = link(EvictionAgentPid),
    true = exit(CoordinatorPid, kill),
    receive
        {'EXIT', EvictionAgentPid, _} -> true
    after 1000 ->
        ct:fail("emqx_eviction_agent did not exit")
    end.
%% Enables the remote rebalance agent on behalf of a locally spawned (and
%% linked) coordinator process, kills the remote eviction agent and expects the
%% coordinator's EXIT to reach the test process within 1000 ms.
t_rebalance_agent_fail(Config) ->
    process_flag(trap_exit, true),
    [{Node, _}] = ?config(cluster_nodes, Config),
    %% The coordinator just blocks until told to finish (or until killed).
    CoordinatorPid = spawn_link(
        fun() ->
            receive
                done -> ok
            end
        end
    ),
    ?assertEqual(
        ok,
        rpc:call(Node, emqx_node_rebalance_agent, enable, [CoordinatorPid])
    ),
    EvictionAgentPid = rpc:call(Node, erlang, whereis, [emqx_eviction_agent]),
    true = exit(EvictionAgentPid, kill),
    receive
        {'EXIT', CoordinatorPid, _} -> true
    after 1000 ->
        %% The process whose exit is awaited here is the coordinator, so the
        %% failure message names it.
        ct:fail("coordinator did not exit")
    end.

View File

@ -0,0 +1,444 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_api_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-import(
emqx_mgmt_api_test_util,
[
request/2,
request/3,
uri/1
]
).
-import(
emqx_eviction_agent_test_helpers,
[emqtt_connect_many/2, stop_many/1, case_specific_node_name/3]
).
-define(START_APPS, [emqx_eviction_agent, emqx_node_rebalance]).
%% Common Test callback: the test case list is produced by
%% emqx_common_test_helpers:all/1 for this module.
all() ->
    Suite = ?MODULE,
    emqx_common_test_helpers:all(Suite).
%% Suite setup: starts the applications listed in ?START_APPS and passes
%% `Config' through unchanged.
init_per_suite(Config) ->
    Apps = ?START_APPS,
    ok = emqx_common_test_helpers:start_apps(Apps),
    Config.
%% Suite teardown: stops the applications listed in ?START_APPS.
end_per_suite(_Config) ->
    Apps = ?START_APPS,
    ok = emqx_common_test_helpers:stop_apps(Apps),
    ok.
%% Per-case setup: starts a donor/recipient cluster with a case-specific emqx
%% data_dir, runs emqx_mgmt_api_test_util:init_suite/0 on the donor, makes the
%% local HTTP test helper use the donor's auth header, and stores the started
%% nodes under `cluster_nodes'.
init_per_testcase(Case, Config) ->
    NodeSpecs = [
        {emqx_eviction_agent_test_helpers:case_specific_node_name(?MODULE, Case, '_donor'), 2883},
        {
            emqx_eviction_agent_test_helpers:case_specific_node_name(?MODULE, Case, '_recipient'),
            3883
        }
    ],
    EnvSpec = [{emqx, data_dir, case_specific_data_dir(Case, Config)}],
    ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster(
        NodeSpecs,
        ?START_APPS,
        EnvSpec
    ),
    %% The first started node is the donor.
    [{DonorNode, _} | _] = ClusterNodes,
    ok = rpc:call(DonorNode, emqx_mgmt_api_test_util, init_suite, []),
    ok = take_auth_header_from(DonorNode),
    [{cluster_nodes, ClusterNodes} | Config].
%% Per-case teardown: stops the cluster stored under `cluster_nodes'; the
%% result of stop_cluster/2 is returned as is, without being asserted.
end_per_testcase(_Case, Config) ->
    ClusterNodes = ?config(cluster_nodes, Config),
    emqx_eviction_agent_test_helpers:stop_cluster(ClusterNodes, ?START_APPS).
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% The evacuation start endpoint must answer 400 for each malformed option set
%% and for an unknown node in the path, and 200 for a well-formed request,
%% after which the donor shows up under "evacuations" in the global status.
t_start_evacuation_validation(Config) ->
    [{DonorNode, _}, {RecipientNode, _}] = ?config(cluster_nodes, Config),
    DonorNodeBin = atom_to_binary(DonorNode),
    StartPath = ["load_rebalance", atom_to_list(DonorNode), "evacuation", "start"],
    BadOpts = [
        #{conn_evict_rate => <<"conn">>},
        #{sess_evict_rate => <<"sess">>},
        #{redirect_to => 123},
        #{wait_takeover => <<"wait">>},
        #{migrate_to => []},
        #{migrate_to => <<"migrate_to">>},
        #{migrate_to => [<<"bad_node">>]},
        #{migrate_to => [<<"bad_node">>, DonorNodeBin]},
        #{unknown => <<"Value">>}
    ],
    ExpectBadRequest = fun(Opts) ->
        ?assertMatch({ok, 400, #{}}, api_post(StartPath, Opts))
    end,
    lists:foreach(ExpectBadRequest, BadOpts),
    %% Unknown node name in the URL.
    ?assertMatch(
        {ok, 400, #{}},
        api_post(["load_rebalance", "bad@node", "evacuation", "start"], #{})
    ),
    GoodOpts = #{
        conn_evict_rate => 10,
        sess_evict_rate => 10,
        wait_takeover => 10,
        redirect_to => <<"srv">>,
        migrate_to => [atom_to_binary(RecipientNode)]
    },
    ?assertMatch({ok, 200, #{}}, api_post(StartPath, GoodOpts)),
    ?assertMatch(
        {ok, 200, #{<<"evacuations">> := [#{<<"node">> := DonorNodeBin}]}},
        api_get(["load_rebalance", "global_status"])
    ).
%% The rebalance start endpoint must answer 400 for each malformed option set
%% and for an unknown node in the path, and 200 for a well-formed request made
%% while 50 clients are connected to the donor, after which the donor shows up
%% under "rebalances" in the global status.
t_start_rebalance_validation(Config) ->
    process_flag(trap_exit, true),
    [{DonorNode, DonorPort}, {RecipientNode, _}] = ?config(cluster_nodes, Config),
    DonorNodeBin = atom_to_binary(DonorNode),
    StartPath = ["load_rebalance", atom_to_list(DonorNode), "start"],
    BadOpts = [
        #{conn_evict_rate => <<"conn">>},
        #{sess_evict_rate => <<"sess">>},
        #{abs_conn_threshold => <<"act">>},
        #{rel_conn_threshold => <<"rct">>},
        #{abs_sess_threshold => <<"act">>},
        #{rel_sess_threshold => <<"rct">>},
        #{wait_takeover => <<"wait">>},
        #{wait_health_check => <<"wait">>},
        #{nodes => <<"nodes">>},
        #{nodes => []},
        #{nodes => [<<"bad_node">>]},
        #{nodes => [<<"bad_node">>, DonorNodeBin]},
        #{unknown => <<"Value">>}
    ],
    ExpectBadRequest = fun(Opts) ->
        ?assertMatch({ok, 400, #{}}, api_post(StartPath, Opts))
    end,
    lists:foreach(ExpectBadRequest, BadOpts),
    %% Unknown node name in the URL.
    ?assertMatch(
        {ok, 400, #{}},
        api_post(["load_rebalance", "bad@node", "start"], #{})
    ),
    Conns = emqx_eviction_agent_test_helpers:emqtt_connect_many(DonorPort, 50),
    GoodOpts = #{
        conn_evict_rate => 10,
        sess_evict_rate => 10,
        wait_takeover => 10,
        wait_health_check => 10,
        abs_conn_threshold => 10,
        rel_conn_threshold => 1.001,
        abs_sess_threshold => 10,
        rel_sess_threshold => 1.001,
        nodes => [
            atom_to_binary(DonorNode),
            atom_to_binary(RecipientNode)
        ]
    },
    ?assertMatch({ok, 200, #{}}, api_post(StartPath, GoodOpts)),
    ?assertMatch(
        {ok, 200, #{<<"rebalances">> := [#{<<"node">> := DonorNodeBin}]}},
        api_get(["load_rebalance", "global_status"])
    ),
    ok = emqx_eviction_agent_test_helpers:stop_many(Conns).
%% Starts an evacuation on the donor through the HTTP API, checks the local and
%% global status responses (after translation by emqx_node_rebalance_api),
%% stops the evacuation and checks that both status endpoints report nothing
%% in progress.
t_start_stop_evacuation(Config) ->
    [{DonorNode, _}, {RecipientNode, _}] = ?config(cluster_nodes, Config),
    DonorPath = atom_to_list(DonorNode),
    StartOpts = maps:merge(
        emqx_node_rebalance_api:rebalance_evacuation_example(),
        #{migrate_to => [atom_to_binary(RecipientNode)]}
    ),
    ?assertMatch(
        {ok, 200, #{}},
        api_post(["load_rebalance", DonorPath, "evacuation", "start"], StartOpts)
    ),
    %% Local status
    StatusResponse = api_get(["load_rebalance", "status"]),
    ?assertMatch({ok, 200, _}, StatusResponse),
    {ok, 200, Status} = StatusResponse,
    ?assertMatch(
        #{
            process := evacuation,
            connection_eviction_rate := 100,
            session_eviction_rate := 100,
            connection_goal := 0,
            session_goal := 0,
            stats := #{
                initial_connected := _,
                current_connected := _,
                initial_sessions := _,
                current_sessions := _
            }
        },
        emqx_node_rebalance_api:translate(local_status_enabled, Status)
    ),
    %% Global status
    DonorNodeBin = atom_to_binary(DonorNode),
    GlobalStatusResponse = api_get(["load_rebalance", "global_status"]),
    ?assertMatch({ok, 200, _}, GlobalStatusResponse),
    {ok, 200, GlobalStatus} = GlobalStatusResponse,
    ?assertMatch(
        #{
            rebalances := [],
            evacuations := [
                #{
                    node := DonorNodeBin,
                    connection_eviction_rate := 100,
                    session_eviction_rate := 100,
                    connection_goal := 0,
                    session_goal := 0,
                    stats := #{
                        initial_connected := _,
                        current_connected := _,
                        initial_sessions := _,
                        current_sessions := _
                    }
                }
            ]
        },
        emqx_node_rebalance_api:translate(global_status, GlobalStatus)
    ),
    %% Stop and verify that nothing is reported any more
    ?assertMatch(
        {ok, 200, #{}},
        api_post(["load_rebalance", DonorPath, "evacuation", "stop"], #{})
    ),
    ?assertMatch(
        {ok, 200, #{<<"status">> := <<"disabled">>}},
        api_get(["load_rebalance", "status"])
    ),
    ?assertMatch(
        {ok, 200, #{<<"evacuations">> := [], <<"rebalances">> := []}},
        api_get(["load_rebalance", "global_status"])
    ).
%% With 100 clients connected to the donor, starts a rebalance through the
%% HTTP API (example options minus `nodes'), checks the local status and the
%% global status (both raw JSON and translated), stops the rebalance and checks
%% that both status endpoints report nothing in progress.
t_start_stop_rebalance(Config) ->
    process_flag(trap_exit, true),
    [{DonorNode, DonorPort}, {RecipientNode, _}] = ?config(cluster_nodes, Config),
    DonorPath = atom_to_list(DonorNode),
    ?assertMatch(
        {ok, 200, #{<<"status">> := <<"disabled">>}},
        api_get(["load_rebalance", "status"])
    ),
    Conns = emqx_eviction_agent_test_helpers:emqtt_connect_many(DonorPort, 100),
    StartOpts = maps:without(
        [nodes],
        emqx_node_rebalance_api:rebalance_example()
    ),
    ?assertMatch(
        {ok, 200, #{}},
        api_post(["load_rebalance", DonorPath, "start"], StartOpts)
    ),
    %% Local status
    StatusResponse = api_get(["load_rebalance", "status"]),
    ?assertMatch({ok, 200, _}, StatusResponse),
    {ok, 200, Status} = StatusResponse,
    ?assertMatch(
        #{process := rebalance, connection_eviction_rate := 10, session_eviction_rate := 20},
        emqx_node_rebalance_api:translate(local_status_enabled, Status)
    ),
    %% Global status
    DonorNodeBin = atom_to_binary(DonorNode),
    RecipientNodeBin = atom_to_binary(RecipientNode),
    GlobalStatusResponse = api_get(["load_rebalance", "global_status"]),
    ?assertMatch({ok, 200, _}, GlobalStatusResponse),
    {ok, 200, GlobalStatus} = GlobalStatusResponse,
    %% Raw JSON form, fetched once more.
    ?assertMatch(
        {ok, 200, #{
            <<"evacuations">> := [],
            <<"rebalances">> :=
                [
                    #{
                        <<"state">> := _,
                        <<"node">> := DonorNodeBin,
                        <<"coordinator_node">> := _,
                        <<"connection_eviction_rate">> := 10,
                        <<"session_eviction_rate">> := 20,
                        <<"donors">> := [DonorNodeBin],
                        <<"recipients">> := [RecipientNodeBin]
                    }
                ]
        }},
        api_get(["load_rebalance", "global_status"])
    ),
    %% Translated form of the first global status response.
    ?assertMatch(
        #{
            evacuations := [],
            rebalances := [
                #{
                    state := _,
                    node := DonorNodeBin,
                    coordinator_node := _,
                    connection_eviction_rate := 10,
                    session_eviction_rate := 20,
                    donors := [DonorNodeBin],
                    recipients := [RecipientNodeBin]
                }
            ]
        },
        emqx_node_rebalance_api:translate(global_status, GlobalStatus)
    ),
    %% Stop and verify that nothing is reported any more
    ?assertMatch(
        {ok, 200, #{}},
        api_post(["load_rebalance", DonorPath, "stop"], #{})
    ),
    ?assertMatch(
        {ok, 200, #{<<"status">> := <<"disabled">>}},
        api_get(["load_rebalance", "status"])
    ),
    ?assertMatch(
        {ok, 200, #{<<"evacuations">> := [], <<"rebalances">> := []}},
        api_get(["load_rebalance", "global_status"])
    ),
    ok = emqx_eviction_agent_test_helpers:stop_many(Conns).
%% The availability_check endpoint must answer 200 while no evacuation runs on
%% the donor, 503 while one is running, and 200 again after it is stopped.
t_availability_check(Config) ->
    [{DonorNode, _} | _] = ?config(cluster_nodes, Config),
    CheckPath = ["load_rebalance", "availability_check"],
    ?assertMatch({ok, 200, #{}}, api_get(CheckPath)),
    ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [#{}]),
    ?assertMatch({ok, 503, _}, api_get(CheckPath)),
    ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, stop, []),
    ?assertMatch({ok, 200, #{}}, api_get(CheckPath)).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% Issues a GET request for `Path' through emqx_mgmt_api_test_util and, on
%% success, returns `{ok, Code, Body}' with the body decoded from JSON into
%% maps. An `{error, _}' result is returned unchanged.
api_get(Path) ->
    Url = emqx_mgmt_api_test_util:uri(Path),
    case emqx_mgmt_api_test_util:request(get, Url) of
        {error, _} = Error ->
            Error;
        {ok, Code, ResponseBody} ->
            Decoded = jiffy:decode(ResponseBody, [return_maps]),
            {ok, Code, Decoded}
    end.
%% Issues a POST request with `Data' for `Path' through
%% emqx_mgmt_api_test_util and, on success, returns `{ok, Code, Body}' with the
%% body decoded from JSON into maps. An `{error, _}' result is returned
%% unchanged.
api_post(Path, Data) ->
    Url = emqx_mgmt_api_test_util:uri(Path),
    case emqx_mgmt_api_test_util:request(post, Url, Data) of
        {error, _} = Error ->
            Error;
        {ok, Code, ResponseBody} ->
            Decoded = jiffy:decode(ResponseBody, [return_maps]),
            {ok, Code, Decoded}
    end.
%% Mocks emqx_common_test_http (passthrough) so that default_auth_header/0
%% returns whatever the same function returns on `Node' when called over RPC.
take_auth_header_from(Node) ->
    RemoteAuthHeader = fun() ->
        rpc:call(Node, emqx_common_test_http, default_auth_header, [])
    end,
    meck:new(emqx_common_test_http, [passthrough]),
    meck:expect(emqx_common_test_http, default_auth_header, RemoteAuthHeader),
    ok.
%% Returns `<priv_dir>/<Case>' when `priv_dir' is present in `Config',
%% otherwise `undefined'.
case_specific_data_dir(Case, Config) ->
    PrivDir = ?config(priv_dir, Config),
    case PrivDir of
        undefined ->
            undefined;
        _ ->
            filename:join(PrivDir, atom_to_list(Case))
    end.

View File

@ -0,0 +1,291 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_cli_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-import(
emqx_eviction_agent_test_helpers,
[emqtt_connect_many/2, stop_many/1, case_specific_node_name/3]
).
-define(START_APPS, [emqx_eviction_agent, emqx_node_rebalance]).
%% Common Test callback: the test case list is produced by
%% emqx_common_test_helpers:all/1 for this module.
all() ->
    Suite = ?MODULE,
    emqx_common_test_helpers:all(Suite).
%% Suite setup: starts the applications listed in ?START_APPS (the result is
%% not asserted) and passes `Config' through unchanged.
init_per_suite(Config) ->
    Apps = ?START_APPS,
    emqx_common_test_helpers:start_apps(Apps),
    Config.
%% Suite teardown: stops the ?START_APPS applications in reverse order and
%% returns `Config'.
end_per_suite(Config) ->
    AppsInStopOrder = lists:reverse(?START_APPS),
    emqx_common_test_helpers:stop_apps(AppsInStopOrder),
    Config.
%% Per-case setup.
%% t_rebalance: stops any local evacuation, then starts a donor/recipient
%% cluster and stores it under `cluster_nodes'.
%% Any other case: stops any local evacuation and rebalance (results ignored).
init_per_testcase(t_rebalance = Case, Config) ->
    _ = emqx_node_rebalance_evacuation:stop(),
    NodeSpecs = [
        {emqx_eviction_agent_test_helpers:case_specific_node_name(?MODULE, Case, '_donor'), 2883},
        {
            emqx_eviction_agent_test_helpers:case_specific_node_name(?MODULE, Case, '_recipient'),
            3883
        }
    ],
    ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster(NodeSpecs, ?START_APPS),
    [{cluster_nodes, ClusterNodes} | Config];
init_per_testcase(_Case, Config) ->
    _ = emqx_node_rebalance_evacuation:stop(),
    _ = emqx_node_rebalance:stop(),
    Config.
%% Per-case teardown: stops any local evacuation and rebalance (results
%% ignored); for t_rebalance the cluster stored under `cluster_nodes' is
%% stopped as well.
end_per_testcase(t_rebalance, Config) ->
    _ = emqx_node_rebalance_evacuation:stop(),
    _ = emqx_node_rebalance:stop(),
    ClusterNodes = ?config(cluster_nodes, Config),
    emqx_eviction_agent_test_helpers:stop_cluster(ClusterNodes, ?START_APPS);
end_per_testcase(_Case, _Config) ->
    _ = emqx_node_rebalance_evacuation:stop(),
    emqx_node_rebalance:stop().
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% Exercises the evacuation part of the CLI on the local node: usage and status
%% commands return `ok', malformed `start --evacuation' invocations return
%% false, a well-formed one returns true and enables evacuation, a second
%% start is rejected, and `stop' succeeds once and then returns false.
t_evacuation(_Config) ->
    ThisNode = atom_to_list(node()),
    %% usage
    ok = emqx_node_rebalance_cli:cli(["foobar"]),
    %% status
    ok = emqx_node_rebalance_cli:cli(["status"]),
    ok = emqx_node_rebalance_cli:cli(["node-status"]),
    ok = emqx_node_rebalance_cli:cli(["node-status", ThisNode]),
    %% start with invalid args
    InvalidStartArgs = [
        ["--foo-bar"],
        ["--conn-evict-rate", "foobar"],
        ["--sess-evict-rate", "foobar"],
        ["--wait-takeover", "foobar"],
        ["--migrate-to", "nonexistent@node"],
        ["--migrate-to", ""],
        ["--unknown-arg"]
    ],
    lists:foreach(
        fun(Args) ->
            ?assertNot(
                emqx_node_rebalance_cli:cli(["start", "--evacuation" | Args])
            )
        end,
        InvalidStartArgs
    ),
    %% start with valid args
    ValidStartArgs = [
        "start",
        "--evacuation",
        "--conn-evict-rate",
        "10",
        "--sess-evict-rate",
        "10",
        "--wait-takeover",
        "10",
        "--migrate-to",
        ThisNode,
        "--redirect-to",
        "srv"
    ],
    ?assert(emqx_node_rebalance_cli:cli(ValidStartArgs)),
    %% status
    ok = emqx_node_rebalance_cli:cli(["status"]),
    ok = emqx_node_rebalance_cli:cli(["node-status"]),
    ok = emqx_node_rebalance_cli:cli(["node-status", ThisNode]),
    ?assertMatch(
        {enabled, #{}},
        emqx_node_rebalance_evacuation:status()
    ),
    %% already enabled
    ?assertNot(
        emqx_node_rebalance_cli:cli([
            "start",
            "--evacuation",
            "--conn-evict-rate",
            "10",
            "--redirect-to",
            "srv"
        ])
    ),
    %% stop
    true = emqx_node_rebalance_cli:cli(["stop"]),
    false = emqx_node_rebalance_cli:cli(["stop"]),
    ?assertEqual(
        disabled,
        emqx_node_rebalance_evacuation:status()
    ).
%% Exercises the rebalance part of the CLI on the donor node over RPC:
%% malformed `start' invocations return false, a well-formed one (with 20
%% clients connected to the donor) returns true and enables rebalance, status
%% commands return `ok', a second start is rejected, and `stop' succeeds once
%% and then returns false.
t_rebalance(Config) ->
    process_flag(trap_exit, true),
    [{DonorNode, DonorPort}, {RecipientNode, _}] = ?config(cluster_nodes, Config),
    DonorName = atom_to_list(DonorNode),
    RecipientName = atom_to_list(RecipientNode),
    %% start with invalid args
    InvalidStartArgs = [
        ["--foo-bar"],
        ["--conn-evict-rate", "foobar"],
        ["--abs-conn-threshold", "foobar"],
        ["--rel-conn-threshold", "foobar"],
        ["--sess-evict-rate", "foobar"],
        ["--abs-sess-threshold", "foobar"],
        ["--rel-sess-threshold", "foobar"],
        ["--wait-takeover", "foobar"],
        ["--wait-health-check", "foobar"],
        ["--nodes", "nonexistent@node"],
        ["--nodes", ""],
        ["--nodes", RecipientName],
        ["--unknown-arg"]
    ],
    lists:foreach(
        fun(Args) ->
            ?assertNot(
                emqx_node_rebalance_cli(DonorNode, ["start" | Args])
            )
        end,
        InvalidStartArgs
    ),
    Conns = emqx_eviction_agent_test_helpers:emqtt_connect_many(DonorPort, 20),
    %% start with valid args
    ValidStartArgs = [
        "start",
        "--conn-evict-rate",
        "10",
        "--abs-conn-threshold",
        "10",
        "--rel-conn-threshold",
        "1.1",
        "--sess-evict-rate",
        "10",
        "--abs-sess-threshold",
        "10",
        "--rel-sess-threshold",
        "1.1",
        "--wait-takeover",
        "10",
        "--nodes",
        DonorName ++ "," ++ RecipientName
    ],
    ?assert(emqx_node_rebalance_cli(DonorNode, ValidStartArgs)),
    %% status
    ok = emqx_node_rebalance_cli(DonorNode, ["status"]),
    ok = emqx_node_rebalance_cli(DonorNode, ["node-status"]),
    ok = emqx_node_rebalance_cli(DonorNode, ["node-status", DonorName]),
    ?assertMatch(
        {enabled, #{}},
        rpc:call(DonorNode, emqx_node_rebalance, status, [])
    ),
    %% already enabled
    ?assertNot(
        emqx_node_rebalance_cli(DonorNode, ["start"])
    ),
    %% stop
    true = emqx_node_rebalance_cli(DonorNode, ["stop"]),
    false = emqx_node_rebalance_cli(DonorNode, ["stop"]),
    ?assertEqual(
        disabled,
        rpc:call(DonorNode, emqx_node_rebalance, status, [])
    ),
    ok = emqx_eviction_agent_test_helpers:stop_many(Conns).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% Runs emqx_node_rebalance_cli:cli/1 with `Args' on `Node' over RPC.
%% A `{badrpc, Reason}' result is turned into `error(Reason)'; any other
%% result is returned as is.
emqx_node_rebalance_cli(Node, Args) ->
    Result = rpc:call(Node, emqx_node_rebalance_cli, cli, [Args]),
    case Result of
        {badrpc, Reason} ->
            error(Reason);
        _ ->
            Result
    end.

View File

@ -0,0 +1,271 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_evacuation_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("emqx/include/asserts.hrl").
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-import(
emqx_eviction_agent_test_helpers,
[emqtt_connect/1, emqtt_try_connect/1, case_specific_node_name/3]
).
%% Common Test entry point: the suite is made of the `one_node' and
%% `two_node' groups declared in groups/0.
all() ->
    [{group, Group} || Group <- [one_node, two_node]].
%% Common Test group definitions. Both groups use an empty property list;
%% the group name is what init_per_group/2 stores as `cluster_type'.
groups() ->
    OneNodeCases = [
        t_agent_busy,
        t_already_started,
        t_not_started,
        t_start,
        t_persistence,
        t_unknown_messages
    ],
    TwoNodeCases = [
        t_conn_evicted,
        t_migrate_to,
        t_session_evicted
    ],
    [
        {one_node, [], OneNodeCases},
        {two_node, [], TwoNodeCases}
    ].
%% Suite setup: calls emqx_common_test_helpers:start_apps/1 with an empty
%% application list, requires it to return `ok', and hands Config back
%% unchanged.
init_per_suite(Config) ->
    NoExtraApps = [],
    ok = emqx_common_test_helpers:start_apps(NoExtraApps),
    Config.
%% Suite teardown: mirrors init_per_suite/1 by calling
%% emqx_common_test_helpers:stop_apps/1 with an empty list. The successful
%% match itself evaluates to `ok', which is the return value.
end_per_suite(_Config) ->
    ok = emqx_common_test_helpers:stop_apps([]).
%% Records the group name as `cluster_type' in Config so that
%% init_per_testcase/2 and migrate_to/1 can tell how many nodes to expect.
%% Only the two known groups are accepted; any other name is a
%% function_clause error.
init_per_group(Group, Config) when Group =:= one_node; Group =:= two_node ->
    [{cluster_type, Group} | Config].
%% Groups hold no resources of their own, so there is nothing to tear down.
end_per_group(_GroupName, _GroupConfig) ->
    ok.
%% Per-test-case setup: starts a fresh cluster for the case and begins a
%% snabbkaffe trace.
%%
%% The cluster has one node (name suffix '_evacuated', port 2883) for the
%% `one_node' group, plus a second one (suffix '_recipient', port 3883) for
%% the `two_node' group. Node names are derived from the suite module and the
%% test case name. The value returned by start_cluster/3 is stored in Config
%% under `cluster_nodes'; the test cases destructure it as a list of
%% `{Node, Port}' pairs.
init_per_testcase(Case, Config) ->
    NodeSpecs =
        case ?config(cluster_type, Config) of
            one_node -> [{'_evacuated', 2883}];
            two_node -> [{'_evacuated', 2883}, {'_recipient', 3883}]
        end,
    NodesWithPorts = [
        {case_specific_node_name(?MODULE, Case, Suffix), Port}
     || {Suffix, Port} <- NodeSpecs
    ],
    Apps = [emqx_eviction_agent, emqx_node_rebalance],
    %% Each test case gets its own emqx data_dir (see case_specific_data_dir/2).
    Env = [{emqx, data_dir, case_specific_data_dir(Case, Config)}],
    ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster(NodesWithPorts, Apps, Env),
    ok = snabbkaffe:start_trace(),
    [{cluster_nodes, ClusterNodes} | Config].
%% Per-test-case teardown: stops snabbkaffe first, then stops the cluster
%% that init_per_testcase/2 stored under `cluster_nodes', passing the same
%% application list that was used to start it.
end_per_testcase(_Case, Config) ->
    ok = snabbkaffe:stop(),
    Nodes = ?config(cluster_nodes, Config),
    Apps = [emqx_eviction_agent, emqx_node_rebalance],
    ok = emqx_eviction_agent_test_helpers:stop_cluster(Nodes, Apps).
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% One node tests
%% Enables the eviction agent on the node directly (arguments
%% `other_rebalance' and `undefined') and then checks that starting an
%% evacuation is refused with `{error, eviction_agent_busy}'.
t_agent_busy(Config) ->
    [{Node, _Port}] = ?config(cluster_nodes, Config),
    ok = rpc:call(Node, emqx_eviction_agent, enable, [other_rebalance, undefined]),
    StartResult = rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)]),
    ?assertEqual({error, eviction_agent_busy}, StartResult).
%% Starting the evacuation a second time with the same options must be
%% rejected with `{error, already_started}'.
t_already_started(Config) ->
    [{Node, _Port}] = ?config(cluster_nodes, Config),
    ok = rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)]),
    SecondStart = rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)]),
    ?assertEqual({error, already_started}, SecondStart).
%% Stopping an evacuation that was never started must report
%% `{error, not_started}'.
t_not_started(Config) ->
    [{Node, _Port}] = ?config(cluster_nodes, Config),
    StopResult = rpc:call(Node, emqx_node_rebalance_evacuation, stop, []),
    ?assertEqual({error, not_started}, StopResult).
%% After the evacuation is started, a new connection attempt to the node's
%% port must fail with a `use_another_server' error.
t_start(Config) ->
    %% NOTE(review): presumably needed because the emqtt client process is
    %% linked to the test process and exits on a refused connect -- confirm.
    process_flag(trap_exit, true),
    [{Node, Port}] = ?config(cluster_nodes, Config),
    ok = rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)]),
    ConnectResult = emqtt_try_connect([{port, Port}]),
    ?assertMatch({error, {use_another_server, #{}}}, ConnectResult).
%% Checks that the evacuation survives a restart of its own process:
%% the `emqx_node_rebalance_evacuation' child of `emqx_node_rebalance_sup'
%% is terminated and restarted through the supervisor, after which new
%% connections are still refused and status/0 still reports `enabled' with
%% the `conn_evict_rate' taken from opts/1.
t_persistence(Config) ->
    process_flag(trap_exit, true),
    [{Node, Port}] = ?config(cluster_nodes, Config),
    ok = rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)]),
    ?assertMatch(
        {error, {use_another_server, #{}}},
        emqtt_try_connect([{port, Port}])
    ),
    %% Same [SupRef, ChildId] argument list for both supervisor calls.
    SupChild = [emqx_node_rebalance_sup, emqx_node_rebalance_evacuation],
    ok = rpc:call(Node, supervisor, terminate_child, SupChild),
    {ok, _} = rpc:call(Node, supervisor, restart_child, SupChild),
    ?assertMatch(
        {error, {use_another_server, #{}}},
        emqtt_try_connect([{port, Port}])
    ),
    Status = rpc:call(Node, emqx_node_rebalance_evacuation, status, []),
    ?assertMatch({enabled, #{conn_evict_rate := 10}}, Status).
%% Sends the evacuation server an unexpected raw message, cast and call.
%% The call must be answered with `ignored'; getting that reply also shows
%% the process is still serving requests after the earlier raw message and
%% cast.
t_unknown_messages(Config) ->
    process_flag(trap_exit, true),
    [{Node, _Port}] = ?config(cluster_nodes, Config),
    ok = rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)]),
    Server = rpc:call(Node, erlang, whereis, [emqx_node_rebalance_evacuation]),
    Server ! unknown,
    ok = gen_server:cast(Server, unknown),
    CallReply = gen_server:call(Server, unknown),
    ?assertEqual(ignored, CallReply).
%% Two node tests
%% Connects a client to the donor node, starts the evacuation and checks
%% three things:
%%   1. a `node_evacuation_evict_conn' trace event is seen within 1000 ms
%%      of starting the evacuation;
%%   2. a new connection attempt is refused with `use_another_server';
%%   3. the already connected client exits with a `disconnected' reason
%%      carrying the "use another server" reason code.
t_conn_evicted(Config) ->
    %% The receive below waits for the client's 'EXIT' message, so exits
    %% must be trapped.
    process_flag(trap_exit, true),
    [{DonorNode, DonorPort}, _] = ?config(cluster_nodes, Config),
    {ok, C} = emqtt_connect([{clientid, <<"evacuated">>}, {port, DonorPort}]),
    ?assertWaitEvent(
        ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [opts(Config)]),
        #{?snk_kind := node_evacuation_evict_conn},
        1000
    ),
    ?assertMatch(
        {error, {use_another_server, #{}}},
        emqtt_try_connect([{clientid, <<"connecting">>}, {port, DonorPort}])
    ),
    receive
        %% Named reason code instead of the literal 156, matching the
        %% pattern t_session_evicted/1 uses for the same disconnect.
        {'EXIT', C, {disconnected, ?RC_USE_ANOTHER_SERVER, _}} -> ok
    after 1000 ->
        ct:fail("Connection not evicted")
    end.
%% Exercises emqx_node_rebalance_evacuation:migrate_to/1 on the donor node:
%%   * with `undefined' it returns the recipient node;
%%   * with a list naming only an unknown node it returns [];
%%   * once the eviction agent has been enabled on the recipient, `undefined'
%%     yields [] as well.
t_migrate_to(Config) ->
    [{Donor, _DonorPort}, {Recipient, _RecipientPort}] = ?config(cluster_nodes, Config),
    MigrateTo = fun(Requested) ->
        rpc:call(Donor, emqx_node_rebalance_evacuation, migrate_to, [Requested])
    end,
    ?assertEqual([Recipient], MigrateTo(undefined)),
    ?assertEqual([], MigrateTo(['unknown@node'])),
    ok = rpc:call(Recipient, emqx_eviction_agent, enable, [test_rebalance, undefined]),
    ?assertEqual([], MigrateTo(undefined)).
%% Connects a client with `clean_start = false' to the donor node, starts the
%% evacuation and waits (up to 5000 ms) for the
%% `node_evacuation_evict_sess_over' trace event. Afterwards the client must
%% have exited with the "use another server" reason code, and the only
%% channel registered for its client id must live on the recipient node.
t_session_evicted(Config) ->
    process_flag(trap_exit, true),
    [{Donor, DonorPort}, {Recipient, _RecipientPort}] = ?config(cluster_nodes, Config),
    ClientId = <<"client_with_sess">>,
    {ok, Client} = emqtt_connect([
        {port, DonorPort}, {clientid, ClientId}, {clean_start, false}
    ]),
    ?assertWaitEvent(
        ok = rpc:call(Donor, emqx_node_rebalance_evacuation, start, [opts(Config)]),
        #{?snk_kind := node_evacuation_evict_sess_over},
        5000
    ),
    receive
        {'EXIT', Client, {disconnected, ?RC_USE_ANOTHER_SERVER, _}} -> ok
    after 1000 ->
        ct:fail("Connection not evicted")
    end,
    [ChannelPid] = rpc:call(Donor, emqx_cm_registry, lookup_channels, [ClientId]),
    ?assertEqual(Recipient, node(ChannelPid)).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% Evacuation options shared by all test cases. Only `migrate_to' depends on
%% Config (see migrate_to/1); the other values are fixed.
opts(Config) ->
    Targets = migrate_to(Config),
    #{
        server_reference => <<"srv">>,
        conn_evict_rate => 10,
        sess_evict_rate => 10,
        wait_takeover => 1,
        migrate_to => Targets
    }.
%% Migration targets for opts/1: the second node of the cluster in the
%% `two_node' group, an empty list in the `one_node' group.
migrate_to(Config) ->
    case ?config(cluster_type, Config) of
        two_node ->
            [_Donor, {Recipient, _Port}] = ?config(cluster_nodes, Config),
            [Recipient];
        one_node ->
            []
    end.
%% Returns a directory named after the test case inside Common Test's
%% `priv_dir', or `undefined' when Config carries no `priv_dir'.
case_specific_data_dir(Case, Config) ->
    PrivDir = ?config(priv_dir, Config),
    case PrivDir =:= undefined of
        true -> undefined;
        false -> filename:join(PrivDir, atom_to_list(Case))
    end.

View File

@ -0,0 +1,108 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_evacuation_persist_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
%% Common Test entry point: the list of test cases is produced by
%% emqx_common_test_helpers:all/1 for this suite module.
all() ->
    Suite = ?MODULE,
    emqx_common_test_helpers:all(Suite).
%% No suite-wide setup is needed; the configuration is returned untouched.
init_per_suite(SuiteConfig) ->
    SuiteConfig.
%% No suite-wide teardown is needed.
end_per_suite(_SuiteConfig) ->
    ok.
%% Clears any persisted evacuation state before each test case.
%% The result of clear/0 is deliberately ignored.
init_per_testcase(_Case, Config) ->
    _Ignored = emqx_node_rebalance_evacuation_persist:clear(),
    Config.
%% Clears any persisted evacuation state left behind by the test case.
%% Whatever clear/0 returns is also the return value of this callback.
end_per_testcase(_Case, _Config) ->
    emqx_node_rebalance_evacuation_persist:clear().
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% Round trip through save/1 and read/1: what was saved must be read back
%% exactly, and none of the (deliberately different) default values may leak
%% into the result. The second round checks that a `server_reference' of
%% `undefined' is preserved too.
t_save_read(_Config) ->
    Defaults = #{
        server_reference => <<"default_ref">>,
        conn_evict_rate => 2001,
        sess_evict_rate => 2002,
        wait_takeover => 2003
    },
    Saved = #{
        server_reference => <<"ref">>,
        conn_evict_rate => 1001,
        sess_evict_rate => 1002,
        wait_takeover => 1003
    },
    ok = emqx_node_rebalance_evacuation_persist:save(Saved),
    {ok, Loaded} = emqx_node_rebalance_evacuation_persist:read(Defaults),
    ?assertEqual(Saved, Loaded),
    SavedNoRef = Saved#{server_reference => undefined},
    ok = emqx_node_rebalance_evacuation_persist:save(SavedNoRef),
    {ok, LoadedNoRef} = emqx_node_rebalance_evacuation_persist:read(Defaults),
    ?assertEqual(SavedNoRef, LoadedNoRef).
%% When the evacuation file holds an empty JSON object, read/1 must return
%% exactly the defaults it was given.
t_read_default(_Config) ->
    Defaults = #{
        server_reference => <<"ref">>,
        conn_evict_rate => 1001,
        sess_evict_rate => 1002,
        wait_takeover => 1003
    },
    ok = write_evacuation_file(<<"{}">>),
    {ok, Loaded} = emqx_node_rebalance_evacuation_persist:read(Defaults),
    ?assertEqual(Defaults, Loaded).
%% When the evacuation file holds malformed JSON, read/1 must still succeed
%% and return exactly the defaults it was given.
t_read_bad_data(_Config) ->
    Defaults = #{
        server_reference => <<"ref">>,
        conn_evict_rate => 1001,
        sess_evict_rate => 1002,
        wait_takeover => 1003
    },
    ok = write_evacuation_file(<<"{bad json">>),
    {ok, Loaded} = emqx_node_rebalance_evacuation_persist:read(Defaults),
    ?assertEqual(Defaults, Loaded).
%% With an evacuation file present read/1 returns `{ok, _}'; after clear/0
%% it must return `none'.
t_clear(_Config) ->
    ok = write_evacuation_file(<<"{}">>),
    BeforeClear = emqx_node_rebalance_evacuation_persist:read(#{}),
    ?assertMatch({ok, _}, BeforeClear),
    ok = emqx_node_rebalance_evacuation_persist:clear(),
    AfterClear = emqx_node_rebalance_evacuation_persist:read(#{}),
    ?assertEqual(none, AfterClear).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% Writes `Json' verbatim to the path reported by
%% emqx_node_rebalance_evacuation_persist:evacuation_filepath/0, creating
%% the parent directory first. Lets tests plant arbitrary (including
%% invalid) file contents without going through save/1.
write_evacuation_file(Json) ->
    Path = emqx_node_rebalance_evacuation_persist:evacuation_filepath(),
    ok = filelib:ensure_dir(Path),
    ok = file:write_file(Path, Json).

View File

@ -402,7 +402,9 @@ defmodule EMQXUmbrella.MixProject do
emqx_oracle: :permanent,
emqx_bridge_oracle: :permanent,
emqx_bridge_rabbitmq: :permanent,
emqx_ee_schema_registry: :permanent
emqx_ee_schema_registry: :permanent,
emqx_eviction_agent: :permanent,
emqx_node_rebalance: :permanent
],
else: []
)

View File

@ -478,7 +478,9 @@ relx_apps_per_edition(ee) ->
emqx_oracle,
emqx_bridge_oracle,
emqx_bridge_rabbitmq,
emqx_ee_schema_registry
emqx_ee_schema_registry,
emqx_eviction_agent,
emqx_node_rebalance
];
relx_apps_per_edition(ce) ->
[].