Merge branch 'release-50' into file-transfer

* release-50:
  chore(rebalance): fix app metadata
  chore(rebalance): move apps from lib-ee, add READMEs
  docs: refine zh tr
  docs: delete APL header from ee file
  docs: delete zh changelog
  chore(rebalance): review fixes
  chore(rebalance): rebase and review fixes
  feat(rebalance): port apps from 4.x
This commit is contained in:
Ilya Averyanov 2023-05-10 11:55:23 +05:00
commit 8d9b785bd7
64 changed files with 6977 additions and 66 deletions

View File

@ -179,6 +179,7 @@ clean-all:
@rm -f rebar.lock
@rm -rf deps
@rm -rf _build
@rm -f emqx_dialyzer_*_plt
.PHONY: deps-all
deps-all: $(REBAR) $(PROFILES:%=deps-%)

View File

@ -14,26 +14,19 @@
%% limitations under the License.
%%--------------------------------------------------------------------
%% This file contains common macros for testing.
%% It must not be used anywhere except in test suites.
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-define(assertWaitEvent(Code, EventMatch, Timeout),
?check_trace(
?assertMatch(
{_, {ok, EventMatch}},
?wait_async_action(
Code,
EventMatch,
Timeout
),
fun(Trace) ->
?assert(
lists:any(
fun
(EventMatch) -> true;
(_) -> false
end,
Trace
)
)
end
)
)
).

View File

@ -0,0 +1,42 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2017-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
%% List of per-channel metric names.
%% The names fall into two groups by prefix: `recv_*' and `send_*', each with
%% per-QoS entries and `*.dropped' entries (with more specific drop reasons).
%% NOTE(review): presumably shared through this header so that several channel
%% implementations can report the same set of metrics - confirm against users.
-define(CHANNEL_METRICS, [
recv_pkt,
recv_msg,
'recv_msg.qos0',
'recv_msg.qos1',
'recv_msg.qos2',
'recv_msg.dropped',
'recv_msg.dropped.await_pubrel_timeout',
send_pkt,
send_msg,
'send_msg.qos0',
'send_msg.qos1',
'send_msg.qos2',
'send_msg.dropped',
'send_msg.dropped.expired',
'send_msg.dropped.queue_full',
'send_msg.dropped.too_large'
]).
%% List of keys describing a channel: connection info, connection state,
%% client info, session and will message.
%% NOTE(review): presumably the set of keys a channel reports from its
%% `info' accessor - confirm against the modules including this header.
-define(INFO_KEYS, [
conninfo,
conn_state,
clientinfo,
session,
will_msg
]).

View File

@ -34,6 +34,7 @@
-define(HP_BRIDGE, 870).
-define(HP_DELAY_PUB, 860).
%% apps that can stop the hooks chain from continuing
-define(HP_NODE_REBALANCE, 110).
-define(HP_EXHOOK, 100).
%% == Lowest Priority = 0, don't change this value as the plugins may depend on it.

View File

@ -14,6 +14,7 @@
{emqx_conf,2}.
{emqx_dashboard,1}.
{emqx_delayed,1}.
{emqx_eviction_agent,1}.
{emqx_exhook,1}.
{emqx_ft_storage_exporter_fs,1}.
{emqx_ft_storage_fs,1}.
@ -30,6 +31,10 @@
{emqx_mgmt_cluster,1}.
{emqx_mgmt_trace,1}.
{emqx_mgmt_trace,2}.
{emqx_node_rebalance,1}.
{emqx_node_rebalance_api,1}.
{emqx_node_rebalance_evacuation,1}.
{emqx_node_rebalance_status,1}.
{emqx_persistent_session,1}.
{emqx_plugin_libs,1}.
{emqx_plugins,1}.

View File

@ -18,6 +18,7 @@
-module(emqx_channel).
-include("emqx.hrl").
-include("emqx_channel.hrl").
-include("emqx_mqtt.hrl").
-include("logger.hrl").
-include("types.hrl").
@ -57,6 +58,12 @@
clear_keepalive/1
]).
%% Export for emqx_channel implementations
-export([
maybe_nack/1,
maybe_mark_as_delivered/2
]).
%% Exports for CT
-export([set_field/3]).
@ -69,7 +76,7 @@
]
).
-export_type([channel/0, opts/0]).
-export_type([channel/0, opts/0, conn_state/0]).
-record(channel, {
%% MQTT ConnInfo
@ -131,33 +138,6 @@
quota_timer => expire_quota_limit
}).
-define(CHANNEL_METRICS, [
recv_pkt,
recv_msg,
'recv_msg.qos0',
'recv_msg.qos1',
'recv_msg.qos2',
'recv_msg.dropped',
'recv_msg.dropped.await_pubrel_timeout',
send_pkt,
send_msg,
'send_msg.qos0',
'send_msg.qos1',
'send_msg.qos2',
'send_msg.dropped',
'send_msg.dropped.expired',
'send_msg.dropped.queue_full',
'send_msg.dropped.too_large'
]).
-define(INFO_KEYS, [
conninfo,
conn_state,
clientinfo,
session,
will_msg
]).
-define(LIMITER_ROUTING, message_routing).
-dialyzer({no_match, [shutdown/4, ensure_timer/2, interval/2]}).
@ -1091,10 +1071,12 @@ handle_out(unsuback, {PacketId, _ReasonCodes}, Channel) ->
handle_out(disconnect, ReasonCode, Channel) when is_integer(ReasonCode) ->
ReasonName = disconnect_reason(ReasonCode),
handle_out(disconnect, {ReasonCode, ReasonName}, Channel);
handle_out(disconnect, {ReasonCode, ReasonName}, Channel = ?IS_MQTT_V5) ->
Packet = ?DISCONNECT_PACKET(ReasonCode),
handle_out(disconnect, {ReasonCode, ReasonName}, Channel) ->
handle_out(disconnect, {ReasonCode, ReasonName, #{}}, Channel);
handle_out(disconnect, {ReasonCode, ReasonName, Props}, Channel = ?IS_MQTT_V5) ->
Packet = ?DISCONNECT_PACKET(ReasonCode, Props),
{ok, [{outgoing, Packet}, {close, ReasonName}], Channel};
handle_out(disconnect, {_ReasonCode, ReasonName}, Channel) ->
handle_out(disconnect, {_ReasonCode, ReasonName, _Props}, Channel) ->
{ok, {close, ReasonName}, Channel};
handle_out(auth, {ReasonCode, Properties}, Channel) ->
{ok, ?AUTH_PACKET(ReasonCode, Properties), Channel};
@ -1211,13 +1193,19 @@ handle_call(
{takeover, 'end'},
Channel = #channel{
session = Session,
pendings = Pendings
pendings = Pendings,
conninfo = #{clientid := ClientId}
}
) ->
ok = emqx_session:takeover(Session),
%% TODO: Should not drain deliver here (side effect)
Delivers = emqx_utils:drain_deliver(),
AllPendings = lists:append(Delivers, Pendings),
?tp(
debug,
emqx_channel_takeover_end,
#{clientid => ClientId}
),
disconnect_and_shutdown(takenover, AllPendings, Channel);
handle_call(list_authz_cache, Channel) ->
{reply, emqx_authz_cache:list_authz_cache(), Channel};
@ -1289,6 +1277,8 @@ handle_info(die_if_test = Info, Channel) ->
die_if_test_compiled(),
?SLOG(error, #{msg => "unexpected_info", info => Info}),
{ok, Channel};
handle_info({disconnect, ReasonCode, ReasonName, Props}, Channel) ->
handle_out(disconnect, {ReasonCode, ReasonName, Props}, Channel);
handle_info({puback, PacketId, PubRes, RC}, Channel) ->
do_finish_publish(PacketId, PubRes, RC, Channel);
handle_info(Info, Channel) ->

View File

@ -23,6 +23,8 @@
-include("logger.hrl").
-include("types.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-include_lib("stdlib/include/qlc.hrl").
-include_lib("stdlib/include/ms_transform.hrl").
-export([start_link/0]).
@ -72,6 +74,12 @@
get_session_confs/2
]).
%% Client management
-export([
channel_with_session_table/1,
live_connection_table/1
]).
%% gen_server callbacks
-export([
init/1,
@ -597,6 +605,40 @@ all_channels() ->
Pat = [{{'_', '$1'}, [], ['$1']}],
ets:select(?CHAN_TAB, Pat).
%% @doc Build a QLC query handle over ?CHAN_INFO_TAB that yields
%% `{ClientId, ConnState, ConnInfo, ClientInfo}' tuples.
%%
%% Only entries whose conninfo has `clean_start := false' and whose
%% `conn_mod' is a member of `ConnModuleList' are produced. Nothing is read
%% from the table here; the table is traversed when the returned handle is
%% evaluated.
channel_with_session_table(ConnModuleList) ->
%% Project each table row onto {ClientId, Info}, dropping the pid and stats.
Ms = ets:fun2ms(
fun({{ClientId, _ChanPid}, Info, _Stats}) ->
{ClientId, Info}
end
),
Table = ets:table(?CHAN_INFO_TAB, [{traverse, {select, Ms}}]),
%% A set is used for the membership test performed on every candidate row.
ConnModules = sets:from_list(ConnModuleList, [{version, 2}]),
qlc:q([
{ClientId, ConnState, ConnInfo, ClientInfo}
|| {ClientId, #{
conn_state := ConnState,
clientinfo := ClientInfo,
conninfo := #{clean_start := false, conn_mod := ConnModule} = ConnInfo
}} <-
Table,
sets:is_element(ConnModule, ConnModules)
]).
%% @doc Build a QLC query handle yielding `{ClientId, ChanPid}' for every
%% entry of ?CHAN_CONN_TAB that belongs to one of `ConnModules' and whose
%% channel pid passes `is_channel_connected/1'.
live_connection_table(ConnModules) ->
    %% One match-spec clause per accepted connection module.
    MatchSpec = [live_connection_ms(Mod) || Mod <- ConnModules],
    ConnTab = ets:table(?CHAN_CONN_TAB, [{traverse, {select, MatchSpec}}]),
    qlc:q([
        {ClientId, Pid}
     || {ClientId, Pid} <- ConnTab,
        is_channel_connected(Pid)
    ]).
%% Match-spec clause selecting `{{ClientId, ChanPid}, ConnModule}' rows for
%% the given module and returning them as `{ClientId, ChanPid}' tuples.
live_connection_ms(ConnModule) ->
    Head = {{'$1', '$2'}, ConnModule},
    Guards = [],
    %% The doubled braces build a tuple inside a match-spec body.
    Body = [{{'$1', '$2'}}],
    {Head, Guards, Body}.
%% Tell whether a channel pid is currently recorded in ?CHAN_LIVE_TAB.
%% Pids that do not belong to the local node are always reported as `false'.
is_channel_connected(LocalPid) when node(LocalPid) =:= node() ->
    ets:member(?CHAN_LIVE_TAB, LocalPid);
is_channel_connected(_NotLocal) ->
    false.
%% @doc Get all registered clientIDs. Debug/test interface
all_client_ids() ->
Pat = [{{'$1', '_'}, [], ['$1']}],
@ -697,7 +739,8 @@ code_change(_OldVsn, State, _Extra) ->
%%--------------------------------------------------------------------
clean_down({ChanPid, ClientId}) ->
do_unregister_channel({ClientId, ChanPid}).
do_unregister_channel({ClientId, ChanPid}),
ok = ?tp(debug, emqx_cm_clean_down, #{client_id => ClientId}).
stats_fun() ->
lists:foreach(fun update_stats/1, ?CHAN_STATS).
@ -723,12 +766,12 @@ get_chann_conn_mod(ClientId, ChanPid) ->
wrap_rpc(emqx_cm_proto_v2:get_chann_conn_mod(ClientId, ChanPid)).
mark_channel_connected(ChanPid) ->
?tp(emqx_cm_connected_client_count_inc, #{}),
?tp(emqx_cm_connected_client_count_inc, #{chan_pid => ChanPid}),
ets:insert_new(?CHAN_LIVE_TAB, {ChanPid, true}),
ok.
mark_channel_disconnected(ChanPid) ->
?tp(emqx_cm_connected_client_count_dec, #{}),
?tp(emqx_cm_connected_client_count_dec, #{chan_pid => ChanPid}),
ets:delete(?CHAN_LIVE_TAB, ChanPid),
ok.

View File

@ -167,9 +167,15 @@ handle_info(Info, State) ->
{noreply, State}.
terminate(_Reason, _State) ->
ok = ekka:unmonitor(membership),
emqx_stats:cancel_update(route_stats),
mnesia:unsubscribe({table, ?ROUTING_NODE, simple}).
try
ok = ekka:unmonitor(membership),
emqx_stats:cancel_update(route_stats),
mnesia:unsubscribe({table, ?ROUTING_NODE, simple})
catch
exit:{noproc, {gen_server, call, [mria_membership, _]}} ->
?SLOG(warning, #{msg => "mria_membership_down"}),
ok
end.
code_change(_OldVsn, State, _Extra) ->
{ok, State}.

View File

@ -20,6 +20,7 @@
set_default_config/0,
set_default_config/1,
set_default_config/2,
set_default_config/3,
request/2,
request/3,
request/4,
@ -41,11 +42,14 @@ set_default_config(DefaultUsername) ->
set_default_config(DefaultUsername, false).
set_default_config(DefaultUsername, HAProxyEnabled) ->
set_default_config(DefaultUsername, HAProxyEnabled, #{}).
set_default_config(DefaultUsername, HAProxyEnabled, Opts) ->
Config = #{
listeners => #{
http => #{
enable => true,
bind => 18083,
bind => maps:get(bind, Opts, 18083),
inet6 => false,
ipv6_v6only => false,
max_connections => 512,

View File

@ -0,0 +1,94 @@
Business Source License 1.1
Licensor: Hangzhou EMQ Technologies Co., Ltd.
Licensed Work: EMQX Enterprise Edition
The Licensed Work is (c) 2023
Hangzhou EMQ Technologies Co., Ltd.
Additional Use Grant: Students and educators are granted right to copy,
modify, and create derivative work for research
or education.
Change Date: 2027-02-01
Change License: Apache License, Version 2.0
For information about alternative licensing arrangements for the Software,
please contact Licensor: https://www.emqx.com/en/contact
Notice
The Business Source License (this document, or the “License”) is not an Open
Source license. However, the Licensed Work will eventually be made available
under an Open Source License, as stated in this License.
License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved.
“Business Source License” is a trademark of MariaDB Corporation Ab.
-----------------------------------------------------------------------------
Business Source License 1.1
Terms
The Licensor hereby grants you the right to copy, modify, create derivative
works, redistribute, and make non-production use of the Licensed Work. The
Licensor may make an Additional Use Grant, above, permitting limited
production use.
Effective on the Change Date, or the fourth anniversary of the first publicly
available distribution of a specific version of the Licensed Work under this
License, whichever comes first, the Licensor hereby grants you rights under
the terms of the Change License, and the rights granted in the paragraph
above terminate.
If your use of the Licensed Work does not comply with the requirements
currently in effect as described in this License, you must purchase a
commercial license from the Licensor, its affiliated entities, or authorized
resellers, or you must refrain from using the Licensed Work.
All copies of the original and modified Licensed Work, and derivative works
of the Licensed Work, are subject to this License. This License applies
separately for each version of the Licensed Work and the Change Date may vary
for each version of the Licensed Work released by Licensor.
You must conspicuously display this License on each original or modified copy
of the Licensed Work. If you receive the Licensed Work in original or
modified form from a third party, the terms and conditions set forth in this
License apply to your use of that work.
Any use of the Licensed Work in violation of this License will automatically
terminate your rights under this License for the current and all other
versions of the Licensed Work.
This License does not grant you any right in any trademark or logo of
Licensor or its affiliates (provided that you may use a trademark or logo of
Licensor as expressly required by this License).
TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON
AN “AS IS” BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS,
EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND
TITLE.
MariaDB hereby grants you permission to use this License’s text to license
your works, and to refer to it using the trademark “Business Source License”,
as long as you comply with the Covenants of Licensor below.
Covenants of Licensor
In consideration of the right to use this License’s text and the “Business
Source License” name and trademark, Licensor covenants to MariaDB, and to all
other recipients of the licensed work to be provided by Licensor:
1. To specify as the Change License the GPL Version 2.0 or any later version,
or a license that is compatible with GPL Version 2.0 or a later version,
where “compatible” means that software provided under the Change License can
be included in a program with software provided under GPL Version 2.0 or a
later version. Licensor may specify additional Change Licenses without
limitation.
2. To either: (a) specify an additional grant of rights to use that does not
impose any additional restriction on the right granted in this License, as
the Additional Use Grant; or (b) insert the text “None”.
3. To specify a Change Date.
4. Not to modify this License in any other way.

View File

@ -0,0 +1,35 @@
# EMQX Eviction Agent
`emqx_eviction_agent` is a part of the node evacuation/node rebalance feature in EMQX.
It is a low-level application that encapsulates working with actual MQTT connections.
## Application Responsibilities
`emqx_eviction_agent` application:
* Blocks incoming connections to the node it is running on.
* Serves as a facade for connection/session eviction operations.
* Reports blocking status via HTTP API.
The `emqx_eviction_agent` is relatively passive and has no eviction/rebalancing logic. It allows
`emqx_node_rebalance` to perform eviction/rebalancing operations using a high-level API, without having to deal with
MQTT connections directly.
## EMQX Integration
`emqx_eviction_agent` interacts with the following EMQX components:
* `emqx_cm` - to get the list of active MQTT connections;
* `emqx_hooks` subsystem - to block/unblock incoming connections;
* `emqx_channel` and the corresponding connection modules to perform the eviction.
## User Facing API
The application provides a very simple API (CLI and HTTP) to inspect the current blocking status.
# Documentation
The rebalancing concept is described in the corresponding [EIP](https://github.com/emqx/eip/blob/main/active/0020-node-rebalance.md).
# Contributing
Please see our [contributing.md](../../CONTRIBUTING.md).

View File

@ -0,0 +1,3 @@
##--------------------------------------------------------------------
## EMQX Eviction Agent Plugin
##--------------------------------------------------------------------

View File

@ -0,0 +1,2 @@
{deps, [{emqx, {path, "../../apps/emqx"}}]}.
{project_plugins, [erlfmt]}.

View File

@ -0,0 +1,21 @@
{application, emqx_eviction_agent, [
{description, "EMQX Eviction Agent"},
{vsn, "5.0.0"},
{registered, [
emqx_eviction_agent_sup,
emqx_eviction_agent,
emqx_eviction_agent_conn_sup
]},
{applications, [
kernel,
stdlib,
emqx_ctl
]},
{mod, {emqx_eviction_agent_app, []}},
{env, []},
{modules, []},
{links, [
{"Homepage", "https://www.emqx.com/"},
{"Github", "https://github.com/emqx"}
]}
]}.

View File

@ -0,0 +1,3 @@
%% -*- mode: erlang -*-
%% Unless you know what you are doing, DO NOT edit manually!!
{VSN, [{<<".*">>, []}], [{<<".*">>, []}]}.

View File

@ -0,0 +1,348 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent).
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("emqx/include/logger.hrl").
-include_lib("emqx/include/types.hrl").
-include_lib("emqx/include/emqx_hooks.hrl").
-include_lib("stdlib/include/qlc.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-export([
start_link/0,
enable/2,
disable/1,
status/0,
connection_count/0,
session_count/0,
session_count/1,
evict_connections/1,
evict_sessions/2,
evict_sessions/3,
evict_session_channel/3
]).
-behaviour(gen_server).
-export([
init/1,
handle_call/3,
handle_info/2,
handle_cast/2,
code_change/3
]).
-export([
on_connect/2,
on_connack/3
]).
-export([
hook/0,
unhook/0
]).
-export_type([server_reference/0]).
%% Connection modules whose channels this agent counts and evicts.
%% The list is passed to `emqx_cm:live_connection_table/1' and
%% `emqx_cm:channel_with_session_table/1' as the module filter.
-define(CONN_MODULES, [
emqx_connection, emqx_ws_connection, emqx_quic_connection, emqx_eviction_agent_channel
]).
%%--------------------------------------------------------------------
%% APIs
%%--------------------------------------------------------------------
-type server_reference() :: binary() | undefined.
-type status() :: {enabled, conn_stats()} | disabled.
-type conn_stats() :: #{
connections := non_neg_integer(),
sessions := non_neg_integer()
}.
-type kind() :: atom().
%% @doc Start the eviction agent server, registered locally under the
%% module name.
-spec start_link() -> startlink_ret().
start_link() ->
    ServerName = {local, ?MODULE},
    gen_server:start_link(ServerName, ?MODULE, [], []).
%% @doc Ask the agent to switch eviction mode on for `Kind', remembering
%% `ServerReference'. The server replies `{error, eviction_agent_busy}' when
%% the agent is already enabled by a different kind.
-spec enable(kind(), server_reference()) -> ok_or_error(eviction_agent_busy).
enable(Kind, ServerReference) ->
    Request = {enable, Kind, ServerReference},
    gen_server:call(?MODULE, Request).
%% @doc Ask the agent to switch eviction mode off for `Kind'.
%%
%% Returns `ok' when the agent was enabled by the same `Kind',
%% `{error, disabled}' when it was not enabled at all, and
%% `{error, eviction_agent_busy}' when it is enabled by a different kind.
%% The spec lists the error replies so that callers and Dialyzer see them.
-spec disable(kind()) -> ok_or_error(disabled | eviction_agent_busy).
disable(Kind) ->
    gen_server:call(?MODULE, {disable, Kind}).
%% @doc Report whether the agent is enabled; when it is, the current
%% connection and session counters are attached.
-spec status() -> status().
status() ->
    case enable_status() of
        disabled -> disabled;
        {enabled, _Kind, _ServerReference} -> {enabled, stats()}
    end.
%% @doc Send a disconnect request to up to `N' live connections, using the
%% server reference stored when the agent was enabled.
%% Returns `{error, disabled}' when the agent is not enabled.
-spec evict_connections(pos_integer()) -> ok_or_error(disabled).
evict_connections(N) ->
    case enable_status() of
        disabled ->
            {error, disabled};
        {enabled, _Kind, ServerReference} ->
            ok = do_evict_connections(N, ServerReference)
    end.
%% @doc Evict up to `N' sessions to the given node(s), regardless of the
%% connection state of their channels.
%%
%% A single node is accepted as a shorthand for a one-element list.
%% The node list must be non-empty; this is checked by pattern matching
%% rather than by `length/1', which would walk the whole list in the guard.
-spec evict_sessions(pos_integer(), node() | [node()]) -> ok_or_error(disabled).
evict_sessions(N, Node) when is_atom(Node) ->
    evict_sessions(N, [Node]);
evict_sessions(N, [_ | _] = Nodes) ->
    evict_sessions(N, Nodes, any).
%% @doc Evict up to `N' sessions to the given node(s), restricted to channels
%% in connection state `ConnState' (`any' disables the restriction).
%%
%% A single node is accepted as a shorthand for a one-element list.
%% The node list must be non-empty; this is checked by pattern matching
%% rather than by `length/1', which would walk the whole list in the guard.
%% Returns `{error, disabled}' when the agent is not enabled.
-spec evict_sessions(pos_integer(), node() | [node()], atom()) -> ok_or_error(disabled).
evict_sessions(N, Node, ConnState) when is_atom(Node) ->
    evict_sessions(N, [Node], ConnState);
evict_sessions(N, [_ | _] = Nodes, ConnState) ->
    case enable_status() of
        {enabled, _Kind, _ServerReference} ->
            ok = do_evict_sessions(N, Nodes, ConnState);
        disabled ->
            {error, disabled}
    end.
%%--------------------------------------------------------------------
%% gen_server callbacks
%%--------------------------------------------------------------------
%% Initialise the server with an empty state. Any enable status stored in
%% persistent_term under the module name is erased first.
init([]) ->
    _WasPresent = persistent_term:erase(?MODULE),
    InitialState = #{},
    {ok, InitialState}.
%% enable: store `{enabled, Kind, ServerReference}' unless the agent is
%% currently enabled by a different kind.
handle_call({enable, Kind, ServerReference}, _From, State) ->
    Outcome =
        case enable_status() of
            {enabled, HeldBy, _CurrentRef} when HeldBy =/= Kind ->
                {error, eviction_agent_busy};
            {enabled, _SameKind, _CurrentRef} ->
                ok = persistent_term:put(?MODULE, {enabled, Kind, ServerReference});
            disabled ->
                ok = persistent_term:put(?MODULE, {enabled, Kind, ServerReference})
        end,
    {reply, Outcome, State};
%% disable: erase the stored status, but only for the kind that enabled it.
handle_call({disable, Kind}, _From, State) ->
    Outcome =
        case enable_status() of
            {enabled, HeldBy, _CurrentRef} when HeldBy =/= Kind ->
                {error, eviction_agent_busy};
            {enabled, _SameKind, _CurrentRef} ->
                _ = persistent_term:erase(?MODULE),
                ok;
            disabled ->
                {error, disabled}
        end,
    {reply, Outcome, State};
%% Anything else is logged and rejected.
handle_call(Unknown, _From, State) ->
    ?SLOG(warning, #{msg => "unknown_call", call => Unknown, state => State}),
    {reply, {error, unknown_call}, State}.
%% No raw messages are expected; log whatever arrives and keep the state.
handle_info(Unexpected, State) ->
    ?SLOG(warning, #{msg => "unknown_msg", info => Unexpected, state => State}),
    {noreply, State}.
%% No casts are expected; log whatever arrives and keep the state.
handle_cast(Unexpected, State) ->
    ?SLOG(warning, #{msg => "unknown_cast", cast => Unexpected, state => State}),
    {noreply, State}.
%% The state is returned unchanged on code upgrade.
code_change(_OldVsn, CurrentState, _Extra) ->
    {ok, CurrentState}.
%%--------------------------------------------------------------------
%% Hook callbacks
%%--------------------------------------------------------------------
%% Hook callback registered for 'client.connect' (see hook/0).
%% While the agent is enabled, the hook chain is stopped with
%% ?RC_USE_ANOTHER_SERVER; otherwise the callback does not interfere.
on_connect(_ConnInfo, _Props) ->
    case enable_status() of
        disabled ->
            ignore;
        {enabled, _Kind, _ServerReference} ->
            {stop, {error, ?RC_USE_ANOTHER_SERVER}}
    end.
%% Hook callback registered for 'client.connack' (see hook/0).
%% For MQTT v5 clients refused with `use_another_server', the stored server
%% reference is added to the properties as 'Server-Reference' while the agent
%% is enabled. In every other case the properties are returned untouched.
on_connack(
    #{proto_name := <<"MQTT">>, proto_ver := ?MQTT_PROTO_V5},
    use_another_server,
    Props0
) ->
    Props =
        case enable_status() of
            disabled ->
                Props0;
            {enabled, _Kind, ServerReference} ->
                Props0#{'Server-Reference' => ServerReference}
        end,
    {ok, Props};
on_connack(_ClientInfo, _Reason, Props) ->
    {ok, Props}.
%%--------------------------------------------------------------------
%% Hook funcs
%%--------------------------------------------------------------------
%% Register the connack and connect callbacks of this module with
%% emqx_hooks at priority ?HP_NODE_REBALANCE.
hook() ->
    ?tp(debug, eviction_agent_hook, #{}),
    ConnackCallback = {?MODULE, on_connack, []},
    ConnectCallback = {?MODULE, on_connect, []},
    ok = emqx_hooks:put('client.connack', ConnackCallback, ?HP_NODE_REBALANCE),
    ok = emqx_hooks:put('client.connect', ConnectCallback, ?HP_NODE_REBALANCE).
%% Remove the connect and connack callbacks of this module from emqx_hooks.
unhook() ->
    ?tp(debug, eviction_agent_unhook, #{}),
    ConnectCallback = {?MODULE, on_connect},
    ConnackCallback = {?MODULE, on_connack},
    ok = emqx_hooks:del('client.connect', ConnectCallback),
    ok = emqx_hooks:del('client.connack', ConnackCallback).
%% Read the enable status stored in persistent_term under the module name.
%% `disabled' is returned when nothing is stored.
enable_status() ->
    WhenMissing = disabled,
    persistent_term:get(?MODULE, WhenMissing).
% connection management
%% Current connection and session counters as a map.
stats() ->
    Connections = connection_count(),
    Sessions = session_count(),
    #{connections => Connections, sessions => Sessions}.
%% Query handle over the live connections of the modules in ?CONN_MODULES.
connection_table() ->
    Modules = ?CONN_MODULES,
    emqx_cm:live_connection_table(Modules).
%% @doc Number of answers currently produced by the live connection query.
connection_count() ->
    LiveConnections = connection_table(),
    table_count(LiveConnections).
%% Query handle yielding `{ClientId, ConnInfo, ClientInfo}' for channels with
%% sessions, as reported by emqx_cm for the modules in ?CONN_MODULES.
%% With `any' every channel is included; otherwise only channels whose
%% connection state is exactly the requested one.
channel_with_session_table(any) ->
    Source = emqx_cm:channel_with_session_table(?CONN_MODULES),
    qlc:q([
        {Id, Conn, Client}
     || {Id, _AnyState, Conn, Client} <- Source
    ]);
channel_with_session_table(RequiredConnState) ->
    Source = emqx_cm:channel_with_session_table(?CONN_MODULES),
    qlc:q([
        {Id, Conn, Client}
     || {Id, State, Conn, Client} <- Source,
        State =:= RequiredConnState
    ]).
%% @doc Number of channels with sessions, in any connection state.
session_count() ->
    AnyState = any,
    session_count(AnyState).
%% @doc Number of channels with sessions in the given connection state
%% (`any' counts all of them).
session_count(ConnState) ->
    Sessions = channel_with_session_table(ConnState),
    table_count(Sessions).
%% Count the answers of a query handle by folding over it.
table_count(QH) ->
    CountOne = fun(_Answer, Count) -> Count + 1 end,
    qlc:fold(CountOne, 0, QH).
%% Fetch the channel pids of up to `N' live connections.
%%
%% A cursor is used so that only the first `N' answers are produced instead
%% of evaluating the whole query. The cursor is deleted in an `after' section
%% so that it is released even when fetching the answers raises.
take_connections(N) ->
    ChanQH = qlc:q([ChanPid || {_ClientId, ChanPid} <- connection_table()]),
    ChanPidCursor = qlc:cursor(ChanQH),
    try
        qlc:next_answers(ChanPidCursor, N)
    after
        ok = qlc:delete_cursor(ChanPidCursor)
    end.
%% Fetch up to `N' `{ClientId, ConnInfo, ClientInfo}' entries of channels
%% with sessions in the given connection state (`any' for all of them).
%%
%% A cursor is used so that only the first `N' answers are produced instead
%% of evaluating the whole query. The cursor is deleted in an `after' section
%% so that it is released even when fetching the answers raises.
take_channel_with_sessions(N, ConnState) ->
    ChanPidCursor = qlc:cursor(channel_with_session_table(ConnState)),
    try
        qlc:next_answers(ChanPidCursor, N)
    after
        ok = qlc:delete_cursor(ChanPidCursor)
    end.
%% Send a disconnect request, carrying `ServerReference', to each of up to
%% `N' live connections. Always returns `ok'.
do_evict_connections(N, ServerReference) when N > 0 ->
    Disconnect = fun(Pid) -> disconnect_channel(Pid, ServerReference) end,
    Victims = take_connections(N),
    ok = lists:foreach(Disconnect, Victims).
%% Request eviction of up to `N' sessions in state `ConnState' to nodes
%% picked from `Nodes'. Individual eviction results are not inspected here;
%% the function always returns `ok'.
do_evict_sessions(N, Nodes, ConnState) when N > 0 ->
    EvictOne = fun({ClientId, ConnInfo, ClientInfo}) ->
        evict_session_channel(Nodes, ClientId, ConnInfo, ClientInfo)
    end,
    Selected = take_channel_with_sessions(N, ConnState),
    ok = lists:foreach(EvictOne, Selected).
%% Pick a random node from `Nodes' and ask it, through the v1 RPC protocol
%% module, to take over the session channel of `ClientId'.
%% `{badrpc, Reason}' is converted to `{error, Reason}'; errors are logged;
%% any other reply is returned as is.
evict_session_channel(Nodes, ClientId, ConnInfo, ClientInfo) ->
    Target = select_random(Nodes),
    ?SLOG(info, #{
        msg => "evict_session_channel",
        client_id => ClientId,
        node => Target,
        conn_info => ConnInfo,
        client_info => ClientInfo
    }),
    Reply = emqx_eviction_agent_proto_v1:evict_session_channel(
        Target, ClientId, ConnInfo, ClientInfo
    ),
    case Reply of
        {badrpc, RpcReason} ->
            ?SLOG(error, #{
                msg => "evict_session_channel_rpc_error",
                client_id => ClientId,
                node => Target,
                reason => RpcReason
            }),
            {error, RpcReason};
        {error, ErrReason} ->
            ?SLOG(error, #{
                msg => "evict_session_channel_error",
                client_id => ClientId,
                node => Target,
                reason => ErrReason
            }),
            Reply;
        _Success ->
            Reply
    end.
%% @doc Start a supervised `emqx_eviction_agent_channel' on this node for the
%% given client, logging the request and its result. The result of
%% `emqx_eviction_agent_channel:start_supervised/1' is returned unchanged.
-spec evict_session_channel(
    emqx_types:clientid(),
    emqx_types:conninfo(),
    emqx_types:clientinfo()
) -> supervisor:startchild_ret().
evict_session_channel(ClientId, ConnInfo, ClientInfo) ->
    ?SLOG(info, #{
        msg => "evict_session_channel",
        client_id => ClientId,
        conn_info => ConnInfo,
        client_info => ClientInfo
    }),
    ChannelOpts = #{conninfo => ConnInfo, clientinfo => ClientInfo},
    Outcome = emqx_eviction_agent_channel:start_supervised(ChannelOpts),
    ?SLOG(info, #{
        msg => "evict_session_channel_result",
        client_id => ClientId,
        result => Outcome
    }),
    Outcome.
%% Send the channel process a disconnect request with reason code
%% ?RC_USE_ANOTHER_SERVER and a 'Server-Reference' property.
%% The send is asynchronous; no reply is awaited.
disconnect_channel(ChanPid, ServerReference) ->
    Props = #{'Server-Reference' => ServerReference},
    Request = {disconnect, ?RC_USE_ANOTHER_SERVER, use_another_server, Props},
    ChanPid ! Request.
%% Pick one element of a non-empty list uniformly at random.
%% Non-emptiness is checked by pattern matching, and the length is computed
%% once instead of once in the guard and again in the body.
%% An empty list fails with `function_clause'.
select_random([_ | _] = List) ->
    Size = length(List),
    lists:nth(rand:uniform(Size), List).

View File

@ -0,0 +1,85 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_api).
-behaviour(minirest_api).
-include_lib("typerefl/include/types.hrl").
-include_lib("hocon/include/hoconsc.hrl").
-include_lib("emqx/include/logger.hrl").
%% Swagger specs from hocon schema
-export([
api_spec/0,
paths/0,
schema/1,
namespace/0
]).
-export([
fields/1,
roots/0
]).
%% API callbacks
-export([
'/node_eviction/status'/2
]).
-import(hoconsc, [mk/2, ref/1, ref/2]).
%% Namespace under which this module's schema is published.
namespace() ->
    "node_eviction".
%% Swagger spec for this module, generated by emqx_dashboard_swagger with
%% schema checking enabled.
api_spec() ->
    SpecOpts = #{check_schema => true},
    emqx_dashboard_swagger:spec(?MODULE, SpecOpts).
%% HTTP paths served by this module.
paths() ->
    StatusPath = "/node_eviction/status",
    [StatusPath].
%% Schema of the status endpoint: a single GET operation handled by
%% '/node_eviction/status'/2, answering 200 with the status schema.
schema("/node_eviction/status") ->
    GetOperation = #{
        tags => [<<"node_eviction">>],
        summary => <<"Get node eviction status">>,
        description => ?DESC("node_eviction_status_get"),
        responses => #{200 => schema_status()}
    },
    #{
        'operationId' => '/node_eviction/status',
        get => GetOperation
    }.
%% Handler of the status endpoint. Always answers 200; the body carries
%% `status' and, when the agent is enabled, its `stats'.
'/node_eviction/status'(_Bindings, _Params) ->
    Body =
        case emqx_eviction_agent:status() of
            {enabled, Stats} -> #{status => enabled, stats => Stats};
            disabled -> #{status => disabled}
        end,
    {200, Body}.
%% Response schema: either the `status_enabled' or the `status_disabled'
%% struct of this module.
schema_status() ->
    Alternatives = [hoconsc:ref(status_enabled), hoconsc:ref(status_disabled)],
    hoconsc:mk(hoconsc:union(Alternatives), #{}).
%% This schema module declares no root fields.
roots() ->
    [].
%% Field definitions of the structs referenced by the status schema.
fields(status_enabled) ->
    StatusField = {status, hoconsc:mk(enabled, #{default => enabled})},
    StatsField = {stats, hoconsc:ref(stats)},
    [StatusField, StatsField];
fields(status_disabled) ->
    [{status, hoconsc:mk(disabled, #{default => disabled})}];
fields(stats) ->
    [
        {connections, hoconsc:mk(integer(), #{})},
        {sessions, hoconsc:mk(integer(), #{})}
    ].

View File

@ -0,0 +1,22 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_app).
-behaviour(application).
-export([
start/2,
stop/1
]).
%% Application start callback: registers the agent hooks, starts the top
%% supervisor, then loads the CLI. Any step not returning the expected value
%% fails the start with a badmatch.
start(_StartType, _StartArgs) ->
    ok = emqx_eviction_agent:hook(),
    {ok, SupPid} = emqx_eviction_agent_sup:start_link(),
    ok = emqx_eviction_agent_cli:load(),
    {ok, SupPid}.
%% Application stop callback: removes the agent hooks, then unloads the CLI.
stop(_AppState) ->
    ok = emqx_eviction_agent:unhook(),
    ok = emqx_eviction_agent_cli:unload().

View File

@ -0,0 +1,358 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
%% MQTT Channel
-module(emqx_eviction_agent_channel).
-include_lib("emqx/include/emqx.hrl").
-include_lib("emqx/include/emqx_channel.hrl").
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("emqx/include/logger.hrl").
-include_lib("emqx/include/types.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-export([
start_link/1,
start_supervised/1,
call/2,
call/3,
cast/2,
stop/1
]).
-export([
init/1,
handle_call/3,
handle_cast/2,
handle_info/2,
terminate/2,
code_change/3
]).
-type opts() :: #{
conninfo := emqx_types:conninfo(),
clientinfo := emqx_types:clientinfo()
}.
%%--------------------------------------------------------------------
%% API
%%--------------------------------------------------------------------
%% @doc Start a channel process as a temporary child of
%% `emqx_eviction_agent_conn_sup'.
%%
%% The child id is the client id (as a binary) followed by "-" and a unique
%% positive integer, so several children may exist for the same client id.
-spec start_supervised(opts()) -> supervisor:startchild_ret().
start_supervised(#{clientinfo := #{clientid := ClientId}} = Opts) ->
    Suffix = integer_to_binary(erlang:unique_integer([positive])),
    Prefix = bin_clientid(ClientId),
    ChildId = <<Prefix/binary, "-", Suffix/binary>>,
    supervisor:start_child(emqx_eviction_agent_conn_sup, #{
        id => ChildId,
        start => {?MODULE, start_link, [Opts]},
        restart => temporary,
        shutdown => 5000,
        type => worker,
        modules => [?MODULE]
    }).
%% Start a linked, unregistered channel process; `Opts' is passed to init/1
%% wrapped in a single-element list.
-spec start_link(opts()) -> startlink_ret().
start_link(Opts) ->
    InitArgs = [Opts],
    gen_server:start_link(?MODULE, InitArgs, []).
%% Send an asynchronous request to the channel process.
-spec cast(pid(), term()) -> ok.
cast(ChanPid, Msg) ->
    ok = gen_server:cast(ChanPid, Msg).
%% Synchronous request to the channel process, waiting indefinitely.
-spec call(pid(), term()) -> term().
call(ChanPid, Request) ->
    gen_server:call(ChanPid, Request, infinity).

%% Synchronous request to the channel process with an explicit timeout.
-spec call(pid(), term(), timeout()) -> term().
call(ChanPid, Request, Timeout) ->
    gen_server:call(ChanPid, Request, Timeout).
%% Stop the channel process with reason `normal' and wait for it to terminate.
-spec stop(pid()) -> ok.
stop(ChanPid) ->
    gen_server:stop(ChanPid, normal, infinity).
%%--------------------------------------------------------------------
%% gen_server API
%%--------------------------------------------------------------------
%% gen_server init callback.
%% Reduces the supplied conninfo/clientinfo to the fields this module keeps,
%% opens the existing session and arms the expiry timer. On success the
%% channel is marked as disconnected in emqx_cm and the process hibernates;
%% any `{error, Reason}' from either step stops the process with `Reason'.
init([#{conninfo := ConnInfo0, clientinfo := #{clientid := ClientId} = ClientInfo0}]) ->
    process_flag(trap_exit, true),
    ClientInfo = clientinfo(ClientInfo0),
    ConnInfo = conninfo(ConnInfo0),
    Prepared =
        case open_session(ConnInfo, ClientInfo) of
            {ok, Opened} -> set_expiry_timer(Opened);
            {error, _} = OpenError -> OpenError
        end,
    case Prepared of
        {ok, Channel} ->
            ?SLOG(info, #{msg => "channel_initialized", clientid => ClientId, node => node()}),
            ok = emqx_cm:mark_channel_disconnected(self()),
            {ok, Channel, hibernate};
        {error, Reason} ->
            {stop, Reason}
    end.
%% gen_server call handler.
%%   kick / discard      - reply `ok' and stop with reason kicked / discarded
%%   {takeover, 'begin'} - reply with the session and flag the takeover
%%   {takeover, 'end'}   - reply with all pending deliveries and stop normally
%%   list_acl_cache      - always replies with an empty list
%%   {quota, _}          - always replies `ok'
%% Anything else is logged and answered with `ignored'.
handle_call(kick, _From, State) ->
    {stop, kicked, ok, State};
handle_call(discard, _From, State) ->
    {stop, discarded, ok, State};
handle_call({takeover, 'begin'}, _From, State = #{session := Session}) ->
    {reply, Session, State#{takeover => true}};
handle_call(
    {takeover, 'end'},
    _From,
    State = #{
        session := Session,
        clientinfo := #{clientid := ClientId},
        pendings := Pendings
    }
) ->
    ok = emqx_session:takeover(Session),
    %% TODO: Should not drain deliver here (side effect)
    Drained = emqx_utils:drain_deliver(),
    %% Messages drained from the mailbox go before the stored pendings.
    Reply = Drained ++ Pendings,
    ?tp(debug, emqx_channel_takeover_end, #{clientid => ClientId}),
    {stop, normal, Reply, State};
handle_call(list_acl_cache, _From, State) ->
    {reply, [], State};
handle_call({quota, _Policy}, _From, State) ->
    {reply, ok, State};
handle_call(Unexpected, _From, State) ->
    ?SLOG(error, #{msg => "unexpected_call", req => Unexpected}),
    {reply, ignored, State}.
%% gen_server info handler.
%% A `deliver' message is batched with every other deliver currently in the
%% mailbox and handed to handle_deliver/2; `expire_session' stops the process
%% with reason `expired'; anything else is logged and ignored.
handle_info({deliver, _Topic, _Msg} = First, State) ->
    Rest = emqx_utils:drain_deliver(),
    {noreply, handle_deliver([First | Rest], State)};
handle_info(expire_session, State) ->
    {stop, expired, State};
handle_info(Unexpected, State) ->
    ?SLOG(error, #{msg => "unexpected_info", info => Unexpected}),
    {noreply, State}.
%% gen_server cast handler: no casts are supported, so every one is logged as
%% unexpected and the state is left unchanged.
handle_cast(Unexpected, State) ->
    ?SLOG(
        error,
        #{
            msg => "unexpected_cast",
            cast => Unexpected
        }
    ),
    {noreply, State}.
%% gen_server terminate callback.
%% Cancels the expiry timer, persists the session only when the process stops
%% because the session expired, and finally terminates the session.
terminate(Reason, State = #{conninfo := ConnInfo, clientinfo := ClientInfo, session := Session}) ->
    ok = cancel_expiry_timer(State),
    _ =
        case Reason of
            expired -> emqx_persistent_session:persist(ClientInfo, ConnInfo, Session);
            _ -> false
        end,
    emqx_session:terminate(ClientInfo, Reason, Session).
%% gen_server code_change callback: the state needs no conversion.
code_change(_FromVsn, State, _Extra) ->
    {ok, State}.
%%--------------------------------------------------------------------
%% Internal functions
%%--------------------------------------------------------------------
%% Process a batch of `deliver' messages.
%% While a takeover is in progress the filtered messages are appended to the
%% channel's pendings; otherwise they are enqueued into the session, which is
%% then persisted and stored back into the channel.
handle_deliver(
    Delivers,
    Channel = #{
        takeover := true,
        pendings := Pendings,
        session := Session,
        clientinfo := #{clientid := ClientId} = ClientInfo
    }
) ->
    %% NOTE: Order is important here. While the takeover is in
    %% progress, the session cannot enqueue messages, since it already
    %% passed on the queue to the new connection in the session state.
    Nacked = emqx_channel:maybe_nack(Delivers),
    Kept = emqx_session:ignore_local(ClientInfo, Nacked, ClientId, Session),
    Channel#{pendings => Pendings ++ Kept};
handle_deliver(
    Delivers,
    Channel = #{
        takeover := false,
        session := Session,
        clientinfo := #{clientid := ClientId} = ClientInfo
    }
) ->
    Kept = emqx_session:ignore_local(
        ClientInfo, emqx_channel:maybe_nack(Delivers), ClientId, Session
    ),
    Updated = persist(emqx_session:enqueue(ClientInfo, Kept, Session), Channel),
    %% We consider queued/dropped messages as delivered since they are now in the session state.
    _ = emqx_channel:maybe_mark_as_delivered(Session, Delivers),
    Updated.
%% Cancel the session expiry timer if the channel holds a timer reference.
%% Always returns `ok', whatever the argument is.
cancel_expiry_timer(Channel) ->
    case Channel of
        #{expiry_timer := Timer} when is_reference(Timer) ->
            _ = erlang:cancel_timer(Timer),
            ok;
        _ ->
            ok
    end.
%% Arm the session expiry timer from the `expiry_interval' in conninfo.
%%   ?UINT_MAX          - no timer is started
%%   Interval > 0       - `expire_session' is sent to self() after
%%                        timer:seconds(Interval) milliseconds
%%   anything else      - `{error, should_be_expired}'
set_expiry_timer(#{conninfo := ConnInfo} = Channel) ->
    set_expiry_timer(maps:get(expiry_interval, ConnInfo), Channel).

%% Clause-per-case worker for set_expiry_timer/1.
set_expiry_timer(?UINT_MAX, Channel) ->
    {ok, Channel};
set_expiry_timer(Interval, Channel) when Interval > 0 ->
    TRef = erlang:send_after(timer:seconds(Interval), self(), expire_session),
    {ok, Channel#{expiry_timer => TRef}};
set_expiry_timer(_Interval, _Channel) ->
    {error, should_be_expired}.
%% Open the client's existing session through emqx_cm (clean session = false).
%% Returns `{error, no_session}' when no session is present. Otherwise the
%% pendings handed over by emqx_cm are merged with the delivers already in the
%% mailbox, filtered and enqueued into the session, the channel info is
%% registered in emqx_cm and `{ok, Channel}' is returned. Errors from emqx_cm
%% are logged and returned unchanged.
open_session(ConnInfo, #{clientid := ClientId} = ClientInfo) ->
    Base = channel(ConnInfo, ClientInfo),
    case emqx_cm:open_session(_CleanSession = false, ClientInfo, ConnInfo) of
        {ok, #{present := false}} ->
            ?SLOG(info, #{msg => "no_session", clientid => ClientId, node => node()}),
            {error, no_session};
        {ok, #{session := Session0, present := true, pendings := HandedOver}} ->
            ?SLOG(info, #{msg => "session_opened", clientid => ClientId, node => node()}),
            Pendings = lists:usort(HandedOver ++ emqx_utils:drain_deliver()),
            Nacked = emqx_channel:maybe_nack(Pendings),
            Kept = emqx_session:ignore_local(ClientInfo, Nacked, ClientId, Session0),
            Session = emqx_session:enqueue(ClientInfo, Kept, Session0),
            Opened = Base#{session => Session},
            ok = emqx_cm:insert_channel_info(ClientId, info(Opened), stats(Opened)),
            ?SLOG(info, #{msg => "channel_info_updated", clientid => ClientId, node => node()}),
            {ok, Opened};
        {error, Reason} = Failure ->
            ?SLOG(error, #{
                msg => "session_open_failed",
                clientid => ClientId,
                node => node(),
                reason => Reason
            }),
            Failure
    end.
%% Build the conninfo of the eviction channel from the original connection's
%% conninfo: only a fixed set of keys is kept, the connection module is set
%% to this module, the connection is flagged as not connected, and
%% `disconnected_at' defaults to the current system time in milliseconds when
%% the original did not carry one.
conninfo(OldConnInfo) ->
    KeptKeys = [
        socktype,
        sockname,
        peername,
        peercert,
        clientid,
        clean_start,
        receive_maximum,
        expiry_interval,
        connected_at,
        disconnected_at,
        keepalive
    ],
    Kept = maps:with(KeptKeys, OldConnInfo),
    Now = erlang:system_time(millisecond),
    Kept#{
        conn_mod => ?MODULE,
        connected => false,
        disconnected_at => maps:get(disconnected_at, OldConnInfo, Now)
    }.
%% Reduce the original clientinfo to the fixed set of keys kept by the
%% eviction channel; every other key is dropped.
clientinfo(OldClientInfo) ->
    KeptKeys = [
        zone,
        protocol,
        peerhost,
        sockport,
        clientid,
        username,
        is_bridge,
        is_superuser,
        mountpoint
    ],
    maps:with(KeptKeys, OldClientInfo).
%% Initial channel state: no expiry timer, no takeover in progress, not
%% resuming, and an empty list of pending deliveries.
channel(ConnInfo, ClientInfo) ->
    Defaults = #{
        expiry_timer => undefined,
        takeover => false,
        resuming => false,
        pendings => []
    },
    Defaults#{conninfo => ConnInfo, clientinfo => ClientInfo}.
%% Persist the session and store the value returned by
%% emqx_persistent_session:persist/3 in the channel.
persist(Session, #{clientinfo := ClientInfo, conninfo := ConnInfo} = Channel) ->
    Channel#{session => emqx_persistent_session:persist(ClientInfo, ConnInfo, Session)}.
%% Channel info map registered in emqx_cm. Missing conninfo, clientinfo or
%% session show up as `undefined'; the connection state is always
%% `disconnected'.
info(Channel) ->
    Lookup = fun(Key) -> maps:get(Key, Channel, undefined) end,
    #{
        conninfo => Lookup(conninfo),
        clientinfo => Lookup(clientinfo),
        session => emqx_utils:maybe_apply(fun emqx_session:info/1, Lookup(session)),
        conn_state => disconnected
    }.
%% Channel statistics: the session stats followed by the process-dictionary
%% counters listed in ?CHANNEL_METRICS.
stats(#{session := Session}) ->
    SessionStats = emqx_session:stats(Session),
    Counters = emqx_pd:get_counters(?CHANNEL_METRICS),
    SessionStats ++ Counters.
%% Client id as a binary: binaries pass through, atoms are converted using
%% UTF-8 encoding.
bin_clientid(Bin) when is_binary(Bin) ->
    Bin;
bin_clientid(Atom) when is_atom(Atom) ->
    atom_to_binary(Atom, utf8).

View File

@ -0,0 +1,30 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_cli).
%% APIs
-export([
load/0,
unload/0,
cli/1
]).
%% Register the `eviction' command with emqx_ctl, handled by cli/1.
load() ->
    Handler = {?MODULE, cli},
    emqx_ctl:register_command(eviction, Handler, []).
%% Unregister the `eviction' command from emqx_ctl.
unload() ->
    Command = eviction,
    emqx_ctl:unregister_command(Command).
%% Handler of the `eviction' CLI command.
%% `status' prints whether eviction is enabled on this node; any other
%% argument list prints the usage.
cli(["status"]) ->
    Line =
        case emqx_eviction_agent:status() of
            disabled -> "Eviction status: disabled~n";
            {enabled, _Stats} -> "Eviction status: enabled~n"
        end,
    emqx_ctl:print(Line);
cli(_Other) ->
    Usage = [{"eviction status", "Get current node eviction status"}],
    emqx_ctl:usage(Usage).

View File

@ -0,0 +1,21 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_conn_sup).
-behaviour(supervisor).
-export([start_link/0]).
-export([init/1]).
%% Start the channel supervisor, registered locally under the module name.
start_link() ->
    SupName = {local, ?MODULE},
    supervisor:start_link(SupName, ?MODULE, []).
%% Supervisor callback: a one_for_one supervisor with no static children;
%% children are added at runtime.
init([]) ->
    SupFlags = #{strategy => one_for_one, intensity => 10, period => 3600},
    ChildSpecs = [],
    {ok, {SupFlags, ChildSpecs}}.

View File

@ -0,0 +1,34 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_sup).
-behaviour(supervisor).
-export([start_link/0]).
-export([init/1]).
%% Start the application's top-level supervisor, registered locally under the
%% module name.
start_link() ->
    SupName = {local, ?MODULE},
    supervisor:start_link(SupName, ?MODULE, []).
%% Supervisor callback: supervises the eviction agent worker and the channel
%% supervisor with a one_for_one strategy.
init([]) ->
    SupFlags = #{strategy => one_for_one, intensity => 10, period => 3600},
    Agent = child_spec(worker, emqx_eviction_agent, []),
    ConnSup = child_spec(supervisor, emqx_eviction_agent_conn_sup, []),
    {ok, {SupFlags, [Agent, ConnSup]}}.
%% Child specification of a permanent child started through
%% `Module:start_link' with `StartArgs'; the module name doubles as child id.
child_spec(ChildType, Module, StartArgs) ->
    MFA = {Module, start_link, StartArgs},
    #{
        id => Module,
        start => MFA,
        restart => permanent,
        shutdown => 5000,
        type => ChildType,
        modules => [Module]
    }.

View File

@ -0,0 +1,27 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_proto_v1).
-behaviour(emqx_bpapi).
-export([
introduced_in/0,
evict_session_channel/4
]).
-include_lib("emqx/include/bpapi.hrl").
%% Version string reported to emqx_bpapi as the release that introduced this
%% protocol version.
introduced_in() ->
    Release = "5.0.22",
    Release.
%% Call emqx_eviction_agent:evict_session_channel/3 on `Node' over RPC and
%% return its result, or the rpc error when the call fails.
-spec evict_session_channel(
    node(),
    emqx_types:clientid(),
    emqx_types:conninfo(),
    emqx_types:clientinfo()
) -> supervisor:startchild_err() | emqx_rpc:badrpc().
evict_session_channel(Node, ClientId, ConnInfo, ClientInfo) ->
    Args = [ClientId, ConnInfo, ClientInfo],
    rpc:call(Node, emqx_eviction_agent, evict_session_channel, Args).

View File

@ -0,0 +1,467 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("emqx/include/asserts.hrl").
-import(
emqx_eviction_agent_test_helpers,
[emqtt_connect/0, emqtt_connect/1, emqtt_connect/2]
).
%% Assert that the text produced by evaluating `Code' matches the regular
%% expression `Printed'. Note the argument order: `Code' is the subject
%% passed to re:run/2 and `Printed' is the pattern.
-define(assertPrinted(Printed, Code),
?assertMatch(
{match, _},
re:run(Code, Printed)
)
).
%% Common Test callback: the test case list is computed from this module by
%% emqx_common_test_helpers:all/1.
all() ->
    Suite = ?MODULE,
    emqx_common_test_helpers:all(Suite).
%% Start the eviction agent application for the whole suite.
init_per_suite(Config) ->
    Apps = [emqx_eviction_agent],
    emqx_common_test_helpers:start_apps(Apps),
    Config.
%% Stop the application started by init_per_suite/1.
end_per_suite(_Config) ->
    Apps = [emqx_eviction_agent],
    emqx_common_test_helpers:stop_apps(Apps).
%% Per-case setup: disable eviction (result ignored), start snabbkaffe
%% tracing, and start the extra nodes for the cases that need them.
init_per_testcase(TestCase, Config) ->
    _ = emqx_eviction_agent:disable(test_eviction),
    ok = snabbkaffe:start_trace(),
    start_slave(TestCase, Config).
%% Start a two-node cluster for t_explicit_session_takeover and store the
%% node/port pairs under `evacuate_nodes'; other cases get Config unchanged.
start_slave(t_explicit_session_takeover, Config) ->
    NamesWithPorts = [{evacuate_test1, 2883}, {evacuate_test2, 3883}],
    Apps = [emqx_eviction_agent],
    Nodes = emqx_eviction_agent_test_helpers:start_cluster(NamesWithPorts, Apps),
    [{evacuate_nodes, Nodes} | Config];
start_slave(_OtherCase, Config) ->
    Config.
%% Per-case teardown: disable eviction (result ignored), stop snabbkaffe and
%% stop the extra nodes started for the case, if any.
end_per_testcase(Case, Config) ->
    _ = emqx_eviction_agent:disable(test_eviction),
    ok = snabbkaffe:stop(),
    stop_slave(Case, Config).
%% Stop the cluster started by start_slave/2 for t_explicit_session_takeover;
%% nothing to do for any other case.
stop_slave(t_explicit_session_takeover, Config) ->
    Nodes = ?config(evacuate_nodes, Config),
    emqx_eviction_agent_test_helpers:stop_cluster(Nodes, [emqx_eviction_agent]);
stop_slave(_OtherCase, _Config) ->
    ok.
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% Enabling and disabling the agent: only the kind that enabled it may
%% re-enable or disable it, and clients are refused with
%% `use_another_server' while it is enabled.
t_enable_disable(_Config) ->
    erlang:process_flag(trap_exit, true),

    ?assertMatch(disabled, emqx_eviction_agent:status()),

    %% Connections are accepted while disabled.
    {ok, Before} = emqtt_connect(),
    ok = emqtt:disconnect(Before),

    ok = emqx_eviction_agent:enable(test_eviction, undefined),

    %% Another kind cannot enable; the same kind can enable again.
    ?assertMatch({error, eviction_agent_busy}, emqx_eviction_agent:enable(bar, undefined)),
    ?assertMatch(ok, emqx_eviction_agent:enable(test_eviction, <<"srv">>)),
    ?assertMatch({enabled, #{}}, emqx_eviction_agent:status()),

    %% New connections are refused while enabled.
    ?assertMatch({error, {use_another_server, #{}}}, emqtt_connect()),

    %% Another kind cannot disable; disabling twice reports `disabled'.
    ?assertMatch({error, eviction_agent_busy}, emqx_eviction_agent:disable(bar)),
    ?assertMatch(ok, emqx_eviction_agent:disable(test_eviction)),
    ?assertMatch({error, disabled}, emqx_eviction_agent:disable(test_eviction)),
    ?assertMatch(disabled, emqx_eviction_agent:status()),

    %% Connections are accepted again.
    {ok, After} = emqtt_connect(),
    ok = emqtt:disconnect(After).
%% The status stats must report one connection before eviction and none
%% after it; evicting while the agent is disabled is rejected.
t_evict_connections_status(_Config) ->
    erlang:process_flag(trap_exit, true),
    {ok, _C} = emqtt_connect(),
    {error, disabled} = emqx_eviction_agent:evict_connections(1),
    ok = emqx_eviction_agent:enable(test_eviction, undefined),
    ?assertMatch(
        {enabled, #{connections := 1, sessions := _}},
        emqx_eviction_agent:status()
    ),
    %% Wait for the connection count to actually drop instead of sleeping for
    %% a fixed time, as the other eviction cases in this suite do.
    ?assertWaitEvent(
        ok = emqx_eviction_agent:evict_connections(1),
        #{?snk_kind := emqx_cm_connected_client_count_dec},
        1000
    ),
    ?assertMatch(
        {enabled, #{connections := 0, sessions := _}},
        emqx_eviction_agent:status()
    ),
    ok = emqx_eviction_agent:disable(test_eviction).
%% End-to-end session evacuation on the two-node cluster started for this
%% case: a client with a persistent session connects to Node1, its connection
%% is evicted, the session is evicted first to Node1 itself and then to
%% Node2, and the messages published to its subscription in between must be
%% delivered once the client reconnects to Node1.
t_explicit_session_takeover(Config) ->
_ = erlang:process_flag(trap_exit, true),
ok = restart_emqx(),
[{Node1, Port1}, {Node2, _Port2}] = ?config(evacuate_nodes, Config),
{ok, C0} = emqtt_connect([
{clientid, <<"client_with_session">>},
{clean_start, false},
{port, Port1}
]),
{ok, _, _} = emqtt:subscribe(C0, <<"t1">>),
ok = rpc:call(Node1, emqx_eviction_agent, enable, [test_eviction, undefined]),
?assertEqual(
1,
rpc:call(Node1, emqx_eviction_agent, connection_count, [])
),
[ChanPid] = rpc:call(Node1, emqx_cm, lookup_channels, [<<"client_with_session">>]),
%% Evict the connection: the client must exit with USE_ANOTHER_SERVER and
%% the connected-client count of that channel must be decremented.
?assertWaitEvent(
begin
ok = rpc:call(Node1, emqx_eviction_agent, evict_connections, [1]),
receive
{'EXIT', C0, {disconnected, ?RC_USE_ANOTHER_SERVER, _}} -> ok
after 1000 ->
?assert(false, "Connection not evicted")
end
end,
#{?snk_kind := emqx_cm_connected_client_count_dec, chan_pid := ChanPid},
2000
),
%% The connection is gone but the session stays on Node1.
?assertEqual(
0,
rpc:call(Node1, emqx_eviction_agent, connection_count, [])
),
?assertEqual(
1,
rpc:call(Node1, emqx_eviction_agent, session_count, [])
),
%% First, evacuate to the same node
?assertWaitEvent(
rpc:call(Node1, emqx_eviction_agent, evict_sessions, [1, Node1]),
#{?snk_kind := emqx_channel_takeover_end, clientid := <<"client_with_session">>},
1000
),
%% Eviction is disabled so that a publisher can connect to Node1.
ok = rpc:call(Node1, emqx_eviction_agent, disable, [test_eviction]),
{ok, C1} = emqtt_connect([{port, Port1}]),
emqtt:publish(C1, <<"t1">>, <<"MessageToEvictedSession1">>),
ok = emqtt:disconnect(C1),
ok = rpc:call(Node1, emqx_eviction_agent, enable, [test_eviction, undefined]),
%% Evacuate to another node
?assertWaitEvent(
rpc:call(Node1, emqx_eviction_agent, evict_sessions, [1, Node2]),
#{?snk_kind := emqx_channel_takeover_end, clientid := <<"client_with_session">>},
1000
),
?assertEqual(
0,
rpc:call(Node1, emqx_eviction_agent, session_count, [])
),
?assertEqual(
1,
rpc:call(Node2, emqx_eviction_agent, session_count, [])
),
ok = rpc:call(Node1, emqx_eviction_agent, disable, [test_eviction]),
%% Session is on Node2, but we connect to Node1
{ok, C2} = emqtt_connect([{port, Port1}]),
emqtt:publish(C2, <<"t1">>, <<"MessageToEvictedSession2">>),
ok = emqtt:disconnect(C2),
ct:sleep(100),
%% Session is on Node2, but we connect the subscribed client to Node1
%% It should take over the session for the third time and receive
%% previously published messages
{ok, C3} = emqtt_connect([
{clientid, <<"client_with_session">>},
{clean_start, false},
{port, Port1}
]),
ok = assert_receive_publish(
[
#{payload => <<"MessageToEvictedSession1">>, topic => <<"t1">>},
#{payload => <<"MessageToEvictedSession2">>, topic => <<"t1">>}
]
),
ok = emqtt:disconnect(C3).
%% The enabled state is not kept across a restart of the agent process: after
%% terminating and restarting the child the status is `disabled' again.
t_disable_on_restart(_Config) ->
    Sup = emqx_eviction_agent_sup,
    Child = emqx_eviction_agent,
    ok = emqx_eviction_agent:enable(test_eviction, undefined),
    ok = supervisor:terminate_child(Sup, Child),
    {ok, _} = supervisor:restart_child(Sup, Child),
    ?assertEqual(disabled, emqx_eviction_agent:status()).
%% A session evicted into an eviction agent channel on the same node must
%% still be counted, be listed by the management API and CLI, and be
%% removable with emqx_cm:kick_session/1.
t_session_serialization(_Config) ->
_ = erlang:process_flag(trap_exit, true),
ok = restart_emqx(),
%% Create a persistent session with one subscription, then disconnect.
{ok, C0} = emqtt_connect(<<"client_with_session">>, false),
{ok, _, _} = emqtt:subscribe(C0, <<"t1">>),
ok = emqtt:disconnect(C0),
ok = emqx_eviction_agent:enable(test_eviction, undefined),
?assertEqual(
1,
emqx_eviction_agent:session_count()
),
%% Evacuate to the same node
?assertWaitEvent(
emqx_eviction_agent:evict_sessions(1, node()),
#{?snk_kind := emqx_channel_takeover_end, clientid := <<"client_with_session">>},
1000
),
ok = emqx_eviction_agent:disable(test_eviction),
?assertEqual(
1,
emqx_eviction_agent:session_count()
),
%% The evicted session must be visible through the clients query API.
?assertMatch(
#{data := [#{clientid := <<"client_with_session">>}]},
emqx_mgmt_api:cluster_query(
emqx_channel_info,
#{},
[],
fun emqx_mgmt_api_clients:qs2ms/2,
fun emqx_mgmt_api_clients:format_channel_info/2
)
),
%% emqx_ctl printing is mocked to return the formatted text, so the CLI
%% output can be matched against the client id.
mock_print(),
?assertPrinted(
"client_with_session",
emqx_mgmt_cli:clients(["list"])
),
?assertPrinted(
"client_with_session",
emqx_mgmt_cli:clients(["show", "client_with_session"])
),
%% Kicking the session must clean it up.
?assertWaitEvent(
emqx_cm:kick_session(<<"client_with_session">>),
#{?snk_kind := emqx_cm_clean_down, client_id := <<"client_with_session">>},
1000
),
?assertEqual(
0,
emqx_eviction_agent:session_count()
).
%% A client disconnected with USE_ANOTHER_SERVER must still have its will
%% message published.
t_will_msg(_Config) ->
    erlang:process_flag(trap_exit, true),
    Payload = <<"will_msg">>,
    Topic = <<"will_topic">>,
    ClientId = <<"client_with_will">>,
    %% Client carrying the will message.
    _ = emqtt_connect([
        {clean_start, false},
        {clientid, ClientId},
        {will_payload, Payload},
        {will_topic, Topic}
    ]),
    %% Subscriber expecting the will message.
    {ok, Subscriber} = emqtt_connect(),
    {ok, _, _} = emqtt:subscribe(Subscriber, Topic),
    [ChanPid] = emqx_cm:lookup_channels(ClientId),
    Disconnect =
        {disconnect, ?RC_USE_ANOTHER_SERVER, use_another_server, #{
            'Server-Reference' => <<>>
        }},
    ChanPid ! Disconnect,
    receive
        {publish, #{payload := Payload, topic := Topic}} ->
            ok
    after 1000 ->
        ct:fail("Will message not received")
    end,
    ok = emqtt:disconnect(Subscriber).
%% WebSocket connections are counted and evicted like any other connection.
t_ws_conn(_Config) ->
    erlang:process_flag(trap_exit, true),
    ClientOpts = [
        {proto_ver, v5},
        {clientid, <<"ws_client">>},
        {port, 8083},
        {ws_path, "/mqtt"}
    ],
    {ok, Client} = emqtt:start_link(ClientOpts),
    {ok, _} = emqtt:ws_connect(Client),
    ok = emqx_eviction_agent:enable(test_eviction, undefined),
    ?assertEqual(1, emqx_eviction_agent:connection_count()),
    ?assertWaitEvent(
        ok = emqx_eviction_agent:evict_connections(1),
        #{?snk_kind := emqx_cm_connected_client_count_dec},
        1000
    ),
    ?assertEqual(0, emqx_eviction_agent:connection_count()).
-ifndef(BUILD_WITHOUT_QUIC).
%% QUIC connections are counted and evicted like any other connection.
%% A QUIC listener is set up on a free port first.
t_quic_conn(_Config) ->
    erlang:process_flag(trap_exit, true),
    Port = emqx_common_test_helpers:select_free_port(quic),
    application:ensure_all_started(quicer),
    emqx_common_test_helpers:ensure_quic_listener(?MODULE, Port),
    ClientOpts = [
        {proto_ver, v5},
        {clientid, <<"quic_client">>},
        {port, Port}
    ],
    {ok, Client} = emqtt:start_link(ClientOpts),
    {ok, _} = emqtt:quic_connect(Client),
    ok = emqx_eviction_agent:enable(test_eviction, undefined),
    ?assertEqual(1, emqx_eviction_agent:connection_count()),
    ?assertWaitEvent(
        ok = emqx_eviction_agent:evict_connections(1),
        #{?snk_kind := emqx_cm_connected_client_count_dec},
        1000
    ),
    ?assertEqual(0, emqx_eviction_agent:connection_count()).
-endif.
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% Expect one `publish' message per listed payload/topic pair, checked in list
%% order with a one-second timeout each. Fails the test on the first missing
%% message; returns `ok' when all were received.
assert_receive_publish(Expected) ->
    lists:foreach(
        fun(#{payload := Msg, topic := Topic}) ->
            receive
                {publish, #{payload := Msg, topic := Topic}} ->
                    ok
            after 1000 ->
                ?assert(false, "Message `" ++ binary_to_list(Msg) ++ "` is lost")
            end
        end,
        Expected
    ).
%% Publish one message through a short-lived default client.
connect_and_publish(Topic, Message) ->
    {ok, Publisher} = emqtt_connect(),
    emqtt:publish(Publisher, Topic, Message),
    ok = emqtt:disconnect(Publisher).
%% Stop and start `emqx', then `emqx_eviction_agent'. The results of the
%% individual stop/start calls are ignored.
restart_emqx() ->
    lists:foreach(
        fun(App) ->
            _ = application:stop(App),
            _ = application:start(App)
        end,
        [emqx, emqx_eviction_agent]
    ).
%% Mock emqx_ctl so that print/1,2 and usage/1,2 return the formatted text
%% (through emqx_ctl's own format functions) instead of printing it. Any
%% previous mock of the module is unloaded first.
mock_print() ->
    Mod = emqx_ctl,
    catch meck:unload(Mod),
    meck:new(Mod, [non_strict, passthrough]),
    meck:expect(Mod, print, fun(Arg) -> emqx_ctl:format(Arg, []) end),
    meck:expect(Mod, print, fun(Msg, Arg) -> emqx_ctl:format(Msg, Arg) end),
    meck:expect(Mod, usage, fun(Usages) -> emqx_ctl:format_usage(Usages) end),
    meck:expect(Mod, usage, fun(Cmd, Descr) -> emqx_ctl:format_usage(Cmd, Descr) end).

View File

@ -0,0 +1,69 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_api_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-import(
emqx_mgmt_api_test_util,
[
request_api/2,
uri/1
]
).
%% Common Test callback: the test case list is computed from this module by
%% emqx_common_test_helpers:all/1.
all() ->
    Suite = ?MODULE,
    emqx_common_test_helpers:all(Suite).
%% Set up the management API test environment with the eviction agent.
init_per_suite(Config) ->
    Apps = [emqx_eviction_agent],
    emqx_mgmt_api_test_util:init_suite(Apps),
    Config.
%% Tear down what init_per_suite/1 set up.
end_per_suite(Config) ->
    Apps = [emqx_eviction_agent],
    emqx_mgmt_api_test_util:end_suite(Apps),
    Config.
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% GET node_eviction/status follows the agent state: disabled, then enabled
%% with stats, then disabled again.
t_status(_Config) ->
    StatusPath = ["node_eviction", "status"],
    ?assertMatch(
        {ok, #{<<"status">> := <<"disabled">>}},
        api_get(StatusPath)
    ),
    ok = emqx_eviction_agent:enable(apitest, undefined),
    ?assertMatch(
        {ok, #{<<"status">> := <<"enabled">>, <<"stats">> := #{}}},
        api_get(StatusPath)
    ),
    ok = emqx_eviction_agent:disable(apitest),
    ?assertMatch(
        {ok, #{<<"status">> := <<"disabled">>}},
        api_get(StatusPath)
    ).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% GET the given API path and decode the JSON response body into maps.
%% Request errors are returned unchanged.
api_get(Path) ->
    Url = uri(Path),
    case request_api(get, Url) of
        {error, _} = Failure ->
            Failure;
        {ok, Body} ->
            {ok, jiffy:decode(list_to_binary(Body), [return_maps])}
    end.

View File

@ -0,0 +1,251 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_channel_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-include_lib("emqx/include/emqx_mqtt.hrl").
-define(CLIENT_ID, <<"client_with_session">>).
-import(
emqx_eviction_agent_test_helpers,
[emqtt_connect/0, emqtt_connect/2]
).
%% Common Test callback: the test case list is computed from this module by
%% emqx_common_test_helpers:all/1.
all() ->
    Suite = ?MODULE,
    emqx_common_test_helpers:all(Suite).
%% Start emqx_conf and the eviction agent, then switch RPC port discovery to
%% `manual'.
init_per_suite(Config) ->
    Apps = [emqx_conf, emqx_eviction_agent],
    emqx_common_test_helpers:start_apps(Apps),
    {ok, _} = emqx:update_config([rpc, port_discovery], manual),
    Config.
%% Stop the applications started by init_per_suite/1, in reverse order.
end_per_suite(_Config) ->
    Apps = [emqx_eviction_agent, emqx_conf],
    emqx_common_test_helpers:stop_apps(Apps).
%% Per-case setup. t_persistence needs the persistent session store: it is
%% enabled in the config, its supervisor is started, the DB backend is
%% initialised, and the store is asserted to be enabled. Other cases need no
%% setup.
init_per_testcase(t_persistence, Config) ->
emqx_config:put([persistent_session_store, enabled], true),
{ok, _} = emqx_persistent_session_sup:start_link(),
emqx_persistent_session:init_db_backend(),
?assert(emqx_persistent_session:is_store_enabled()),
Config;
init_per_testcase(_TestCase, Config) ->
Config.
%% Per-case teardown. For t_persistence the persistent session store is
%% disabled in the config again, the DB backend is re-initialised, and the
%% store is asserted to be disabled. Other cases need no teardown.
end_per_testcase(t_persistence, Config) ->
emqx_config:put([persistent_session_store, enabled], false),
emqx_persistent_session:init_db_backend(),
?assertNot(emqx_persistent_session:is_store_enabled()),
Config;
end_per_testcase(_TestCase, _Config) ->
ok.
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% Starting a channel for a client without a session fails with `no_session'.
t_start_no_session(_Config) ->
    ClientInfo = #{clientid => ?CLIENT_ID, zone => internal},
    ConnInfo = #{
        clientid => ?CLIENT_ID,
        receive_maximum => 32,
        expiry_interval => 10000
    },
    Opts = #{clientinfo => ClientInfo, conninfo => ConnInfo},
    ?assertMatch(
        {error, {no_session, _}},
        emqx_eviction_agent_channel:start_supervised(Opts)
    ).
%% With an expiry interval of 0 the channel refuses to start with
%% `should_be_expired'.
t_start_no_expire(_Config) ->
    erlang:process_flag(trap_exit, true),
    _ = emqtt_connect(?CLIENT_ID, false),
    ClientInfo = #{clientid => ?CLIENT_ID, zone => internal},
    ConnInfo = #{
        clientid => ?CLIENT_ID,
        receive_maximum => 32,
        expiry_interval => 0
    },
    Opts = #{clientinfo => ClientInfo, conninfo => ConnInfo},
    ?assertMatch(
        {error, {should_be_expired, _}},
        emqx_eviction_agent_channel:start_supervised(Opts)
    ).
%% With an expiry interval of ?UINT_MAX the channel starts successfully.
t_start_infinite_expire(_Config) ->
    erlang:process_flag(trap_exit, true),
    _ = emqtt_connect(?CLIENT_ID, false),
    ClientInfo = #{clientid => ?CLIENT_ID, zone => internal},
    ConnInfo = #{
        clientid => ?CLIENT_ID,
        receive_maximum => 32,
        expiry_interval => ?UINT_MAX
    },
    Opts = #{clientinfo => ClientInfo, conninfo => ConnInfo},
    ?assertMatch(
        {ok, _},
        emqx_eviction_agent_channel:start_supervised(Opts)
    ).
%% A `kick' call is answered with `ok'.
t_kick(_Config) ->
    erlang:process_flag(trap_exit, true),
    _ = emqtt_connect(?CLIENT_ID, false),
    {ok, Channel} = emqx_eviction_agent_channel:start_supervised(
        evict_session_opts(?CLIENT_ID)
    ),
    ?assertEqual(ok, emqx_eviction_agent_channel:call(Channel, kick)).
%% A `discard' call is answered with `ok'.
t_discard(_Config) ->
    erlang:process_flag(trap_exit, true),
    _ = emqtt_connect(?CLIENT_ID, false),
    {ok, Channel} = emqx_eviction_agent_channel:start_supervised(
        evict_session_opts(?CLIENT_ID)
    ),
    ?assertEqual(ok, emqx_eviction_agent_channel:call(Channel, discard)).
%% stop/1 on a running channel returns `ok'.
t_stop(_Config) ->
    erlang:process_flag(trap_exit, true),
    _ = emqtt_connect(?CLIENT_ID, false),
    {ok, Channel} = emqx_eviction_agent_channel:start_supervised(
        evict_session_opts(?CLIENT_ID)
    ),
    ?assertEqual(ok, emqx_eviction_agent_channel:stop(Channel)).
%% Unknown casts and messages do not disturb the channel, and the stubbed or
%% unknown calls get their fixed replies.
t_ignored_calls(_Config) ->
    erlang:process_flag(trap_exit, true),
    _ = emqtt_connect(?CLIENT_ID, false),
    {ok, Channel} = emqx_eviction_agent_channel:start_supervised(
        evict_session_opts(?CLIENT_ID)
    ),
    ok = emqx_eviction_agent_channel:cast(Channel, unknown),
    Channel ! unknown,
    ?assertEqual([], emqx_eviction_agent_channel:call(Channel, list_acl_cache)),
    ?assertEqual(ok, emqx_eviction_agent_channel:call(Channel, {quota, quota})),
    ?assertEqual(ignored, emqx_eviction_agent_channel:call(Channel, unknown)).
%% A channel started with an expiry interval of 1 must terminate on its own
%% within 1.5 seconds.
t_expire(_Config) ->
    erlang:process_flag(trap_exit, true),
    _ = emqtt_connect(?CLIENT_ID, false),
    #{conninfo := ConnInfo} = Opts0 = evict_session_opts(?CLIENT_ID),
    Opts1 = Opts0#{conninfo => ConnInfo#{expiry_interval => 1}},
    {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts1),
    %% Monitor the channel instead of sleeping unconditionally: the case ends
    %% as soon as the process exits, and keeps the original 1500 ms deadline.
    MRef = erlang:monitor(process, Pid),
    receive
        {'DOWN', MRef, process, Pid, _Reason} ->
            ok
    after 1500 ->
        ct:fail("Channel did not expire in time")
    end,
    ?assertNot(is_process_alive(Pid)).
%% Once the eviction channel is started for a connected client, that client
%% no longer counts as connected.
t_get_connected_client_count(_Config) ->
    erlang:process_flag(trap_exit, true),
    _ = emqtt_connect(?CLIENT_ID, false),
    ?assertEqual(1, emqx_cm:get_connected_client_count()),
    {ok, _} = emqx_eviction_agent_channel:start_supervised(
        evict_session_opts(?CLIENT_ID)
    ),
    ?assertEqual(0, emqx_cm:get_connected_client_count()).
%% A message published while the session lives in an eviction channel must
%% survive the channel being kicked and reach the client when it reconnects.
t_persistence(_Config) ->
    erlang:process_flag(trap_exit, true),

    Topic = <<"t1">>,
    Message = <<"message_to_persist">>,

    {ok, Subscriber} = emqtt_connect(?CLIENT_ID, false),
    {ok, _, _} = emqtt:subscribe(Subscriber, Topic, 0),

    {ok, Channel} = emqx_eviction_agent_channel:start_supervised(
        evict_session_opts(?CLIENT_ID)
    ),

    {ok, Publisher} = emqtt_connect(),
    {ok, _} = emqtt:publish(Publisher, Topic, Message, 1),
    ok = emqtt:disconnect(Publisher),

    %% Kill channel so that the session is only persisted
    ok = emqx_eviction_agent_channel:call(Channel, kick),

    %% Should restore session from persistent storage and receive messages
    {ok, Reconnected} = emqtt_connect(?CLIENT_ID, false),
    receive
        {publish, #{payload := Message, topic := Topic}} ->
            ok
    after 1000 ->
        ct:fail("message not received")
    end,
    ok = emqtt:disconnect(Reconnected).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% Channel start options for a client: the `conninfo' and `clientinfo' of its
%% channel info as registered in emqx_cm.
evict_session_opts(ClientId) ->
    ChanInfo = emqx_cm:get_chan_info(ClientId),
    maps:with([conninfo, clientinfo], ChanInfo).

View File

@ -0,0 +1,39 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_cli_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
%% Common Test callback: the test case list is computed from this module by
%% emqx_common_test_helpers:all/1.
all() ->
    Suite = ?MODULE,
    emqx_common_test_helpers:all(Suite).
%% Start the eviction agent application for the whole suite.
init_per_suite(Config) ->
    Apps = [emqx_eviction_agent],
    emqx_common_test_helpers:start_apps(Apps),
    Config.
%% Disable the eviction enabled by t_status (result ignored) and stop the
%% application.
end_per_suite(Config) ->
    _ = emqx_eviction_agent:disable(foo),
    Apps = [emqx_eviction_agent],
    emqx_common_test_helpers:stop_apps(Apps),
    Config.
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% The CLI handler returns `ok' for the usage fallback and for `status', both
%% while eviction is disabled and after it has been enabled.
t_status(_Config) ->
    Status = ["status"],
    %% unknown command: prints usage
    ok = emqx_eviction_agent_cli:cli(["foobar"]),
    %% status while disabled
    ok = emqx_eviction_agent_cli:cli(Status),
    ok = emqx_eviction_agent:enable(foo, undefined),
    %% status while enabled
    ok = emqx_eviction_agent_cli:cli(Status).

View File

@ -0,0 +1,134 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_eviction_agent_test_helpers).
-export([
emqtt_connect/0,
emqtt_connect/1,
emqtt_connect/2,
emqtt_connect_many/2,
stop_many/1,
emqtt_try_connect/1,
start_cluster/2,
start_cluster/3,
stop_cluster/2,
case_specific_node_name/2,
case_specific_node_name/3,
concat_atoms/1
]).
%% Connect the default test client: id <<"client1">> with clean start.
emqtt_connect() ->
    DefaultClientId = <<"client1">>,
    emqtt_connect(DefaultClientId, true).
%% Connect a client with the given client id and clean-start flag.
emqtt_connect(ClientId, CleanStart) ->
    Opts = [{clientid, ClientId}, {clean_start, CleanStart}],
    emqtt_connect(Opts).
%% Start an emqtt client with `Opts' followed by the defaults (MQTT v5 and a
%% Session-Expiry-Interval of 600) and connect it.
%% Returns `{ok, Client}' or the `{error, _}' reported by emqtt:connect/1.
emqtt_connect(Opts) ->
    Defaults = [
        {proto_ver, v5},
        {properties, #{'Session-Expiry-Interval' => 600}}
    ],
    {ok, Client} = emqtt:start_link(Opts ++ Defaults),
    case emqtt:connect(Client) of
        {error, _} = Failure -> Failure;
        {ok, _} -> {ok, Client}
    end.
%% Connect `Count' clients named <<"client-1">> .. <<"client-Count">> to
%% `Port' without clean start and return the client pids in that order.
emqtt_connect_many(Port, Count) ->
    Connect = fun(Index) ->
        ClientId = iolist_to_binary(["client-", integer_to_binary(Index)]),
        {ok, Client} = emqtt_connect([
            {clientid, ClientId}, {clean_start, false}, {port, Port}
        ]),
        Client
    end,
    lists:map(Connect, lists:seq(1, Count)).
%% Disconnect every client, ignoring failures for individual clients, then
%% sleep for 100 ms.
stop_many(Clients) ->
    Disconnect = fun(Client) -> catch emqtt:disconnect(Client) end,
    lists:foreach(Disconnect, Clients),
    ct:sleep(100).
%% Check that a client can connect with `Opts': on success the client is
%% disconnected again and `ok' is returned, otherwise the connect error.
emqtt_try_connect(Opts) ->
    case emqtt_connect(Opts) of
        {error, _} = Failure ->
            Failure;
        {ok, Client} ->
            emqtt:disconnect(Client),
            ok
    end.
%% start_cluster/3 without extra application environment.
start_cluster(NamesWithPorts, Apps) ->
    NoExtraEnv = [],
    start_cluster(NamesWithPorts, Apps, NoExtraEnv).
%% Start a cluster of core nodes, one per `{ShortName, TcpPort}' pair, each
%% running `Apps' with `Env' appended to the default environment. The ssl, ws
%% and wss default listeners are disabled and RPC runs in async mode.
%% Returns a list of `{Node, Port}' pairs.
start_cluster(NamesWithPorts, Apps, Env) ->
    ToSpec = fun({ShortName, Port}) ->
        {core, ShortName, #{listener_ports => [{tcp, Port}]}}
    end,
    Specs = lists:map(ToSpec, NamesWithPorts),
    BaseEnv = [{emqx, boot_modules, [broker, listeners]}],
    DisabledListeners =
        [{[listeners, Proto, default, enabled], false} || Proto <- [ssl, ws, wss]],
    ClusterOpts = [
        {env, BaseEnv ++ Env},
        {apps, Apps},
        {conf, DisabledListeners ++ [{[rpc, mode], async}]}
    ],
    Cluster = emqx_common_test_helpers:emqx_cluster(Specs, ClusterOpts),
    [
        {
            emqx_common_test_helpers:start_slave(Name, NodeOpts),
            proplists:get_value(Name, NamesWithPorts)
        }
     || {Name, NodeOpts} <- Cluster
    ].
%% Stop `Apps' on every node of the cluster, then stop the node itself.
%% Results of the remote calls are ignored.
stop_cluster(NodesWithPorts, Apps) ->
    StopNode = fun({Node, _Port}) ->
        StopApp = fun(App) -> rpc:call(Node, application, stop, [App]) end,
        lists:foreach(StopApp, Apps),
        %% This sleep is just to make logs cleaner
        ct:sleep(100),
        _ = rpc:call(Node, emqx_common_test_helpers, stop_apps, []),
        emqx_common_test_helpers:stop_slave(Node)
    end,
    lists:foreach(StopNode, NodesWithPorts).
%% Node name unique to a test case: `Module__Case'.
case_specific_node_name(Module, Case) ->
    Parts = [Module, '__', Case],
    concat_atoms(Parts).

%% Node name unique to a test case and node: `Module__Case__Node'.
case_specific_node_name(Module, Case, Node) ->
    Parts = [Module, '__', Case, '__', Node],
    concat_atoms(Parts).
%% Concatenate the text of the given atoms into a single atom.
concat_atoms(Atoms) ->
    Parts = lists:map(fun(Atom) -> atom_to_binary(Atom, utf8) end, Atoms),
    binary_to_atom(iolist_to_binary(Parts), utf8).

View File

@ -155,7 +155,9 @@ basic_reboot_apps() ->
CE ++
[
emqx_s3,
emqx_ft
emqx_ft,
emqx_eviction_agent,
emqx_node_rebalance
]
end.

View File

@ -0,0 +1,94 @@
Business Source License 1.1
Licensor: Hangzhou EMQ Technologies Co., Ltd.
Licensed Work: EMQX Enterprise Edition
The Licensed Work is (c) 2023
Hangzhou EMQ Technologies Co., Ltd.
Additional Use Grant: Students and educators are granted right to copy,
modify, and create derivative work for research
or education.
Change Date: 2027-02-01
Change License: Apache License, Version 2.0
For information about alternative licensing arrangements for the Software,
please contact Licensor: https://www.emqx.com/en/contact
Notice
The Business Source License (this document, or the “License”) is not an Open
Source license. However, the Licensed Work will eventually be made available
under an Open Source License, as stated in this License.
License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved.
“Business Source License” is a trademark of MariaDB Corporation Ab.
-----------------------------------------------------------------------------
Business Source License 1.1
Terms
The Licensor hereby grants you the right to copy, modify, create derivative
works, redistribute, and make non-production use of the Licensed Work. The
Licensor may make an Additional Use Grant, above, permitting limited
production use.
Effective on the Change Date, or the fourth anniversary of the first publicly
available distribution of a specific version of the Licensed Work under this
License, whichever comes first, the Licensor hereby grants you rights under
the terms of the Change License, and the rights granted in the paragraph
above terminate.
If your use of the Licensed Work does not comply with the requirements
currently in effect as described in this License, you must purchase a
commercial license from the Licensor, its affiliated entities, or authorized
resellers, or you must refrain from using the Licensed Work.
All copies of the original and modified Licensed Work, and derivative works
of the Licensed Work, are subject to this License. This License applies
separately for each version of the Licensed Work and the Change Date may vary
for each version of the Licensed Work released by Licensor.
You must conspicuously display this License on each original or modified copy
of the Licensed Work. If you receive the Licensed Work in original or
modified form from a third party, the terms and conditions set forth in this
License apply to your use of that work.
Any use of the Licensed Work in violation of this License will automatically
terminate your rights under this License for the current and all other
versions of the Licensed Work.
This License does not grant you any right in any trademark or logo of
Licensor or its affiliates (provided that you may use a trademark or logo of
Licensor as expressly required by this License).
TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON
AN “AS IS” BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS,
EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND
TITLE.
MariaDB hereby grants you permission to use this License’s text to license
your works, and to refer to it using the trademark “Business Source License”,
as long as you comply with the Covenants of Licensor below.
Covenants of Licensor
In consideration of the right to use this License’s text and the “Business
Source License” name and trademark, Licensor covenants to MariaDB, and to all
other recipients of the licensed work to be provided by Licensor:
1. To specify as the Change License the GPL Version 2.0 or any later version,
or a license that is compatible with GPL Version 2.0 or a later version,
where “compatible” means that software provided under the Change License can
be included in a program with software provided under GPL Version 2.0 or a
later version. Licensor may specify additional Change Licenses without
limitation.
2. To either: (a) specify an additional grant of rights to use that does not
impose any additional restriction on the right granted in this License, as
the Additional Use Grant; or (b) insert the text “None”.
3. To specify a Change Date.
4. Not to modify this License in any other way.

View File

@ -0,0 +1,40 @@
# EMQX Node Rebalance
`emqx_node_rebalance` is a part of the node evacuation/node rebalance feature in EMQX.
It implements high-level scenarios for node evacuation and rebalancing.
## Application Responsibilities
`emqx_node_rebalance` application's core concept is a _rebalance coordinator_.
_Rebalance coordinator_ is an entity that implements the rebalancing logic and orchestrates the rebalancing process.
In particular, it:
* Enables/Disables Eviction Agent on nodes.
* Sends connection/session eviction commands to Eviction Agents according to the evacuation logic.
We have two implementations of the _rebalance coordinator_:
* `emqx_node_rebalance` - a coordinator that implements node rebalancing;
* `emqx_node_rebalance_evacuation` - a coordinator that implements node evacuation.
## EMQX Integration
`emqx_node_rebalance` is a high-level application that is loosely coupled with the rest of the system.
It uses Eviction Agent to perform the required operations.
## User Facing API
The application provides API (CLI and HTTP) to perform the following operations:
* Start/Stop rebalancing across a set of nodes or the whole cluster;
* Start/Stop evacuation of a node;
* Get the current rebalancing status of a local node.
* Get the current rebalancing status of the whole cluster.
Also, an HTTP endpoint is provided for liveness probes.
# Documentation
The rebalancing concept is described in the corresponding [EIP](https://github.com/emqx/eip/blob/main/active/0020-node-rebalance.md).
# Contributing
Please see our [contributing.md](../../CONTRIBUTING.md).

View File

@ -0,0 +1,3 @@
##--------------------------------------------------------------------
## EMQX Node Rebalance Plugin
##--------------------------------------------------------------------

View File

@ -0,0 +1,21 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
%% Defaults for the rebalance options; emqx_node_rebalance:default_opts/0
%% builds its option map from these macros.

%% Default connection / session eviction rates (the value handed to the
%% eviction calls on each eviction round).
-define(DEFAULT_CONN_EVICT_RATE, 500).
-define(DEFAULT_SESS_EVICT_RATE, 500).
%% sec
-define(DEFAULT_WAIT_HEALTH_CHECK, 60).
%% sec
-define(DEFAULT_WAIT_TAKEOVER, 60).
%% Absolute thresholds: additive slack on top of the goal (recipient) average.
-define(DEFAULT_ABS_CONN_THRESHOLD, 1000).
-define(DEFAULT_ABS_SESS_THRESHOLD, 1000).
%% Relative thresholds: multiplicative slack applied to the goal average.
-define(DEFAULT_REL_CONN_THRESHOLD, 1.1).
-define(DEFAULT_REL_SESS_THRESHOLD, 1.1).
%% Delay between eviction rounds. The coordinator passes it unchanged as a
%% gen_statem state_timeout, so the unit is milliseconds.
-define(EVICT_INTERVAL, 1000).
%% NOTE(review): not referenced by the code visible here; presumably the name
%% of the file in which evacuation state is persisted -- confirm.
-define(EVACUATION_FILENAME, <<".evacuation">>).

View File

@ -0,0 +1,2 @@
%% The core emqx application is the only declared dependency; it is resolved
%% by relative path inside the umbrella project.
{deps, [{emqx, {path, "../../apps/emqx"}}]}.
%% erlfmt is loaded as a project plugin.
{project_plugins, [erlfmt]}.

View File

@ -0,0 +1,21 @@
%% OTP application resource file for emqx_node_rebalance.
{application, emqx_node_rebalance, [
{description, "EMQX Node Rebalance"},
{vsn, "5.0.0"},
%% Names under which the application's processes register locally.
{registered, [
emqx_node_rebalance_sup,
emqx_node_rebalance,
emqx_node_rebalance_agent,
emqx_node_rebalance_evacuation
]},
%% NOTE(review): only kernel and stdlib are declared, although the modules of
%% this application call into emqx and emqx_eviction_agent. Confirm that the
%% start order is guaranteed elsewhere, or list those applications here.
{applications, [
kernel,
stdlib
]},
%% Application callback module and its start argument.
{mod, {emqx_node_rebalance_app, []}},
{env, []},
{modules, []},
{links, [
{"Homepage", "https://www.emqx.com/"},
{"Github", "https://github.com/emqx"}
]}
]}.

View File

@ -0,0 +1,3 @@
%% -*- mode: erlang -*-
%% Unless you know what you are doing, DO NOT edit manually!!
%% Catch-all appup: every version (regex ".*") is matched for both the
%% upgrade and the downgrade direction, each with an empty instruction list.
{VSN, [{<<".*">>, []}], [{<<".*">>, []}]}.

View File

@ -0,0 +1,438 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance).
-include("emqx_node_rebalance.hrl").
-include_lib("emqx/include/logger.hrl").
-include_lib("emqx/include/types.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-export([
start/1,
status/0,
status/1,
stop/0
]).
-export([start_link/0]).
-behaviour(gen_statem).
-export([
init/1,
callback_mode/0,
handle_event/4,
code_change/4
]).
-export([
is_node_available/0,
available_nodes/1,
connection_count/0,
session_count/0,
disconnected_session_count/0
]).
-export_type([
start_opts/0,
start_error/0
]).
%%--------------------------------------------------------------------
%% APIs
%%--------------------------------------------------------------------
%% Options accepted by start/1. Every key is optional: start/1 merges the
%% supplied map over default_opts/0, so missing keys fall back to defaults.
%% `wait_health_check' and `wait_takeover' are given in seconds; `nodes' is
%% the set of nodes taking part in the rebalance.
-type start_opts() :: #{
conn_evict_rate => pos_integer(),
sess_evict_rate => pos_integer(),
wait_health_check => pos_integer(),
wait_takeover => pos_integer(),
abs_conn_threshold => pos_integer(),
rel_conn_threshold => number(),
abs_sess_threshold => pos_integer(),
rel_sess_threshold => number(),
nodes => [node()]
}.
%% Reasons start/1 can be refused with:
%%  * `already_started'    - the coordinator is not in the `disabled' state;
%%  * `nothing_to_balance' - the load is already within the thresholds, or
%%                           there are no donors / no recipients.
-type start_error() :: already_started | nothing_to_balance | [{node(), term()}].

%% Ask the locally registered coordinator to start a rebalance.
%% `StartOpts' is merged over default_opts/0, so callers only pass the
%% options they want to override.
-spec start(start_opts()) -> ok_or_error(start_error()).
start(StartOpts) ->
    Opts = maps:merge(default_opts(), StartOpts),
    gen_statem:call(?MODULE, {start, Opts}).
%% Stop the rebalance run by the local coordinator.
%% Replies `{error, not_started}' when the coordinator is in `disabled' state.
-spec stop() -> ok_or_error(not_started).
stop() ->
gen_statem:call(?MODULE, stop).
%% Status of the locally registered coordinator: `disabled', or
%% `{enabled, Stats}' where Stats also carries `state' and `coordinator_node'.
-spec status() -> disabled | {enabled, map()}.
status() ->
gen_statem:call(?MODULE, status).
%% Same as status/0, but queries the coordinator process `Pid' instead of the
%% locally registered one.
-spec status(pid()) -> disabled | {enabled, map()}.
status(Pid) ->
gen_statem:call(Pid, status).
%% Start the coordinator state machine, registered locally under the module
%% name.
-spec start_link() -> startlink_ret().
start_link() ->
gen_statem:start_link({local, ?MODULE}, ?MODULE, [], []).
%% Return the subset of `Nodes' that answered the availability probe with a
%% node name; any non-atom reply (e.g. an rpc error term) is dropped.
-spec available_nodes(list(node())) -> list(node()).
available_nodes(Nodes) when is_list(Nodes) ->
    {Replies, _} = emqx_node_rebalance_proto_v1:available_nodes(Nodes),
    [Reply || Reply <- Replies, is_atom(Reply)].
%%--------------------------------------------------------------------
%% gen_statem callbacks
%%--------------------------------------------------------------------
%% All events of all states are handled by handle_event/4.
callback_mode() -> handle_event_function.
%% states: disabled, wait_health_check, evicting_conns, wait_takeover, evicting_sessions
%% The coordinator starts in `disabled' with empty data and emits a
%% snabbkaffe trace point on start.
init([]) ->
?tp(debug, emqx_node_rebalance_started, #{}),
{ok, disabled, #{}}.
%% State flow driven by this function:
%%   disabled -> wait_health_check -> evicting_conns -> wait_takeover
%%            -> evicting_sessions -> disabled
%% Transitions between phases are triggered by state_timeout events.

%% start
%% Only accepted in `disabled'. On success the donors' agents are enabled and
%% the health-check wait is armed; on failure the state is left unchanged.
handle_event(
{call, From},
{start, #{wait_health_check := WaitHealthCheck} = Opts},
disabled,
#{} = Data
) ->
case enable_rebalance(Data#{opts => Opts}) of
{ok, NewData} ->
?SLOG(warning, #{msg => "node_rebalance_enabled", opts => Opts}),
{next_state, wait_health_check, NewData, [
{state_timeout, seconds(WaitHealthCheck), evict_conns},
{reply, From, ok}
]};
{error, Reason} ->
?SLOG(warning, #{
msg => "node_rebalance_enable_failed",
reason => Reason
}),
{keep_state_and_data, [{reply, From, {error, Reason}}]}
end;
%% Any other start request (in particular while a rebalance is running).
handle_event({call, From}, {start, _Opts}, _State, #{}) ->
{keep_state_and_data, [{reply, From, {error, already_started}}]};
%% stop
handle_event({call, From}, stop, disabled, #{}) ->
{keep_state_and_data, [{reply, From, {error, not_started}}]};
%% Stopping from any active state disables the donors' agents and drops the
%% per-run data.
handle_event({call, From}, stop, _State, Data) ->
ok = disable_rebalance(Data),
?SLOG(warning, #{msg => "node_rebalance_stopped"}),
{next_state, disabled, deinit(Data), [{reply, From, ok}]};
%% status
handle_event({call, From}, status, disabled, #{}) ->
{keep_state_and_data, [{reply, From, disabled}]};
handle_event({call, From}, status, State, Data) ->
Stats = get_stats(State, Data),
{keep_state_and_data, [
{reply, From,
{enabled, Stats#{
state => State,
coordinator_node => node()
}}}
]};
%% conn eviction
%% Health-check wait is over: enter evicting_conns and fire the first
%% eviction round immediately (timeout 0).
handle_event(
state_timeout,
evict_conns,
wait_health_check,
Data
) ->
?SLOG(warning, #{msg => "node_rebalance_wait_health_check_over"}),
{next_state, evicting_conns, Data, [{state_timeout, 0, evict_conns}]};
%% One connection-eviction round; repeats every EvictInterval until
%% evict_conns/1 reports that the thresholds are met.
handle_event(
state_timeout,
evict_conns,
evicting_conns,
#{
opts := #{
wait_takeover := WaitTakeover,
evict_interval := EvictInterval
}
} = Data
) ->
case evict_conns(Data) of
ok ->
?SLOG(warning, #{msg => "node_rebalance_evict_conns_over"}),
{next_state, wait_takeover, Data, [
{state_timeout, seconds(WaitTakeover), evict_sessions}
]};
{continue, NewData} ->
{keep_state, NewData, [{state_timeout, EvictInterval, evict_conns}]}
end;
%% Takeover wait is over: enter evicting_sessions and fire the first session
%% eviction round immediately.
handle_event(
state_timeout,
evict_sessions,
wait_takeover,
Data
) ->
?SLOG(warning, #{msg => "node_rebalance_wait_takeover_over"}),
{next_state, evicting_sessions, Data, [{state_timeout, 0, evict_sessions}]};
%% One session-eviction round; when the thresholds are met the rebalance is
%% finished and the coordinator returns to `disabled'.
handle_event(
state_timeout,
evict_sessions,
evicting_sessions,
#{opts := #{evict_interval := EvictInterval}} = Data
) ->
case evict_sessions(Data) of
ok ->
?tp(debug, emqx_node_rebalance_evict_sess_over, #{}),
?SLOG(warning, #{msg => "node_rebalance_evict_sessions_over"}),
ok = disable_rebalance(Data),
?SLOG(warning, #{msg => "node_rebalance_finished_successfully"}),
{next_state, disabled, deinit(Data)};
{continue, NewData} ->
{keep_state, NewData, [{state_timeout, EvictInterval, evict_sessions}]}
end;
%% Fallbacks: unknown calls are answered with `ignored'; unknown info and
%% cast messages are logged and dropped.
handle_event({call, From}, Msg, _State, _Data) ->
?SLOG(warning, #{msg => "node_rebalance_unknown_call", call => Msg}),
{keep_state_and_data, [{reply, From, ignored}]};
handle_event(info, Msg, _State, _Data) ->
?SLOG(warning, #{msg => "node_rebalance_unknown_info", info => Msg}),
keep_state_and_data;
handle_event(cast, Msg, _State, _Data) ->
?SLOG(warning, #{msg => "node_rebalance_unknown_cast", cast => Msg}),
keep_state_and_data.
%% No state or data migration is performed on code upgrade.
code_change(_Vsn, State, Data, _Extra) ->
{ok, State, Data}.
%%--------------------------------------------------------------------
%% internal funs
%%--------------------------------------------------------------------
%% Decide who gives and who receives load, and switch the donors' rebalance
%% agents on.
%%
%% Nodes whose connection count is at or above the average over all
%% participating nodes become donors, the rest recipients. Returns
%% `{ok, Data}' extended with donors, recipients and the initial counts, or
%% `{error, nothing_to_balance}' when need_rebalance/5 says no work is needed.
%% Crashes when the configured node list is empty (avg/1 rejects `[]') or
%% when multicall/3 reports a bad node.
enable_rebalance(#{opts := Opts} = Data) ->
    Nodes = maps:get(nodes, Opts),
    ConnCounts = multicall(Nodes, connection_counts, []),
    SessCounts = multicall(Nodes, session_counts, []),
    {_, ConnValues} = lists:unzip(ConnCounts),
    Mean = avg(ConnValues),
    IsDonor = fun({_Node, Count}) -> Count >= Mean end,
    {DonorCounts, RecipientCounts} = lists:partition(IsDonor, ConnCounts),
    ?SLOG(warning, #{
        msg => "node_rebalance_enabling",
        conn_counts => ConnCounts,
        donor_counts => DonorCounts,
        recipient_counts => RecipientCounts
    }),
    {Donors, _} = lists:unzip(DonorCounts),
    {Recipients, _} = lists:unzip(RecipientCounts),
    case need_rebalance(Donors, Recipients, ConnCounts, SessCounts, Opts) of
        true ->
            _ = multicall(Donors, enable_rebalance_agent, [self()]),
            {ok, Data#{
                donors => Donors,
                recipients => Recipients,
                initial_conn_counts => maps:from_list(ConnCounts),
                initial_sess_counts => maps:from_list(SessCounts)
            }};
        false ->
            {error, nothing_to_balance}
    end.
%% Switch the rebalance agent off on every donor node. The per-node results
%% are discarded, but multicall/3 still raises if a node fails.
disable_rebalance(#{donors := DonorNodes}) ->
_ = multicall(DonorNodes, disable_rebalance_agent, [self()]),
ok.
%% Run one connection-eviction round.
%%
%% Fetches current connection counts from donors and recipients. If the donor
%% average is within the `conn' thresholds of the recipient average, returns
%% `ok'. Otherwise every donor above the recipient average is told to evict
%% `conn_evict_rate' connections and `{continue, Data}' (extended with the
%% fresh averages and counts) is returned.
evict_conns(#{donors := DonorNodes, recipients := RecipientNodes, opts := Opts} = Data) ->
    DonorStats = multicall(DonorNodes, connection_counts, []),
    RecipientStats = multicall(RecipientNodes, connection_counts, []),
    {_, DonorCounts} = lists:unzip(DonorStats),
    {_, RecipientCounts} = lists:unzip(RecipientStats),
    DonorAvg = avg(DonorCounts),
    RecipientAvg = avg(RecipientCounts),
    case within_thresholds(DonorAvg, RecipientAvg, thresholds(conn, Opts)) of
        true ->
            ok;
        false ->
            Rate = maps:get(conn_evict_rate, Opts),
            Targets = nodes_to_evict(RecipientAvg, DonorStats),
            ?SLOG(warning, #{
                msg => "node_rebalance_evict_conns",
                nodes => Targets,
                counts => Rate
            }),
            _ = multicall(Targets, evict_connections, [Rate]),
            {continue, Data#{
                donor_conn_avg => DonorAvg,
                recipient_conn_avg => RecipientAvg,
                donor_conn_counts => maps:from_list(DonorStats),
                recipient_conn_counts => maps:from_list(RecipientStats)
            }}
    end.
%% Run one session-eviction round.
%%
%% Works on disconnected-session counts. If the donor average is within the
%% `sess' thresholds of the recipient average, returns `ok'. Otherwise every
%% donor above the recipient average is told to evict `sess_evict_rate'
%% disconnected sessions towards the recipient nodes, and `{continue, Data}'
%% (extended with the fresh averages and counts) is returned.
evict_sessions(#{donors := DonorNodes, recipients := RecipientNodes, opts := Opts} = Data) ->
    DonorStats = multicall(DonorNodes, disconnected_session_counts, []),
    RecipientStats = multicall(RecipientNodes, disconnected_session_counts, []),
    {_, DonorCounts} = lists:unzip(DonorStats),
    {_, RecipientCounts} = lists:unzip(RecipientStats),
    DonorAvg = avg(DonorCounts),
    RecipientAvg = avg(RecipientCounts),
    case within_thresholds(DonorAvg, RecipientAvg, thresholds(sess, Opts)) of
        true ->
            ok;
        false ->
            Rate = maps:get(sess_evict_rate, Opts),
            Targets = nodes_to_evict(RecipientAvg, DonorStats),
            ?SLOG(warning, #{
                msg => "node_rebalance_evict_sessions",
                nodes => Targets,
                counts => Rate
            }),
            _ = multicall(
                Targets,
                evict_sessions,
                [Rate, RecipientNodes, disconnected]
            ),
            {continue, Data#{
                donor_sess_avg => DonorAvg,
                recipient_sess_avg => RecipientAvg,
                donor_sess_counts => maps:from_list(DonorStats),
                recipient_sess_counts => maps:from_list(RecipientStats)
            }}
    end.
%% Tell whether a rebalance is worth starting.
%% With no donors or no recipients the answer is always `false'. Otherwise a
%% rebalance is needed as soon as either the connection averages or the
%% session averages of donors vs. recipients are outside their thresholds.
%% The decision and its inputs are emitted as a trace point.
need_rebalance([], _RecipientNodes, _ConnCounts, _SessCounts, _Opts) ->
    false;
need_rebalance(_DonorNodes, [], _ConnCounts, _SessCounts, _Opts) ->
    false;
need_rebalance(DonorNodes, RecipientNodes, ConnCounts, SessCounts, Opts) ->
    DonorConnAvg = avg_for_nodes(DonorNodes, ConnCounts),
    RecipientConnAvg = avg_for_nodes(RecipientNodes, ConnCounts),
    DonorSessAvg = avg_for_nodes(DonorNodes, SessCounts),
    RecipientSessAvg = avg_for_nodes(RecipientNodes, SessCounts),
    Balanced =
        within_thresholds(DonorConnAvg, RecipientConnAvg, thresholds(conn, Opts)) andalso
            within_thresholds(DonorSessAvg, RecipientSessAvg, thresholds(sess, Opts)),
    Result = not Balanced,
    ?tp(
        debug,
        emqx_node_rebalance_need_rebalance,
        #{
            donors => DonorNodes,
            recipients => RecipientNodes,
            conn_counts => ConnCounts,
            sess_counts => SessCounts,
            opts => Opts,
            result => Result
        }
    ),
    Result.
%% Average of the counts in the `[{Node, Count}]' list `Counts', restricted
%% to the nodes listed in `Nodes'.
avg_for_nodes(Nodes, Counts) ->
    CountsByNode = maps:from_list(Counts),
    Selected = maps:with(Nodes, CountsByNode),
    avg(maps:values(Selected)).
%% `true' when `Value' does not exceed the goal by more than the absolute
%% slack (`GoalValue + AbsThres') or by more than the relative slack
%% (`GoalValue * RelThres'); satisfying either bound is enough.
within_thresholds(Value, GoalValue, {AbsThres, RelThres}) ->
    case Value =< GoalValue + AbsThres of
        true -> true;
        false -> Value =< GoalValue * RelThres
    end.
%% Pick the `{Absolute, Relative}' threshold pair for connections (`conn') or
%% sessions (`sess') out of the rebalance options.
thresholds(conn, #{abs_conn_threshold := AbsConn, rel_conn_threshold := RelConn}) ->
    {AbsConn, RelConn};
thresholds(sess, #{abs_sess_threshold := AbsSess, rel_sess_threshold := RelSess}) ->
    {AbsSess, RelSess}.
%% Names of the nodes in `[{Node, Count}]' whose count is strictly above
%% `Goal', in their original order.
nodes_to_evict(Goal, NodeCounts) ->
    lists:filtermap(
        fun({Node, Count}) ->
            case Count > Goal of
                true -> {true, Node};
                false -> false
            end
        end,
        NodeCounts
    ).
%% Statistics reported by the `status' call: nothing while disabled, the
%% whole coordinator data otherwise.
get_stats(State, Data) ->
    case State of
        disabled -> #{};
        _Active -> Data
    end.
%% Arithmetic mean of a non-empty list of numbers (always a float).
%% An empty list is rejected with `function_clause'; the non-empty check is
%% a pattern match rather than an O(n) `length/1' guard.
avg([_ | _] = List) ->
    lists:sum(List) / length(List).
%% Call `emqx_node_rebalance_proto_v1:F(Nodes, A...)' and return
%% `[{Node, Value}]' with the `ok' / `{ok, Value}' wrappers stripped.
%% Raises `error({bad_nodes, _})' when the proto call reports bad nodes or
%% when any node's reply is not `ok' / `{ok, _}'.
multicall(Nodes, F, A) ->
    case apply(emqx_node_rebalance_proto_v1, F, [Nodes | A]) of
        {_, [_BadNode | _] = BadNodes} ->
            error({bad_nodes, BadNodes});
        {Results, []} ->
            Tagged = lists:zip(Nodes, Results),
            case lists:partition(fun is_ok/1, Tagged) of
                {Succeeded, []} ->
                    [{Node, ok_result(Reply)} || {Node, Reply} <- Succeeded];
                {_, [_ | _] = Failed} ->
                    error({bad_nodes, Failed})
            end
    end.
%% `true' for a `{Node, Reply}' pair whose reply is `ok' or `{ok, _}';
%% `false' for anything else.
is_ok({_Node, ok}) -> true;
is_ok({_Node, {ok, _Value}}) -> true;
is_ok(_Other) -> false.
%% Unwrap a successful reply: `{ok, Value}' yields `Value', a bare `ok'
%% stays `ok'.
ok_result(ok) -> ok;
ok_result({ok, Value}) -> Value.
%% Connection count of this node as reported by the eviction agent, wrapped
%% in `{ok, _}'.
connection_count() ->
{ok, emqx_eviction_agent:connection_count()}.
%% Session count of this node as reported by the eviction agent, wrapped in
%% `{ok, _}'.
session_count() ->
{ok, emqx_eviction_agent:session_count()}.
%% Count of this node's sessions in the `disconnected' category as reported
%% by the eviction agent, wrapped in `{ok, _}'.
disconnected_session_count() ->
{ok, emqx_eviction_agent:session_count(disconnected)}.
%% Default rebalance options; start/1 merges user options over this map.
%% By default all running cluster nodes take part.
default_opts() ->
    #{
        %% participants
        nodes => all_nodes(),
        %% phase timing
        wait_health_check => ?DEFAULT_WAIT_HEALTH_CHECK,
        wait_takeover => ?DEFAULT_WAIT_TAKEOVER,
        evict_interval => ?EVICT_INTERVAL,
        %% connections
        conn_evict_rate => ?DEFAULT_CONN_EVICT_RATE,
        abs_conn_threshold => ?DEFAULT_ABS_CONN_THRESHOLD,
        rel_conn_threshold => ?DEFAULT_REL_CONN_THRESHOLD,
        %% sessions
        sess_evict_rate => ?DEFAULT_SESS_EVICT_RATE,
        abs_sess_threshold => ?DEFAULT_ABS_SESS_THRESHOLD,
        rel_sess_threshold => ?DEFAULT_REL_SESS_THRESHOLD
    }.
%% Drop the per-run statistics and options from the coordinator data when a
%% rebalance ends. Keys not listed here (e.g. `donors', `recipients') are
%% left in place.
deinit(Data) ->
    maps:without(
        [
            recipient_conn_avg,
            recipient_sess_avg,
            donor_conn_avg,
            donor_sess_avg,
            recipient_conn_counts,
            recipient_sess_counts,
            donor_conn_counts,
            donor_sess_counts,
            initial_conn_counts,
            initial_sess_counts,
            opts
        ],
        Data
    ).
%% Availability probe executed on a candidate node. Returns node() when the
%% rebalance agent process is registered and the eviction agent is disabled;
%% otherwise it crashes with a badmatch (there is no `false' return).
is_node_available() ->
true = is_pid(whereis(emqx_node_rebalance_agent)),
disabled = emqx_eviction_agent:status(),
node().
%% Nodes currently reported as running by mria.
all_nodes() ->
mria_mnesia:running_nodes().
%% Convert seconds to a whole number of milliseconds.
seconds(Sec) ->
    Millis = timer:seconds(Sec),
    round(Millis).

View File

@ -0,0 +1,131 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_agent).
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("emqx/include/logger.hrl").
-include_lib("emqx/include/types.hrl").
-include_lib("stdlib/include/qlc.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-export([
start_link/0,
enable/1,
disable/1,
status/0
]).
-export([
init/1,
handle_call/3,
handle_info/2,
handle_cast/2,
code_change/3
]).
-define(ENABLE_KIND, emqx_node_rebalance).
%%--------------------------------------------------------------------
%% APIs
%%--------------------------------------------------------------------
%% Agent status: `disabled', or `{enabled, CoordinatorPid}' while a
%% coordinator has the agent enabled.
-type status() :: {enabled, pid()} | disabled.
%% Start the agent, registered locally under the module name.
-spec start_link() -> startlink_ret().
start_link() ->
gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
%% Enable the local agent on behalf of the coordinator `CoordinatorPid'.
%% Fails with `already_enabled' if a coordinator is already recorded, or with
%% `eviction_agent_busy' if the eviction agent refuses to be enabled.
-spec enable(pid()) -> ok_or_error(already_enabled | eviction_agent_busy).
enable(CoordinatorPid) ->
gen_server:call(?MODULE, {enable, CoordinatorPid}).
%% Disable the local agent. Only the coordinator that enabled it may do so:
%% another pid gets `invalid_coordinator', and a disabled agent answers
%% `already_disabled'.
-spec disable(pid()) -> ok_or_error(already_disabled | invalid_coordinator).
disable(CoordinatorPid) ->
gen_server:call(?MODULE, {disable, CoordinatorPid}).
%% Current status of the local agent.
-spec status() -> status().
status() ->
gen_server:call(?MODULE, status).
%%--------------------------------------------------------------------
%% gen_server callbacks
%%--------------------------------------------------------------------
%% The agent starts disabled: an empty state map means no coordinator.
init([]) ->
{ok, #{}}.
%% The state is a map: empty while disabled, holding `coordinator_pid' and
%% `eviction_agent_pid' while enabled.

%% enable: refused while a coordinator is already recorded.
handle_call({enable, _CoordinatorPid}, _From, #{coordinator_pid := _Current} = St) ->
    {reply, {error, already_enabled}, St};
%% enable: link to the coordinator and to the eviction agent first, then try
%% to enable the eviction agent; the links are undone if it is busy.
handle_call({enable, CoordinatorPid}, _From, St) ->
    true = link(CoordinatorPid),
    EvictionAgentPid = whereis(emqx_eviction_agent),
    true = link(EvictionAgentPid),
    case emqx_eviction_agent:enable(?ENABLE_KIND, undefined) of
        {error, eviction_agent_busy} ->
            true = unlink(EvictionAgentPid),
            true = unlink(CoordinatorPid),
            {reply, {error, eviction_agent_busy}, St};
        ok ->
            Enabled = #{
                coordinator_pid => CoordinatorPid,
                eviction_agent_pid => EvictionAgentPid
            },
            {reply, ok, Enabled}
    end;
%% disable: only honoured for the coordinator that enabled the agent.
handle_call(
    {disable, CoordinatorPid},
    _From,
    #{coordinator_pid := CoordinatorPid, eviction_agent_pid := EvictionAgentPid} = St
) ->
    _ = emqx_eviction_agent:disable(?ENABLE_KIND),
    true = unlink(EvictionAgentPid),
    true = unlink(CoordinatorPid),
    Disabled = maps:without([coordinator_pid, eviction_agent_pid], St),
    {reply, ok, Disabled};
handle_call({disable, _OtherPid}, _From, #{coordinator_pid := _Current} = St) ->
    {reply, {error, invalid_coordinator}, St};
handle_call({disable, _AnyPid}, _From, #{} = St) ->
    {reply, {error, already_disabled}, St};
%% status
handle_call(status, _From, #{coordinator_pid := Pid} = St) ->
    {reply, {enabled, Pid}, St};
handle_call(status, _From, St) ->
    {reply, disabled, St};
%% Anything else is logged and answered with `ignored'.
handle_call(Msg, _From, St) ->
    ?SLOG(warning, #{
        msg => "unknown_call",
        call => Msg,
        state => St
    }),
    {reply, ignored, St}.
%% The agent expects no plain messages; anything received is logged and
%% dropped.
handle_info(Msg, St) ->
?SLOG(warning, #{
msg => "unknown_info",
info => Msg,
state => St
}),
{noreply, St}.
%% The agent has no cast API; any cast is logged and dropped.
handle_cast(Msg, St) ->
?SLOG(warning, #{
msg => "unknown_cast",
cast => Msg,
state => St
}),
{noreply, St}.
%% No state migration is performed on code upgrade.
code_change(_Vsn, State, _Extra) ->
{ok, State}.

View File

@ -0,0 +1,733 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_api).
-behaviour(minirest_api).
-include_lib("typerefl/include/types.hrl").
-include_lib("hocon/include/hoconsc.hrl").
-include_lib("emqx/include/logger.hrl").
-include_lib("emqx_utils/include/emqx_utils_api.hrl").
%% Swagger specs from hocon schema
-export([
api_spec/0,
paths/0,
schema/1,
namespace/0
]).
-export([
fields/1,
roots/0
]).
%% API callbacks
-export([
'/load_rebalance/status'/2,
'/load_rebalance/global_status'/2,
'/load_rebalance/availability_check'/2,
'/load_rebalance/:node/start'/2,
'/load_rebalance/:node/stop'/2,
'/load_rebalance/:node/evacuation/start'/2,
'/load_rebalance/:node/evacuation/stop'/2
]).
%% Schema examples
-export([
rebalance_example/0,
rebalance_evacuation_example/0,
translate/2
]).
-import(hoconsc, [mk/2, ref/1, ref/2]).
-import(emqx_dashboard_swagger, [error_codes/2]).
-define(BAD_REQUEST, 'BAD_REQUEST').
-define(NODE_EVACUATING, 'NODE_EVACUATING').
-define(RPC_ERROR, 'RPC_ERROR').
-define(NOT_FOUND, 'NOT_FOUND').
%%--------------------------------------------------------------------
%% API Spec
%%--------------------------------------------------------------------
%% Schema namespace of this API module.
namespace() -> "load_rebalance".
%% Swagger spec generated from this module's schema, with request schema
%% checking enabled.
api_spec() ->
emqx_dashboard_swagger:spec(?MODULE, #{check_schema => true}).
%% All endpoints served by this module; each path has a matching schema/1
%% clause and a handler function of the same name.
paths() ->
[
"/load_rebalance/status",
"/load_rebalance/global_status",
"/load_rebalance/availability_check",
"/load_rebalance/:node/start",
"/load_rebalance/:node/stop",
"/load_rebalance/:node/evacuation/start",
"/load_rebalance/:node/evacuation/stop"
].
%% Per-endpoint API schema. `operationId' names the handler function exported
%% from this module. The three read-only endpoints are GETs; the four
%% `:node' endpoints are POSTs that take the target node as a path parameter.

%% Status of the rebalance/evacuation process on the local node.
schema("/load_rebalance/status") ->
#{
'operationId' => '/load_rebalance/status',
get => #{
tags => [<<"load_rebalance">>],
summary => <<"Get rebalance status">>,
description => ?DESC("load_rebalance_status"),
responses => #{
200 => local_status_response_schema()
}
}
};
%% Status of all rebalances and evacuations.
schema("/load_rebalance/global_status") ->
#{
'operationId' => '/load_rebalance/global_status',
get => #{
tags => [<<"load_rebalance">>],
summary => <<"Get global rebalance status">>,
description => ?DESC("load_rebalance_global_status"),
responses => #{
200 => response_schema()
}
}
};
%% Availability probe: 200 normally, 503 while the eviction agent is enabled.
schema("/load_rebalance/availability_check") ->
#{
'operationId' => '/load_rebalance/availability_check',
get => #{
tags => [<<"load_rebalance">>],
summary => <<"Node rebalance availability check">>,
description => ?DESC("load_rebalance_availability_check"),
responses => #{
200 => response_schema(),
503 => error_codes([?NODE_EVACUATING], <<"Node Evacuating">>)
}
}
};
%% Start a rebalance with `:node' as coordinator; body is `rebalance_start'.
schema("/load_rebalance/:node/start") ->
#{
'operationId' => '/load_rebalance/:node/start',
post => #{
tags => [<<"load_rebalance">>],
summary => <<"Start rebalancing with the node as coordinator">>,
description => ?DESC("load_rebalance_start"),
parameters => [param_node()],
'requestBody' =>
emqx_dashboard_swagger:schema_with_examples(
ref(rebalance_start),
rebalance_example()
),
responses => #{
200 => response_schema(),
400 => error_codes([?BAD_REQUEST], <<"Bad Request">>),
404 => error_codes([?NOT_FOUND], <<"Not Found">>)
}
}
};
%% Stop the rebalance coordinated by `:node'.
schema("/load_rebalance/:node/stop") ->
#{
'operationId' => '/load_rebalance/:node/stop',
post => #{
tags => [<<"load_rebalance">>],
summary => <<"Stop rebalancing coordinated by the node">>,
description => ?DESC("load_rebalance_stop"),
parameters => [param_node()],
responses => #{
200 => response_schema(),
400 => error_codes([?BAD_REQUEST], <<"Bad Request">>),
404 => error_codes([?NOT_FOUND], <<"Not Found">>)
}
}
};
%% Start evacuating `:node'; body is `rebalance_evacuation_start'.
schema("/load_rebalance/:node/evacuation/start") ->
#{
'operationId' => '/load_rebalance/:node/evacuation/start',
post => #{
tags => [<<"load_rebalance">>],
summary => <<"Start evacuation on a node">>,
description => ?DESC("load_rebalance_evacuation_start"),
parameters => [param_node()],
'requestBody' =>
emqx_dashboard_swagger:schema_with_examples(
ref(rebalance_evacuation_start),
rebalance_evacuation_example()
),
responses => #{
200 => response_schema(),
400 => error_codes([?BAD_REQUEST], <<"Bad Request">>),
404 => error_codes([?NOT_FOUND], <<"Not Found">>)
}
}
};
%% Stop the evacuation of `:node'.
schema("/load_rebalance/:node/evacuation/stop") ->
#{
'operationId' => '/load_rebalance/:node/evacuation/stop',
post => #{
tags => [<<"load_rebalance">>],
summary => <<"Stop evacuation on a node">>,
description => ?DESC("load_rebalance_evacuation_stop"),
parameters => [param_node()],
responses => #{
200 => response_schema(),
400 => error_codes([?BAD_REQUEST], <<"Bad Request">>),
404 => error_codes([?NOT_FOUND], <<"Not Found">>)
}
}
}.
%%--------------------------------------------------------------------
%% Handlers
%%--------------------------------------------------------------------
%% GET /load_rebalance/status
%% Reports `disabled', or the stats of the running rebalance / evacuation
%% tagged with the process kind and `status => enabled'.
'/load_rebalance/status'(get, #{}) ->
    case emqx_node_rebalance_status:local_status() of
        disabled ->
            {200, #{status => disabled}};
        {Process, Stats} when Process =:= rebalance; Process =:= evacuation ->
            {200, format_status(Process, Stats)}
    end.
%% GET /load_rebalance/global_status
%% Returns the global evacuations and rebalances, each entry flattened into a
%% map that carries its `node'.
'/load_rebalance/global_status'(get, #{}) ->
    GlobalStatus = emqx_node_rebalance_status:global_status(),
    #{evacuations := Evacuations, rebalances := Rebalances} = GlobalStatus,
    Body = #{
        evacuations => format_as_map_list(Evacuations),
        rebalances => format_as_map_list(Rebalances)
    },
    {200, Body}.
%% GET /load_rebalance/availability_check
%% Liveness-style probe: 503 NODE_EVACUATING while the eviction agent is
%% enabled, an empty 200 otherwise.
'/load_rebalance/availability_check'(get, #{}) ->
    case emqx_eviction_agent:status() of
        {enabled, _Stats} ->
            error_response(503, ?NODE_EVACUATING, <<"Node Evacuating">>);
        disabled ->
            {200, #{}}
    end.
%% POST /load_rebalance/:node/start
%% Checks the body against the `rebalance_start' schema, validates the
%% optional `nodes' list and starts the rebalance on the target node via rpc.
'/load_rebalance/:node/start'(post, #{bindings := #{node := NodeBin}, body := Params0}) ->
    StartOn = fun(Node) ->
        Translated = translate(rebalance_start, Params0),
        Start = fun(Validated) ->
            wrap_rpc(
                Node, emqx_node_rebalance_api_proto_v1:node_rebalance_start(Node, Validated)
            )
        end,
        with_nodes_at_key(nodes, Translated, Start)
    end,
    emqx_utils_api:with_node(NodeBin, StartOn).
%% POST /load_rebalance/:node/stop
%% Stops the rebalance coordinated by the target node via rpc.
'/load_rebalance/:node/stop'(post, #{bindings := #{node := NodeBin}}) ->
    StopOn = fun(Node) ->
        Result = emqx_node_rebalance_api_proto_v1:node_rebalance_stop(Node),
        wrap_rpc(Node, Result)
    end,
    emqx_utils_api:with_node(NodeBin, StopOn).
%% POST /load_rebalance/:node/evacuation/start
%% Checks the body against the `rebalance_evacuation_start' schema, validates
%% the optional `migrate_to' node list and starts the evacuation on the
%% target node via rpc.
'/load_rebalance/:node/evacuation/start'(post, #{
    bindings := #{node := NodeBin}, body := Params0
}) ->
    StartOn = fun(Node) ->
        Translated = translate(rebalance_evacuation_start, Params0),
        Start = fun(Validated) ->
            wrap_rpc(
                Node,
                emqx_node_rebalance_api_proto_v1:node_rebalance_evacuation_start(
                    Node, Validated
                )
            )
        end,
        with_nodes_at_key(migrate_to, Translated, Start)
    end,
    emqx_utils_api:with_node(NodeBin, StartOn).
%% POST /load_rebalance/:node/evacuation/stop
%% Stops the evacuation of the target node via rpc.
'/load_rebalance/:node/evacuation/stop'(post, #{bindings := #{node := NodeBin}}) ->
    StopOn = fun(Node) ->
        Result = emqx_node_rebalance_api_proto_v1:node_rebalance_evacuation_stop(Node),
        wrap_rpc(Node, Result)
    end,
    emqx_utils_api:with_node(NodeBin, StopOn).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% Translate the result of an rpc to `Node' into an HTTP response:
%%   ok               -> 200 with an empty body
%%   {error, Reason}  -> 400 BAD_REQUEST
%%   {badrpc, Reason} -> 503 RPC_ERROR
wrap_rpc(_Node, ok) ->
    {200, #{}};
wrap_rpc(Node, {error, Reason}) ->
    Message = io_lib:format("error on node ~p: ~p", [Node, Reason]),
    error_response(400, ?BAD_REQUEST, Message);
wrap_rpc(Node, {badrpc, Reason}) ->
    Message = io_lib:format("RPC error on node ~p: ~p", [Node, Reason]),
    error_response(503, ?RPC_ERROR, Message).
%% Tag the stats map with the kind of running process and mark it enabled.
format_status(Process, Stats) ->
    maps:merge(Stats, #{process => Process, status => enabled}).
%% Validate the list of binary node names stored under `Key' in `Params'.
%%  * `{ok, Params}' with the names replaced by node atoms when every name is
%%    an existing atom and every node passes the availability check;
%%  * `{error, {invalid, Names}}' for names that are not existing atoms;
%%  * `{error, {unavailable, Nodes}}' for nodes that failed the check.
%% When `Key' is absent, `Params' is returned untouched.
validate_nodes(Key, Params) when is_map_key(Key, Params) ->
    Classify = fun(BinNode, {Known, Unknown}) ->
        case parse_node(BinNode) of
            {ok, Node} -> {[Node | Known], Unknown};
            {error, _} -> {Known, [BinNode | Unknown]}
        end
    end,
    case lists:foldl(Classify, {[], []}, maps:get(Key, Params)) of
        {ValidNodes, []} ->
            case emqx_node_rebalance_evacuation:available_nodes(ValidNodes) of
                ValidNodes -> {ok, Params#{Key => ValidNodes}};
                OtherNodes -> {error, {unavailable, ValidNodes -- OtherNodes}}
            end;
        {_ValidNodes, InvalidNodes} ->
            {error, {invalid, InvalidNodes}}
    end;
validate_nodes(_Key, Params) ->
    {ok, Params}.
%% Run `Fun' on `Params' once the node list under `Key' has been validated;
%% a validation failure is turned into an HTTP error response instead.
with_nodes_at_key(Key, Params, Fun) ->
    case validate_nodes(Key, Params) of
        {error, {invalid, Nodes}} ->
            error_response(400, ?BAD_REQUEST, io_lib:format("Invalid nodes: ~p", [Nodes]));
        {error, {unavailable, Nodes}} ->
            %% NOTE(review): the response pairs HTTP status 400 with the
            %% NOT_FOUND error code, while the schema declares NOT_FOUND
            %% under 404 -- confirm which status clients should see.
            error_response(400, ?NOT_FOUND, io_lib:format("Nodes unavailable: ~p", [Nodes]));
        {ok, Validated} ->
            Fun(Validated)
    end.
%% Convert a binary node name into an atom without ever creating a new atom:
%% names that are not already known atoms yield `{error, {unknown, Bin}}'.
parse_node(Bin) when is_binary(Bin) ->
    try binary_to_existing_atom(Bin) of
        Node -> {ok, Node}
    catch
        error:badarg -> {error, {unknown, Bin}}
    end.
%% Turn `[{Node, InfoMap}]' into a list of maps, each carrying its node under
%% the `node' key.
format_as_map_list(List) ->
    WithNode = fun({Node, Info}) -> maps:put(node, Node, Info) end,
    lists:map(WithNode, List).
%% Build an `{HttpStatus, Body}' error reply; the body is produced by the
%% ?ERROR_MSG macro from the error code and the message.
error_response(HttpCode, Code, Message) ->
{HttpCode, ?ERROR_MSG(Code, Message)}.
%% Remove every `{Key, _}' pair whose key is listed in `Keys' from the
%% property list `Props', keeping the order of the remaining pairs.
without(Keys, Props) ->
    IsKept = fun({Key, _Value}) -> lists:member(Key, Keys) =:= false end,
    lists:filter(IsKept, Props).
%%------------------------------------------------------------------------------
%% Schema
%%------------------------------------------------------------------------------
%% Check the raw request body `Conf' against the schema struct `Ref' of this
%% module and return the checked value with atom keys.
translate(Ref, Conf) ->
    RawConf = #{atom_to_binary(Ref) => Conf},
    Checked = hocon_tconf:check_plain(?MODULE, RawConf, #{atom_key => true}, [Ref]),
    #{Ref := TranslatedConf} = Checked,
    TranslatedConf.
%% Schema of the required `:node' path parameter shared by the start/stop
%% endpoints; the value arrives as a binary.
param_node() ->
{
node,
mk(binary(), #{
in => path,
desc => ?DESC(param_node),
required => true
})
}.
%% Schema field definitions, one clause per schema name. Every clause returns
%% a list of `{FieldName, mk(Type, Meta)}' entries.

%% `rebalance_start': every field is optional. The two `rel_*_threshold'
%% fields must be greater than 1.0 and `nodes', when given, must be non-empty.
fields(rebalance_start) ->
[
{"wait_health_check",
mk(
emqx_schema:duration_s(),
#{
desc => ?DESC(wait_health_check),
required => false
}
)},
{"conn_evict_rate",
mk(
pos_integer(),
#{
desc => ?DESC(conn_evict_rate),
required => false
}
)},
{"sess_evict_rate",
mk(
pos_integer(),
#{
desc => ?DESC(sess_evict_rate),
required => false
}
)},
{"abs_conn_threshold",
mk(
pos_integer(),
#{
desc => ?DESC(abs_conn_threshold),
required => false
}
)},
{"rel_conn_threshold",
mk(
number(),
#{
desc => ?DESC(rel_conn_threshold),
required => false,
validator => [fun(Value) -> Value > 1.0 end]
}
)},
{"abs_sess_threshold",
mk(
pos_integer(),
#{
desc => ?DESC(abs_sess_threshold),
required => false
}
)},
{"rel_sess_threshold",
mk(
number(),
#{
desc => ?DESC(rel_sess_threshold),
required => false,
validator => [fun(Value) -> Value > 1.0 end]
}
)},
{"wait_takeover",
mk(
emqx_schema:duration_s(),
#{
desc => ?DESC(wait_takeover),
required => false
}
)},
{"nodes",
mk(
list(binary()),
#{
desc => ?DESC(rebalance_nodes),
required => false,
validator => [fun(Values) -> length(Values) > 0 end]
}
)}
];
%% `rebalance_evacuation_start': every field is optional.
%% NOTE(review): `wait_takeover' is a pos_integer() here but a duration_s()
%% in `rebalance_start' -- confirm the difference is intended.
fields(rebalance_evacuation_start) ->
[
{"conn_evict_rate",
mk(
pos_integer(),
#{
desc => ?DESC(conn_evict_rate),
required => false
}
)},
{"sess_evict_rate",
mk(
pos_integer(),
#{
desc => ?DESC(sess_evict_rate),
required => false
}
)},
{"redirect_to",
mk(
binary(),
#{
desc => ?DESC(redirect_to),
required => false
}
)},
{"wait_takeover",
mk(
pos_integer(),
#{
desc => ?DESC(wait_takeover),
required => false
}
)},
{"migrate_to",
mk(
nonempty_list(binary()),
#{
desc => ?DESC(migrate_to),
required => false
}
)}
];
%% `local_status_disabled': a single required `status' field of type `disabled'.
%% NOTE(review): the description key is `local_status_enabled' -- confirm that
%% sharing the description with the enabled variant is intended.
fields(local_status_disabled) ->
[
{"status",
mk(
disabled,
#{
desc => ?DESC(local_status_enabled),
required => true
}
)}
];
%% `local_status_enabled': `status', `process' and `state' are required,
%% all remaining fields are optional.
fields(local_status_enabled) ->
[
{"status",
mk(
enabled,
#{
desc => ?DESC(local_status_enabled),
required => true
}
)},
{"process",
mk(
hoconsc:union([rebalance, evacuation]),
#{
desc => ?DESC(local_status_process),
required => true
}
)},
{"state",
mk(
atom(),
#{
desc => ?DESC(local_status_state),
required => true
}
)},
{"coordinator_node",
mk(
binary(),
#{
desc => ?DESC(local_status_coordinator_node),
required => false
}
)},
{"connection_eviction_rate",
mk(
pos_integer(),
#{
desc => ?DESC(local_status_connection_eviction_rate),
required => false
}
)},
{"session_eviction_rate",
mk(
pos_integer(),
#{
desc => ?DESC(local_status_session_eviction_rate),
required => false
}
)},
{"connection_goal",
mk(
non_neg_integer(),
#{
desc => ?DESC(local_status_connection_goal),
required => false
}
)},
{"session_goal",
mk(
non_neg_integer(),
#{
desc => ?DESC(local_status_session_goal),
required => false
}
)},
{"disconnected_session_goal",
mk(
non_neg_integer(),
#{
desc => ?DESC(local_status_disconnected_session_goal),
required => false
}
)},
{"session_recipients",
mk(
list(binary()),
#{
desc => ?DESC(local_status_session_recipients),
required => false
}
)},
{"recipients",
mk(
list(binary()),
#{
desc => ?DESC(local_status_recipients),
required => false
}
)},
{"stats",
mk(
ref(status_stats),
#{
desc => ?DESC(local_status_stats),
required => false
}
)}
];
%% `status_stats': counters referenced by the `stats' field above; all are
%% required except `current_disconnected_sessions'.
fields(status_stats) ->
[
{"initial_connected",
mk(
non_neg_integer(),
#{
desc => ?DESC(status_stats_initial_connected),
required => true
}
)},
{"current_connected",
mk(
non_neg_integer(),
#{
desc => ?DESC(status_stats_current_connected),
required => true
}
)},
{"initial_sessions",
mk(
non_neg_integer(),
#{
desc => ?DESC(status_stats_initial_sessions),
required => true
}
)},
{"current_sessions",
mk(
non_neg_integer(),
#{
desc => ?DESC(status_stats_current_sessions),
required => true
}
)},
{"current_disconnected_sessions",
mk(
non_neg_integer(),
#{
desc => ?DESC(status_stats_current_disconnected_sessions),
required => false
}
)}
];
%% `global_coordinator_status': the `local_status_enabled' fields minus
%% status/process/session_goal/session_recipients/stats, plus donor
%% information and a required `node'.
fields(global_coordinator_status) ->
without(
["status", "process", "session_goal", "session_recipients", "stats"],
fields(local_status_enabled)
) ++
[
{"donors",
mk(
list(binary()),
#{
desc => ?DESC(coordinator_status_donors),
required => false
}
)},
{"donor_conn_avg",
mk(
non_neg_integer(),
#{
desc => ?DESC(coordinator_status_donor_conn_avg),
required => false
}
)},
{"donor_sess_avg",
mk(
non_neg_integer(),
#{
desc => ?DESC(coordinator_status_donor_sess_avg),
required => false
}
)},
{"node",
mk(
binary(),
#{
desc => ?DESC(coordinator_status_node),
required => true
}
)}
];
%% `global_evacuation_status': the `local_status_enabled' fields minus
%% status/process, plus a required `node'.
fields(global_evacuation_status) ->
without(["status", "process"], fields(local_status_enabled)) ++
[
{"node",
mk(
binary(),
#{
desc => ?DESC(evacuation_status_node),
required => true
}
)}
];
%% `global_status': optional arrays of the two per-node status schemas above.
fields(global_status) ->
[
{"evacuations",
mk(
hoconsc:array(ref(global_evacuation_status)),
#{
desc => ?DESC(global_status_evacuations),
required => false
}
)},
{"rebalances",
mk(
hoconsc:array(ref(global_coordinator_status)),
#{
desc => ?DESC(global_status_rebalances),
required => false
}
)}
].
%% Example value using every field of the `rebalance_start' schema.
rebalance_example() ->
    Timing = #{wait_health_check => 10, wait_takeover => 10},
    Rates = #{conn_evict_rate => 10, sess_evict_rate => 20},
    Thresholds = #{
        abs_conn_threshold => 10,
        rel_conn_threshold => 1.5,
        abs_sess_threshold => 10,
        rel_sess_threshold => 1.5
    },
    Targets = #{nodes => [<<"othernode@127.0.0.1">>]},
    lists:foldl(fun maps:merge/2, #{}, [Timing, Rates, Thresholds, Targets]).
%% Example value using every field of the `rebalance_evacuation_start' schema.
rebalance_evacuation_example() ->
    maps:from_list([
        {migrate_to, [<<"othernode@127.0.0.1">>]},
        {redirect_to, <<"othernode:1883">>},
        {wait_takeover, 10},
        {conn_evict_rate, 100},
        {sess_evict_rate, 100}
    ]).
%% Union of the `local_status_disabled' and `local_status_enabled' schemas.
local_status_response_schema() ->
    Variants = [ref(local_status_disabled), ref(local_status_enabled)],
    hoconsc:union(Variants).
%% Schema of type `map()' described by the `empty_response' description key.
response_schema() ->
    Type = map(),
    Meta = #{desc => ?DESC(empty_response)},
    mk(Type, Meta).
%% This schema module declares no roots.
roots() ->
    [].

View File

@ -0,0 +1,22 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_app).
-behaviour(application).
-emqx_plugin(?MODULE).
-export([
start/2,
stop/1
]).
%% Application start callback: starts the top-level supervisor and then
%% registers the CLI command. Either step failing crashes the start.
start(_StartType, _StartArgs) ->
    {ok, SupPid} = emqx_node_rebalance_sup:start_link(),
    ok = emqx_node_rebalance_cli:load(),
    {ok, SupPid}.
%% Application stop callback: unregisters the CLI command and returns
%% whatever the unload call returns.
stop(_AppState) ->
    UnloadResult = emqx_node_rebalance_cli:unload(),
    UnloadResult.

View File

@ -0,0 +1,305 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_cli).
%% APIs
-export([
load/0,
unload/0,
cli/1
]).
%% Registers the `rebalance' command with emqx_ctl, handled by cli/1.
load() ->
    Handler = {?MODULE, cli},
    emqx_ctl:register_command(rebalance, Handler, []).
%% Removes the `rebalance' command from emqx_ctl.
unload() ->
    Command = rebalance,
    emqx_ctl:unregister_command(Command).
%% Entry point of the `rebalance' CLI command.
%%
%% Sub-commands:
%%   start [...]          start an evacuation (with `--evacuation') or a rebalance
%%   node-status [Node]   print the status of the local or the given node
%%   status               print all evacuations/rebalances known cluster-wide
%%   stop                 stop the local evacuation or rebalance
%% Any other argument list prints the usage.
cli(["start" | StartArgs]) ->
    case start_args(StartArgs) of
        {evacuation, Opts} ->
            case emqx_node_rebalance_evacuation:status() of
                disabled ->
                    %% start/1 may still refuse (for instance with
                    %% `eviction_agent_busy'), so report the error instead of
                    %% crashing the command on a failed `ok =' match.
                    case emqx_node_rebalance_evacuation:start(Opts) of
                        ok ->
                            emqx_ctl:print("Rebalance(evacuation) started~n"),
                            true;
                        {error, Reason} ->
                            emqx_ctl:print(
                                "Rebalance(evacuation) start error: ~p~n", [Reason]
                            ),
                            false
                    end;
                {enabled, _} ->
                    emqx_ctl:print("Rebalance is already enabled~n"),
                    false
            end;
        {rebalance, Opts} ->
            case emqx_node_rebalance:start(Opts) of
                ok ->
                    emqx_ctl:print("Rebalance started~n"),
                    true;
                {error, Reason} ->
                    emqx_ctl:print("Rebalance start error: ~p~n", [Reason]),
                    false
            end;
        {error, Error} ->
            %% Argument errors are strings/iolists, hence ~s.
            emqx_ctl:print("Rebalance start error: ~s~n", [Error]),
            false
    end;
cli(["node-status", NodeStr]) ->
    %% Only existing atoms are accepted as node names.
    case emqx_utils:safe_to_existing_atom(NodeStr, utf8) of
        {ok, Node} ->
            node_status(emqx_node_rebalance_status:local_status(Node));
        {error, _} ->
            emqx_ctl:print("Node status error: invalid node~n"),
            false
    end;
cli(["node-status"]) ->
    node_status(emqx_node_rebalance_status:local_status());
cli(["status"]) ->
    #{
        evacuations := Evacuations,
        rebalances := Rebalances
    } = emqx_node_rebalance_status:global_status(),
    lists:foreach(
        fun({Node, Status}) ->
            emqx_ctl:print(
                "--------------------------------------------------------------------~n"
            ),
            emqx_ctl:print(
                "Node ~p: evacuation~n~s",
                [Node, emqx_node_rebalance_status:format_local_status(Status)]
            )
        end,
        Evacuations
    ),
    lists:foreach(
        fun({Node, Status}) ->
            emqx_ctl:print(
                "--------------------------------------------------------------------~n"
            ),
            emqx_ctl:print(
                "Node ~p: rebalance coordinator~n~s",
                [Node, emqx_node_rebalance_status:format_coordinator_status(Status)]
            )
        end,
        Rebalances
    );
cli(["stop"]) ->
    %% An evacuation takes precedence; otherwise try to stop a rebalance.
    case emqx_node_rebalance_evacuation:status() of
        {enabled, _} ->
            ok = emqx_node_rebalance_evacuation:stop(),
            emqx_ctl:print("Rebalance(evacuation) stopped~n"),
            true;
        disabled ->
            case emqx_node_rebalance:status() of
                {enabled, _} ->
                    ok = emqx_node_rebalance:stop(),
                    emqx_ctl:print("Rebalance stopped~n"),
                    true;
                disabled ->
                    emqx_ctl:print("Rebalance is already disabled~n"),
                    false
            end
    end;
cli(_) ->
    emqx_ctl:usage(
        [
            {
                "rebalance start --evacuation \\\n"
                " [--redirect-to \"Host1:Port1 Host2:Port2 ...\"] \\\n"
                " [--conn-evict-rate CountPerSec] \\\n"
                " [--migrate-to \"node1@host1 node2@host2 ...\"] \\\n"
                " [--wait-takeover Secs] \\\n"
                " [--sess-evict-rate CountPerSec]",
                "Start current node evacuation with optional server redirect to the specified servers"
            },
            {
                %% `--conn-evict-rate' is listed once (it was duplicated).
                "rebalance start \\\n"
                " [--nodes \"node1@host1 node2@host2\"] \\\n"
                " [--wait-health-check Secs] \\\n"
                " [--conn-evict-rate ConnPerSec] \\\n"
                " [--abs-conn-threshold Count] \\\n"
                " [--rel-conn-threshold Fraction] \\\n"
                " [--wait-takeover Secs] \\\n"
                " [--sess-evict-rate CountPerSec] \\\n"
                " [--abs-sess-threshold Count] \\\n"
                " [--rel-sess-threshold Fraction]",
                "Start rebalance on the specified nodes using the current node as the coordinator"
            },
            {"rebalance node-status", "Get current node rebalance status"},
            {"rebalance node-status \"node1@host1\"", "Get remote node rebalance status"},
            {"rebalance status",
                "Get statuses of all current rebalance/evacuation processes across the cluster"},
            {"rebalance stop", "Stop node rebalance"}
        ]
    ).
%% Prints a node's local status: the formatted evacuation/rebalance status,
%% a "disabled" notice, or an error line for any other term.
node_status({Process, Status}) when Process =:= evacuation orelse Process =:= rebalance ->
    Formatted = emqx_node_rebalance_status:format_local_status(Status),
    emqx_ctl:print("Rebalance type: ~p~n~s~n", [Process, Formatted]);
node_status(disabled) ->
    emqx_ctl:print("Rebalance disabled~n");
node_status(Other) ->
    emqx_ctl:print("Error detecting rebalance status: ~p~n", [Other]).
%% Parses the arguments of `rebalance start'. Returns `{evacuation, Opts}'
%% when `--evacuation' was given, `{rebalance, Opts}' otherwise, or
%% `{error, Message}' when collecting or validating the arguments fails.
start_args(Args) ->
    case collect_args(Args, #{}) of
        {ok, #{"--evacuation" := true} = Collected} ->
            tag_validated(evacuation, validate_evacuation(maps:to_list(Collected), #{}));
        {ok, #{} = Collected} ->
            tag_validated(rebalance, validate_rebalance(maps:to_list(Collected), #{}));
        {error, _} = Error ->
            Error
    end.

%% Tags successfully validated options with the kind of process to start;
%% errors are passed through unchanged.
tag_validated(Kind, {ok, Validated}) ->
    {Kind, Validated};
tag_validated(_Kind, {error, _} = Error) ->
    Error.
%% Collects raw command-line arguments into a map keyed by option name.
%% `--evacuation' is a flag stored as `true'; every option listed in
%% valued_options/0 consumes the following argument as its (string) value.
%% Anything else yields `{error, Message}' naming the unparsed remainder.
collect_args([], Collected) ->
    {ok, Collected};
collect_args(["--evacuation" | Remaining], Collected) ->
    collect_args(Remaining, Collected#{"--evacuation" => true});
collect_args([Option, Value | Remaining] = Args, Collected) ->
    case lists:member(Option, valued_options()) of
        true -> collect_args(Remaining, Collected#{Option => Value});
        false -> unknown_arguments(Args)
    end;
collect_args(Args, _Collected) ->
    unknown_arguments(Args).

%% Options that take exactly one value.
valued_options() ->
    [
        %% evacuation
        "--redirect-to",
        "--migrate-to",
        %% rebalance
        "--nodes",
        "--wait-health-check",
        "--abs-conn-threshold",
        "--rel-conn-threshold",
        "--abs-sess-threshold",
        "--rel-sess-threshold",
        %% common
        "--conn-evict-rate",
        "--wait-takeover",
        "--sess-evict-rate"
    ].

unknown_arguments(Args) ->
    {error, io_lib:format("unknown arguments: ~p", [Args])}.
%% Validates collected `{Option, Value}' pairs for an evacuation start and
%% accumulates the resulting start options. Returns `{ok, Opts}' or
%% `{error, Message}' on the first invalid or unknown option.
validate_evacuation([], Acc) ->
    {ok, Acc};
validate_evacuation([{"--evacuation", _Flag} | Tail], Acc) ->
    validate_evacuation(Tail, Acc);
validate_evacuation([{"--redirect-to", Reference} | Tail], Acc) ->
    validate_evacuation(Tail, Acc#{server_reference => list_to_binary(Reference)});
validate_evacuation([{"--conn-evict-rate", _} | _] = Pairs, Acc) ->
    validate_pos_int(conn_evict_rate, Pairs, Acc, fun validate_evacuation/2);
validate_evacuation([{"--sess-evict-rate", _} | _] = Pairs, Acc) ->
    validate_pos_int(sess_evict_rate, Pairs, Acc, fun validate_evacuation/2);
validate_evacuation([{"--wait-takeover", _} | _] = Pairs, Acc) ->
    validate_pos_int(wait_takeover, Pairs, Acc, fun validate_evacuation/2);
validate_evacuation([{"--migrate-to", NodeNames} | Tail], Acc) ->
    %% Node names may be separated by commas and/or spaces.
    {Nodes, Invalid} = strings_to_atoms(string:tokens(NodeNames, ", ")),
    case Invalid of
        [] ->
            evacuation_migrate_targets(Nodes, Tail, Acc);
        _ ->
            {error, io_lib:format("invalid --migrate-to, invalid nodes: ~p", [Invalid])}
    end;
validate_evacuation(Unknown, _Acc) ->
    {error, io_lib:format("unknown evacuation arguments: ~p", [Unknown])}.

%% Accepts the `--migrate-to' nodes only when every one of them is reported
%% as available; otherwise names the unavailable ones.
evacuation_migrate_targets(Nodes, Tail, Acc) ->
    case emqx_node_rebalance_evacuation:available_nodes(Nodes) of
        [] ->
            {error, "invalid --migrate-to, no nodes"};
        Nodes ->
            validate_evacuation(Tail, Acc#{migrate_to => Nodes});
        Available ->
            {error,
                io_lib:format(
                    "invalid --migrate-to, unavailable nodes: ~p",
                    [Nodes -- Available]
                )}
    end.
%% Validates collected `{Option, Value}' pairs for a rebalance start and
%% accumulates the resulting start options. Returns `{ok, Opts}' or
%% `{error, Message}' on the first invalid or unknown option.
validate_rebalance([], Acc) ->
    {ok, Acc};
validate_rebalance([{"--wait-health-check", _} | _] = Pairs, Acc) ->
    validate_pos_int(wait_health_check, Pairs, Acc, fun validate_rebalance/2);
validate_rebalance([{"--conn-evict-rate", _} | _] = Pairs, Acc) ->
    validate_pos_int(conn_evict_rate, Pairs, Acc, fun validate_rebalance/2);
validate_rebalance([{"--sess-evict-rate", _} | _] = Pairs, Acc) ->
    validate_pos_int(sess_evict_rate, Pairs, Acc, fun validate_rebalance/2);
validate_rebalance([{"--abs-conn-threshold", _} | _] = Pairs, Acc) ->
    validate_pos_int(abs_conn_threshold, Pairs, Acc, fun validate_rebalance/2);
validate_rebalance([{"--rel-conn-threshold", _} | _] = Pairs, Acc) ->
    validate_fraction(rel_conn_threshold, Pairs, Acc, fun validate_rebalance/2);
validate_rebalance([{"--abs-sess-threshold", _} | _] = Pairs, Acc) ->
    validate_pos_int(abs_sess_threshold, Pairs, Acc, fun validate_rebalance/2);
validate_rebalance([{"--rel-sess-threshold", _} | _] = Pairs, Acc) ->
    validate_fraction(rel_sess_threshold, Pairs, Acc, fun validate_rebalance/2);
validate_rebalance([{"--wait-takeover", _} | _] = Pairs, Acc) ->
    validate_pos_int(wait_takeover, Pairs, Acc, fun validate_rebalance/2);
validate_rebalance([{"--nodes", NodeNames} | Tail], Acc) ->
    %% Node names may be separated by commas and/or spaces.
    {Nodes, Invalid} = strings_to_atoms(string:tokens(NodeNames, ", ")),
    case Invalid of
        [] ->
            rebalance_participants(Nodes, Tail, Acc);
        _ ->
            {error, io_lib:format("invalid --nodes, invalid nodes: ~p", [Invalid])}
    end;
validate_rebalance(Unknown, _Acc) ->
    {error, io_lib:format("unknown rebalance arguments: ~p", [Unknown])}.

%% Accepts the `--nodes' list only when every node is reported as available;
%% otherwise names the unavailable ones.
rebalance_participants(Nodes, Tail, Acc) ->
    case emqx_node_rebalance:available_nodes(Nodes) of
        [] ->
            {error, "invalid --nodes, no nodes"};
        Nodes ->
            validate_rebalance(Tail, Acc#{nodes => Nodes});
        Available ->
            {error,
                io_lib:format(
                    "invalid --nodes, unavailable nodes: ~p",
                    [Nodes -- Available]
                )}
    end.
%% Validates a relative-threshold option: the value must be a number strictly
%% greater than 1.0. On success the value is stored (as a float) under `Name'
%% and validation continues with `Next'; otherwise `{error, Message}'.
%%
%% Both float ("1.5") and integer ("2") notations are accepted:
%% string:to_float/1 alone rejects integer-looking input.
validate_fraction(Name, [{OptionName, Value} | Rest], Map, Next) ->
    case parse_fraction(Value) of
        {ok, Num} when Num > 1.0 ->
            Next(Rest, Map#{Name => Num});
        _ ->
            {error, "invalid " ++ OptionName ++ " value"}
    end.

%% Parses a whole string as a float, falling back to an integer converted to
%% a float. Trailing garbage makes the parse fail.
parse_fraction(Str) ->
    case string:to_float(Str) of
        {Float, ""} ->
            {ok, Float};
        _ ->
            case string:to_integer(Str) of
                {Int, ""} -> {ok, float(Int)};
                _ -> error
            end
    end.
%% Validates an option whose value must be a strictly positive integer with
%% no trailing characters. On success the integer is stored under `Name' and
%% validation continues with `Next'; otherwise `{error, Message}'.
validate_pos_int(Name, [{OptionName, Value} | Rest], Map, Next) ->
    Parsed = string:to_integer(Value),
    case Parsed of
        {Int, []} when Int > 0 ->
            Next(Rest, maps:put(Name, Int, Map));
        _ ->
            {error, lists:append(["invalid ", OptionName, " value"])}
    end.
%% Splits `Strings' into `{Atoms, Invalid}': strings that name an existing
%% atom are converted, the rest are returned untouched. Input order is
%% preserved in both lists.
strings_to_atoms(Strings) ->
    Classify = fun(Str, {AtomsAcc, InvalidAcc}) ->
        case emqx_utils:safe_to_existing_atom(Str, utf8) of
            {ok, Atom} -> {[Atom | AtomsAcc], InvalidAcc};
            {error, _} -> {AtomsAcc, [Str | InvalidAcc]}
        end
    end,
    {RevAtoms, RevInvalid} = lists:foldl(Classify, {[], []}, Strings),
    {lists:reverse(RevAtoms), lists:reverse(RevInvalid)}.

View File

@ -0,0 +1,308 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_evacuation).
-include("emqx_node_rebalance.hrl").
-include_lib("emqx/include/logger.hrl").
-include_lib("emqx/include/types.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-export([
start/1,
status/0,
stop/0
]).
-export([start_link/0]).
-behaviour(gen_statem).
-export([
init/1,
callback_mode/0,
handle_event/4,
code_change/4
]).
-export([
is_node_available/0,
available_nodes/1
]).
%% `migrate_to/0' is exported as well: the persistence module refers to it as
%% the remote type `emqx_node_rebalance_evacuation:migrate_to()'.
-export_type([
    start_opts/0,
    start_error/0,
    migrate_to/0
]).
-ifdef(TEST).
-export([migrate_to/1]).
-endif.
%%--------------------------------------------------------------------
%% APIs
%%--------------------------------------------------------------------
%% Delay in milliseconds before session eviction is retried when there is no
%% node to migrate sessions to.
-define(EVICT_INTERVAL_NO_NODES, 30000).
%% Session migration targets; `undefined' makes migrate_to/1 fall back to all
%% other running nodes.
-type migrate_to() :: [node()] | undefined.
%% Options accepted by start/1; missing keys are filled from default_opts/0.
-type start_opts() :: #{
server_reference => emqx_eviction_agent:server_reference(),
conn_evict_rate => pos_integer(),
sess_evict_rate => pos_integer(),
wait_takeover => pos_integer(),
migrate_to => migrate_to()
}.
%% Reasons for which start/1 can be refused.
-type start_error() :: already_started | eviction_agent_busy.
%% Details reported by status/0 while an evacuation is enabled.
%% NOTE(review): the status reply also carries a `state' key that is not
%% listed here -- confirm whether the type should include it.
-type stats() :: #{
initial_conns := non_neg_integer(),
initial_sessions := non_neg_integer(),
current_conns := non_neg_integer(),
current_sessions := non_neg_integer(),
conn_evict_rate := pos_integer(),
sess_evict_rate := pos_integer(),
server_reference := emqx_eviction_agent:server_reference(),
migrate_to := migrate_to()
}.
-type status() :: {enabled, stats()} | disabled.
%% Asks the evacuation state machine to start an evacuation. Options missing
%% from `StartOpts' are taken from default_opts/0.
-spec start(start_opts()) -> ok_or_error(start_error()).
start(StartOpts) ->
    EffectiveOpts = maps:merge(default_opts(), StartOpts),
    Request = {start, EffectiveOpts},
    gen_statem:call(?MODULE, Request).
%% Asks the evacuation state machine to stop the current evacuation.
-spec stop() -> ok_or_error(not_started).
stop() ->
    Request = stop,
    gen_statem:call(?MODULE, Request).
%% Queries the evacuation state machine for its current status.
-spec status() -> status().
status() ->
    Request = status,
    gen_statem:call(?MODULE, Request).
%% Starts the evacuation state machine, registered locally under the module
%% name.
-spec start_link() -> startlink_ret().
start_link() ->
    RegName = {local, ?MODULE},
    gen_statem:start_link(RegName, ?MODULE, [], []).
%% Queries `Nodes' through the evacuation proto module and returns the atom
%% entries of the first element of its result.
-spec available_nodes(list(node())) -> list(node()).
available_nodes(Nodes) when is_list(Nodes) ->
    {Replies, _} = emqx_node_rebalance_evacuation_proto_v1:available_nodes(Nodes),
    [Reply || Reply <- Replies, is_atom(Reply)].
%%--------------------------------------------------------------------
%% gen_statem callbacks
%%--------------------------------------------------------------------
%% All events are dispatched to handle_event/4.
callback_mode() ->
    handle_event_function.
%% states: disabled, evicting_conns, waiting_takeover, evicting_sessions, prohibiting
%% Starts in `disabled' unless persisted evacuation options are found. In
%% that case the eviction agent is re-enabled and connection eviction resumes
%% immediately; if the agent is busy the persisted state is cleared instead.
init([]) ->
    Persisted = emqx_node_rebalance_evacuation_persist:read(default_opts()),
    case Persisted of
        none ->
            {ok, disabled, #{}};
        {ok, #{server_reference := ServerReference} = Opts} ->
            ?SLOG(warning, #{msg => "restoring_evacuation_state", opts => Opts}),
            case emqx_eviction_agent:enable(?MODULE, ServerReference) of
                {error, eviction_agent_busy} ->
                    emqx_node_rebalance_evacuation_persist:clear(),
                    {ok, disabled, #{}};
                ok ->
                    RestoredData = init_data(#{}, Opts),
                    ok = warn_enabled(),
                    Actions = [{state_timeout, 0, evict_conns}],
                    {ok, evicting_conns, RestoredData, Actions}
            end
    end.
%% start
%% gen_statem event handler.
%%
%% State flow visible in this function:
%%   disabled -> evicting_conns -> waiting_takeover -> evicting_sessions
%%   -> prohibiting, with `stop' returning any non-disabled state to disabled.
%%
%% `start' while disabled: enable the eviction agent, record the initial
%% counters, persist the options and begin evicting connections right away.
handle_event(
{call, From},
{start, #{server_reference := ServerReference} = Opts},
disabled,
#{} = Data
) ->
case emqx_eviction_agent:enable(?MODULE, ServerReference) of
ok ->
NewData = init_data(Data, Opts),
ok = emqx_node_rebalance_evacuation_persist:save(Opts),
?SLOG(warning, #{
msg => "node_evacuation_started",
opts => Opts
}),
{next_state, evicting_conns, NewData, [
{state_timeout, 0, evict_conns},
{reply, From, ok}
]};
{error, eviction_agent_busy} ->
{keep_state_and_data, [{reply, From, {error, eviction_agent_busy}}]}
end;
%% `start' in any other state is refused.
handle_event({call, From}, {start, _Opts}, _State, #{}) ->
{keep_state_and_data, [{reply, From, {error, already_started}}]};
%% stop
handle_event({call, From}, stop, disabled, #{}) ->
{keep_state_and_data, [{reply, From, {error, not_started}}]};
%% `stop' in any non-disabled state: drop the persisted options, disable the
%% eviction agent and strip the evacuation keys from the data.
handle_event({call, From}, stop, _State, Data) ->
ok = emqx_node_rebalance_evacuation_persist:clear(),
_ = emqx_eviction_agent:disable(?MODULE),
?SLOG(warning, #{msg => "node_evacuation_stopped"}),
{next_state, disabled, deinit(Data), [{reply, From, ok}]};
%% status
handle_event({call, From}, status, disabled, #{}) ->
{keep_state_and_data, [{reply, From, disabled}]};
%% While enabled, reply with the selected counters and options plus the
%% current state and the migration targets resolved by migrate_to/1.
handle_event({call, From}, status, State, #{migrate_to := MigrateTo} = Data) ->
Stats = maps:with(
[
initial_conns,
current_conns,
initial_sessions,
current_sessions,
server_reference,
conn_evict_rate,
sess_evict_rate
],
Data
),
{keep_state_and_data, [
{reply, From, {enabled, Stats#{state => State, migrate_to => migrate_to(MigrateTo)}}}
]};
%% conn eviction
%% Each tick evicts up to `conn_evict_rate' connections and re-arms the timer
%% with ?EVICT_INTERVAL. Once no connections remain, wait `wait_takeover'
%% seconds before session eviction starts.
handle_event(
state_timeout,
evict_conns,
evicting_conns,
#{
conn_evict_rate := ConnEvictRate,
wait_takeover := WaitTakeover
} = Data
) ->
case emqx_eviction_agent:status() of
{enabled, #{connections := Conns}} when Conns > 0 ->
ok = emqx_eviction_agent:evict_connections(ConnEvictRate),
?tp(debug, node_evacuation_evict_conn, #{conn_evict_rate => ConnEvictRate}),
?SLOG(
warning,
#{
msg => "node_evacuation_evict_conns",
count => Conns,
conn_evict_rate => ConnEvictRate
}
),
NewData = Data#{current_conns => Conns},
{keep_state, NewData, [{state_timeout, ?EVICT_INTERVAL, evict_conns}]};
{enabled, #{connections := 0}} ->
NewData = Data#{current_conns => 0},
?SLOG(warning, #{msg => "node_evacuation_evict_conns_done"}),
{next_state, waiting_takeover, NewData, [
{state_timeout, timer:seconds(WaitTakeover), evict_sessions}
]}
end;
%% The takeover wait has elapsed: switch to session eviction immediately.
handle_event(
state_timeout,
evict_sessions,
waiting_takeover,
Data
) ->
?SLOG(warning, #{msg => "node_evacuation_waiting_takeover_done"}),
{next_state, evicting_sessions, Data, [{state_timeout, 0, evict_sessions}]};
%% session eviction
%% Each tick evicts up to `sess_evict_rate' sessions to the currently
%% available target nodes. With no target available, retry after
%% ?EVICT_INTERVAL_NO_NODES. Once no sessions remain, move to `prohibiting'.
handle_event(
state_timeout,
evict_sessions,
evicting_sessions,
#{
sess_evict_rate := SessEvictRate,
migrate_to := MigrateTo,
current_sessions := CurrSessCount
} = Data
) ->
case emqx_eviction_agent:status() of
{enabled, #{sessions := SessCount}} when SessCount > 0 ->
case migrate_to(MigrateTo) of
[] ->
%% NOTE(review): this logs the previously recorded count (CurrSessCount),
%% not the freshly read SessCount -- confirm this is intended.
?SLOG(warning, #{
msg => "no_nodes_to_evacuate_sessions", session_count => CurrSessCount
}),
{keep_state_and_data, [
{state_timeout, ?EVICT_INTERVAL_NO_NODES, evict_sessions}
]};
Nodes ->
ok = emqx_eviction_agent:evict_sessions(SessEvictRate, Nodes),
?SLOG(
warning,
#{
msg => "node_evacuation_evict_sessions",
session_count => SessCount,
session_evict_rate => SessEvictRate,
target_nodes => Nodes
}
),
NewData = Data#{current_sessions => SessCount},
{keep_state, NewData, [{state_timeout, ?EVICT_INTERVAL, evict_sessions}]}
end;
{enabled, #{sessions := 0}} ->
?tp(debug, node_evacuation_evict_sess_over, #{}),
?SLOG(warning, #{msg => "node_evacuation_evict_sessions_over"}),
NewData = Data#{current_sessions => 0},
{next_state, prohibiting, NewData}
end;
%% Fallbacks: unknown calls are answered with `ignored'; unknown info and
%% cast messages are logged and dropped.
handle_event({call, From}, Msg, State, Data) ->
?SLOG(warning, #{msg => "unknown_call", call => Msg, state => State, data => Data}),
{keep_state_and_data, [{reply, From, ignored}]};
handle_event(info, Msg, State, Data) ->
?SLOG(warning, #{msg => "unknown_info", info => Msg, state => State, data => Data}),
keep_state_and_data;
handle_event(cast, Msg, State, Data) ->
?SLOG(warning, #{msg => "unknown_cast", cast => Msg, state => State, data => Data}),
keep_state_and_data.
%% No state or data conversion is performed on code change.
code_change(_OldVsn, StateName, StateData, _Extra) ->
    {ok, StateName, StateData}.
%%--------------------------------------------------------------------
%% internal funs
%%--------------------------------------------------------------------
%% Default evacuation options: no server reference, no explicit migration
%% targets, and the rates / takeover wait taken from the ?DEFAULT_* macros.
default_opts() ->
    Unset = #{server_reference => undefined, migrate_to => undefined},
    Unset#{
        wait_takeover => ?DEFAULT_WAIT_TAKEOVER,
        sess_evict_rate => ?DEFAULT_SESS_EVICT_RATE,
        conn_evict_rate => ?DEFAULT_CONN_EVICT_RATE
    }.
%% Merges `Opts' into the state data and records the eviction agent's current
%% connection and session counts as both the initial and current counters.
%% Crashes unless the eviction agent reports `enabled'.
init_data(Data0, Opts) ->
    Merged = maps:merge(Data0, Opts),
    {enabled, AgentStats} = emqx_eviction_agent:status(),
    #{connections := ConnCount, sessions := SessCount} = AgentStats,
    Counters = #{
        initial_conns => ConnCount,
        current_conns => ConnCount,
        initial_sessions => SessCount,
        current_sessions => SessCount
    },
    maps:merge(Merged, Counters).
%% Removes the counters and every option key known to default_opts/0 from
%% the state data.
deinit(Data) ->
    CounterKeys = [initial_conns, current_conns, initial_sessions, current_sessions],
    OptionKeys = maps:keys(default_opts()),
    maps:without(CounterKeys ++ OptionKeys, Data).
%% Logs a warning and prints a notice to standard error saying that node
%% evacuation is enabled.
warn_enabled() ->
    ?SLOG(warning, #{msg => "node_evacuation_enabled"}),
    Notice = "Node evacuation is enabled. The node will not receive connections.~n",
    io:format(standard_error, Notice, []).
%% Resolves the migration targets to the nodes currently available.
%% `undefined' means "every other running node".
migrate_to(undefined) ->
    Candidates = all_nodes(),
    migrate_to(Candidates);
migrate_to(Candidates) when is_list(Candidates) ->
    available_nodes(Candidates).
%% Returns this node's name when the local eviction agent is disabled;
%% crashes with a badmatch otherwise.
is_node_available() ->
    AgentStatus = emqx_eviction_agent:status(),
    disabled = AgentStatus,
    node().
%% All running nodes except the local one.
all_nodes() ->
    Running = mria_mnesia:running_nodes(),
    lists:delete(node(), Running).

View File

@ -0,0 +1,120 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_evacuation_persist).
-export([
save/1,
clear/0,
read/1
]).
-ifdef(TEST).
-export([evacuation_filepath/0]).
-endif.
-include("emqx_node_rebalance.hrl").
-include_lib("emqx/include/types.hrl").
%%--------------------------------------------------------------------
%% APIs
%%--------------------------------------------------------------------
%% `migrate_to` is deliberately not persisted:
%% * after a restart there is nothing to migrate
%% * the value may be invalid after the node was offline
%% Hence the persisted options are the start options without `migrate_to`.
-type persisted_start_opts() :: #{
server_reference => emqx_eviction_agent:server_reference(),
conn_evict_rate => pos_integer(),
sess_evict_rate => pos_integer(),
wait_takeover => pos_integer()
}.
%% Full start options, as passed to and returned from read/1.
-type start_opts() :: #{
server_reference => emqx_eviction_agent:server_reference(),
conn_evict_rate => pos_integer(),
sess_evict_rate => pos_integer(),
wait_takeover => pos_integer(),
migrate_to => emqx_node_rebalance_evacuation:migrate_to()
}.
%% Writes the persistable subset of the evacuation options (see
%% persist_keys/0) as pretty-printed JSON to the evacuation file, creating
%% the parent directory when needed. Input that fails the guard raises
%% `function_clause'.
-spec save(persisted_start_opts()) -> ok_or_error(term()).
save(
    #{
        server_reference := ServerReference,
        conn_evict_rate := ConnEvictRate,
        sess_evict_rate := SessEvictRate,
        wait_takeover := WaitTakeover
    } = Opts
) when
    (is_binary(ServerReference) orelse ServerReference =:= undefined) andalso
        is_integer(ConnEvictRate) andalso ConnEvictRate > 0 andalso
        is_integer(SessEvictRate) andalso SessEvictRate > 0 andalso
        is_integer(WaitTakeover) andalso WaitTakeover >= 0
->
    Path = evacuation_filepath(),
    case filelib:ensure_dir(Path) of
        {error, _} = Error ->
            Error;
        ok ->
            Persistable = maps:with(persist_keys(), Opts),
            Json = emqx_utils_json:encode(prepare_for_encode(Persistable), [pretty]),
            file:write_file(Path, Json)
    end.
%% Deletes the persisted evacuation options.
%%
%% A missing file counts as success, so the call is idempotent and callers
%% asserting `ok = clear()' do not crash when nothing was persisted. Any
%% other filesystem error is returned as `{error, Reason}'.
-spec clear() -> ok | {error, file:posix() | badarg}.
clear() ->
    case file:delete(evacuation_filepath()) of
        ok -> ok;
        {error, enoent} -> ok;
        {error, _} = Error -> Error
    end.
%% Reads the persisted evacuation options.
%%
%% Returns `none' when the file cannot be read. When it can be read, the
%% result is always `{ok, Opts}': values decoded from a JSON object override
%% `DefaultOpts', and content that does not decode to a JSON object yields
%% `DefaultOpts' unchanged.
-spec read(start_opts()) -> {ok, start_opts()} | none.
read(DefaultOpts) ->
    case file:read_file(evacuation_filepath()) of
        {error, _} ->
            none;
        {ok, Content} ->
            Opts =
                case emqx_utils_json:safe_decode(Content, [return_maps]) of
                    {ok, #{} = Decoded} -> map_to_opts(DefaultOpts, Decoded);
                    _NotAMap -> DefaultOpts
                end,
            {ok, Opts}
    end.
%%--------------------------------------------------------------------
%% Internal funcs
%%--------------------------------------------------------------------
%% Option keys written to the evacuation file by save/1.
persist_keys() ->
[
server_reference,
conn_evict_rate,
sess_evict_rate,
wait_takeover
].
%% Replaces an `undefined' server reference with `null' before encoding;
%% any other input is returned unchanged.
prepare_for_encode(Data) ->
    case Data of
        #{server_reference := undefined} -> Data#{server_reference => null};
        _ -> Data
    end.
%% Replaces a decoded `null' server reference with `undefined'; any other
%% input is returned unchanged.
format_after_decode(Data) ->
    case Data of
        #{server_reference := null} -> Data#{server_reference => undefined};
        _ -> Data
    end.
%% Builds an options map with exactly the keys of `DefaultOpts'. Each value
%% comes from the decoded map (looked up by the binary form of the key) or,
%% when absent there, from the default. The result is then passed through
%% format_after_decode/1.
map_to_opts(DefaultOpts, Map) ->
    PickValue = fun(Key, DefaultVal) ->
        maps:get(atom_to_binary(Key), Map, DefaultVal)
    end,
    format_after_decode(maps:map(PickValue, DefaultOpts)).
%% Path of the evacuation file: ?EVACUATION_FILENAME inside the EMQX data
%% directory.
evacuation_filepath() ->
    DataDir = emqx:data_dir(),
    filename:join([DataDir, ?EVACUATION_FILENAME]).

View File

@ -0,0 +1,238 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_status).
-export([
local_status/0,
local_status/1,
global_status/0,
format_local_status/1,
format_coordinator_status/1
]).
%% For RPC
-export([
evacuation_status/0,
rebalance_status/0
]).
%%--------------------------------------------------------------------
%% APIs
%%--------------------------------------------------------------------
%% Status of the local node. An enabled evacuation takes precedence;
%% otherwise the rebalance agent is consulted and, when it is enabled, the
%% status is taken from its coordinator.
-spec local_status() -> disabled | {evacuation, map()} | {rebalance, map()}.
local_status() ->
    case emqx_node_rebalance_evacuation:status() of
        disabled ->
            local_agent_status(emqx_node_rebalance_agent:status());
        {enabled, Status} ->
            {evacuation, evacuation(Status)}
    end.

%% Maps the rebalance agent's status to this node's rebalance status.
local_agent_status(disabled) ->
    disabled;
local_agent_status({enabled, CoordinatorPid}) ->
    case emqx_node_rebalance:status(CoordinatorPid) of
        disabled -> disabled;
        {enabled, Status} -> local_rebalance(Status, node())
    end.
%% Status of the given node, obtained through the status proto module.
-spec local_status(node()) -> disabled | {evacuation, map()} | {rebalance, map()}.
local_status(Node) ->
    RemoteStatus = emqx_node_rebalance_status_proto_v1:local_status(Node),
    RemoteStatus.
%% Renders a local status map as iodata, in the local field order.
-spec format_local_status(map()) -> iodata().
format_local_status(Status) ->
    FieldOrder = local_status_field_format_order(),
    format_status(Status, FieldOrder).
%% Collects the rebalance and evacuation statuses of all running nodes and
%% keeps only the entries reported as `enabled'.
-spec global_status() -> #{rebalances := [{node(), map()}], evacuations := [{node(), map()}]}.
global_status() ->
    RunningNodes = mria_mnesia:running_nodes(),
    {RebalanceResults, _} = emqx_node_rebalance_status_proto_v1:rebalance_status(RunningNodes),
    Rebalances = lists:filtermap(
        fun
            ({Node, {enabled, Status}}) -> {true, {Node, coordinator_rebalance(Status)}};
            (_) -> false
        end,
        RebalanceResults
    ),
    {EvacuationResults, _} = emqx_node_rebalance_status_proto_v1:evacuation_status(RunningNodes),
    Evacuations = lists:filtermap(
        fun
            ({Node, {enabled, Status}}) -> {true, {Node, evacuation(Status)}};
            (_) -> false
        end,
        EvacuationResults
    ),
    #{evacuations => Evacuations, rebalances => Rebalances}.
%% Renders a coordinator status map as iodata, in the coordinator field order.
-spec format_coordinator_status(map()) -> iodata().
format_coordinator_status(Status) ->
    FieldOrder = coordinator_status_field_format_order(),
    format_status(Status, FieldOrder).
%%--------------------------------------------------------------------
%% Internal functions
%%--------------------------------------------------------------------
%% Converts the evacuation state machine's status map into the reported
%% status format. Connection and session goals are always 0.
evacuation(Status) ->
    State = maps:get(state, Status),
    ConnRate = maps:get(conn_evict_rate, Status),
    SessRate = maps:get(sess_evict_rate, Status),
    Recipients = maps:get(migrate_to, Status),
    InitialConns = maps:get(initial_conns, Status),
    CurrentConns = maps:get(current_conns, Status),
    InitialSessions = maps:get(initial_sessions, Status),
    CurrentSessions = maps:get(current_sessions, Status),
    Stats = #{
        initial_connected => InitialConns,
        current_connected => CurrentConns,
        initial_sessions => InitialSessions,
        current_sessions => CurrentSessions
    },
    #{
        state => State,
        connection_eviction_rate => ConnRate,
        session_eviction_rate => SessRate,
        connection_goal => 0,
        session_goal => 0,
        session_recipients => Recipients,
        stats => Stats
    }.
%% A node has a local rebalance status only when it is one of the donors;
%% any other node is reported as `disabled'.
local_rebalance(#{donors := Donors} = Stats, Node) ->
    IsDonor = lists:member(Node, Donors),
    if
        IsDonor -> {rebalance, donor_rebalance(Stats, Node)};
        true -> disabled
    end.
%% Builds a donor node's rebalance status from the coordinator status, this
%% node's initial counters and the eviction agent's current counters. The
%% goal fields appear only when the coordinator status carries the
%% corresponding recipient averages.
donor_rebalance(Status, Node) ->
    Opts = maps:get(opts, Status),
    InitialConnCounts = maps:get(initial_conn_counts, Status),
    InitialSessCounts = maps:get(initial_sess_counts, Status),
    InitialConnected = maps:get(Node, InitialConnCounts),
    InitialSessions = maps:get(Node, InitialSessCounts),
    CurrentConnected = emqx_eviction_agent:connection_count(),
    CurrentSessions = emqx_eviction_agent:session_count(),
    CurrentDisconnected = emqx_eviction_agent:session_count(disconnected),
    CurrentStats = #{
        initial_connected => InitialConnected,
        initial_sessions => InitialSessions,
        current_connected => CurrentConnected,
        current_sessions => CurrentSessions,
        current_disconnected_sessions => CurrentDisconnected
    },
    State = maps:get(state, Status),
    CoordinatorNode = maps:get(coordinator_node, Status),
    ConnRate = maps:get(conn_evict_rate, Opts),
    SessRate = maps:get(sess_evict_rate, Opts),
    Recipients = maps:get(recipients, Status),
    Base = #{
        state => State,
        coordinator_node => CoordinatorNode,
        connection_eviction_rate => ConnRate,
        session_eviction_rate => SessRate,
        recipients => Recipients,
        stats => CurrentStats
    },
    %% {reported key, key in the coordinator status}
    OptionalGoals = [
        {connection_goal, recipient_conn_avg},
        {disconnected_session_goal, recipient_sess_avg}
    ],
    lists:foldl(
        fun({OutKey, InKey}, Acc) ->
            case maps:find(InKey, Status) of
                {ok, Value} -> Acc#{OutKey => Value};
                error -> Acc
            end
        end,
        Base,
        OptionalGoals
    ).
%% Converts a rebalance coordinator's status into the reported format. Goals
%% and donor averages appear only when the coordinator status carries the
%% corresponding keys.
coordinator_rebalance(Status) ->
    Opts = maps:get(opts, Status),
    State = maps:get(state, Status),
    CoordinatorNode = maps:get(coordinator_node, Status),
    ConnRate = maps:get(conn_evict_rate, Opts),
    SessRate = maps:get(sess_evict_rate, Opts),
    Recipients = maps:get(recipients, Status),
    Donors = maps:get(donors, Status),
    Base = #{
        state => State,
        coordinator_node => CoordinatorNode,
        connection_eviction_rate => ConnRate,
        session_eviction_rate => SessRate,
        recipients => Recipients,
        donors => Donors
    },
    %% {reported key, key in the coordinator status}
    OptionalFields = [
        {connection_goal, recipient_conn_avg},
        {disconnected_session_goal, recipient_sess_avg},
        {donor_conn_avg, donor_conn_avg},
        {donor_sess_avg, donor_sess_avg}
    ],
    lists:foldl(
        fun({OutKey, InKey}, Acc) ->
            case maps:find(InKey, Status) of
                {ok, Value} -> Acc#{OutKey => Value};
                error -> Acc
            end
        end,
        Base,
        OptionalFields
    ).
%% Order in which the fields of a local status are rendered by
%% format_local_status/1; fields missing from the status are skipped.
local_status_field_format_order() ->
[
state,
coordinator_node,
connection_eviction_rate,
session_eviction_rate,
connection_goal,
session_goal,
disconnected_session_goal,
session_recipients,
recipients,
stats
].
%% Order in which the fields of a coordinator status are rendered by
%% format_coordinator_status/1; fields missing from the status are skipped.
coordinator_status_field_format_order() ->
[
state,
coordinator_node,
donors,
recipients,
connection_eviction_rate,
session_eviction_rate,
connection_goal,
disconnected_session_goal,
donor_conn_avg,
donor_sess_avg
].
%% Renders the fields of `Status' that appear in `FieldOrder', in that order;
%% fields missing from `Status' are skipped.
format_status(Status, FieldOrder) ->
    [
        format_local_status_field({FieldName, maps:get(FieldName, Status)})
     || FieldName <- FieldOrder, maps:is_key(FieldName, Status)
    ].
%% Renders one `{Field, Value}' pair of a status map as a line of text;
%% `stats' is rendered as a multi-line block by format_local_stats/1.
format_local_status_field({stats, Stats}) ->
    format_local_stats(Stats);
format_local_status_field({Field, Value}) ->
    io_lib:format(status_field_template(Field), [Value]).

%% Format string for each single-line status field.
status_field_template(state) ->
    "Rebalance state: ~p~n";
status_field_template(coordinator_node) ->
    "Coordinator node: ~p~n";
status_field_template(connection_eviction_rate) ->
    "Connection eviction rate: ~p connections/second~n";
status_field_template(session_eviction_rate) ->
    "Session eviction rate: ~p sessions/second~n";
status_field_template(connection_goal) ->
    "Connection goal: ~p~n";
status_field_template(session_goal) ->
    "Session goal: ~p~n";
status_field_template(disconnected_session_goal) ->
    "Disconnected session goal: ~p~n";
status_field_template(session_recipients) ->
    "Session recipient nodes: ~p~n";
status_field_template(recipients) ->
    "Recipient nodes: ~p~n";
status_field_template(donors) ->
    "Donor nodes: ~p~n";
status_field_template(donor_conn_avg) ->
    "Current average donor node connection count: ~p~n";
status_field_template(donor_sess_avg) ->
    "Current average donor node disconnected session count: ~p~n".
%% Renders the statistics map as a header line followed by one
%% "name: value" line per entry.
format_local_stats(Stats) ->
    Lines = [
        io_lib:format(" ~p: ~p~n", [Name, Value])
     || {Name, Value} <- maps:to_list(Stats)
    ],
    ["Channel statistics:\n" | Lines].
%% RPC target: this node's name paired with its evacuation status.
evacuation_status() ->
    Status = emqx_node_rebalance_evacuation:status(),
    {node(), Status}.
%% RPC target: this node's name paired with its rebalance status.
rebalance_status() ->
    Status = emqx_node_rebalance:status(),
    {node(), Status}.

View File

@ -0,0 +1,35 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_sup).
-behaviour(supervisor).
-export([start_link/0]).
-export([init/1]).
%% Start the supervisor, registered locally under the module name.
start_link() ->
    SupName = {local, ?MODULE},
    supervisor:start_link(SupName, ?MODULE, []).
%% supervisor callback: one_for_one supervision of the three rebalance
%% workers, tolerating at most 10 restarts within 3600 seconds.
init([]) ->
    SupFlags = #{strategy => one_for_one, intensity => 10, period => 3600},
    Workers = [
        emqx_node_rebalance_evacuation,
        emqx_node_rebalance_agent,
        emqx_node_rebalance
    ],
    ChildSpecs = [child_spec(Worker, []) || Worker <- Workers],
    {ok, {SupFlags, ChildSpecs}}.
%% Build the child spec of a permanent worker that is started with
%% `Mod:start_link(Args...)' and given 5000 ms to shut down.
child_spec(Mod, Args) ->
    StartMFA = {Mod, start_link, Args},
    #{
        id => Mod,
        start => StartMFA,
        type => worker,
        restart => permanent,
        shutdown => 5000,
        modules => [Mod]
    }.

View File

@ -0,0 +1,43 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_api_proto_v1).
-behaviour(emqx_bpapi).
-export([
introduced_in/0,
node_rebalance_evacuation_start/2,
node_rebalance_evacuation_stop/1,
node_rebalance_start/2,
node_rebalance_stop/1
]).
-include_lib("emqx/include/bpapi.hrl").
-include_lib("emqx/include/types.hrl").
%% Returns the version string "5.0.22".
%% NOTE(review): presumably the release in which this protocol version first
%% appeared (emqx_bpapi behaviour callback) - confirm against emqx_bpapi.
introduced_in() ->
"5.0.22".
%% Calls emqx_node_rebalance_evacuation:start(Opts) on `Node' via rpc:call/4.
%% `Opts' must be a map (the function head matches `#{} = Opts').
%% Returns the remote result, or a badrpc term if the call itself fails.
-spec node_rebalance_evacuation_start(node(), emqx_node_rebalance_evacuation:start_opts()) ->
emqx_rpc:badrpc() | ok_or_error(emqx_node_rebalance_evacuation:start_error()).
node_rebalance_evacuation_start(Node, #{} = Opts) ->
rpc:call(Node, emqx_node_rebalance_evacuation, start, [Opts]).
%% Calls emqx_node_rebalance_evacuation:stop() on `Node' via rpc:call/4.
%% Returns the remote result, or a badrpc term if the call itself fails.
-spec node_rebalance_evacuation_stop(node()) ->
emqx_rpc:badrpc() | ok_or_error(not_started).
node_rebalance_evacuation_stop(Node) ->
rpc:call(Node, emqx_node_rebalance_evacuation, stop, []).
%% Calls emqx_node_rebalance:start(Opts) on `Node' via rpc:call/4.
%% Returns the remote result, or a badrpc term if the call itself fails.
-spec node_rebalance_start(node(), emqx_node_rebalance:start_opts()) ->
emqx_rpc:badrpc() | ok_or_error(emqx_node_rebalance:start_error()).
node_rebalance_start(Node, Opts) ->
rpc:call(Node, emqx_node_rebalance, start, [Opts]).
%% Calls emqx_node_rebalance:stop() on `Node' via rpc:call/4.
%% Returns the remote result, or a badrpc term if the call itself fails.
-spec node_rebalance_stop(node()) ->
emqx_rpc:badrpc() | ok_or_error(not_started).
node_rebalance_stop(Node) ->
rpc:call(Node, emqx_node_rebalance, stop, []).

View File

@ -0,0 +1,22 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_evacuation_proto_v1).
-behaviour(emqx_bpapi).
-export([
introduced_in/0,
available_nodes/1
]).
-include_lib("emqx/include/bpapi.hrl").
%% Returns the version string "5.0.22".
%% NOTE(review): presumably the release in which this protocol version first
%% appeared (emqx_bpapi behaviour callback) - confirm against emqx_bpapi.
introduced_in() ->
"5.0.22".
%% Calls emqx_node_rebalance_evacuation:is_node_available() on every node in
%% `Nodes' via rpc:multicall/4 and returns the multicall result unchanged.
-spec available_nodes([node()]) -> emqx_rpc:multicall_result(node()).
available_nodes(Nodes) ->
rpc:multicall(Nodes, emqx_node_rebalance_evacuation, is_node_available, []).

View File

@ -0,0 +1,62 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_proto_v1).
-behaviour(emqx_bpapi).
-export([
introduced_in/0,
available_nodes/1,
evict_connections/2,
evict_sessions/4,
connection_counts/1,
session_counts/1,
enable_rebalance_agent/2,
disable_rebalance_agent/2,
disconnected_session_counts/1
]).
-include_lib("emqx/include/bpapi.hrl").
-include_lib("emqx/include/types.hrl").
%% Returns the version string "5.0.22".
%% NOTE(review): presumably the release in which this protocol version first
%% appeared (emqx_bpapi behaviour callback) - confirm against emqx_bpapi.
introduced_in() ->
"5.0.22".
%% Calls emqx_node_rebalance:is_node_available() on every node in `Nodes'
%% via rpc:multicall/4 and returns the multicall result unchanged.
-spec available_nodes([node()]) -> emqx_rpc:multicall_result(node()).
available_nodes(Nodes) ->
rpc:multicall(Nodes, emqx_node_rebalance, is_node_available, []).
%% Calls emqx_eviction_agent:evict_connections(Count) on every node in
%% `Nodes' via rpc:multicall/4 and returns the multicall result unchanged.
-spec evict_connections([node()], non_neg_integer()) ->
emqx_rpc:multicall_result(ok_or_error(disabled)).
evict_connections(Nodes, Count) ->
rpc:multicall(Nodes, emqx_eviction_agent, evict_connections, [Count]).
%% Calls emqx_eviction_agent:evict_sessions(Count, RecipientNodes, ConnState)
%% on every node in `Nodes' via rpc:multicall/4 and returns the multicall
%% result unchanged.
-spec evict_sessions([node()], non_neg_integer(), [node()], emqx_channel:conn_state()) ->
emqx_rpc:multicall_result(ok_or_error(disabled)).
evict_sessions(Nodes, Count, RecipientNodes, ConnState) ->
rpc:multicall(Nodes, emqx_eviction_agent, evict_sessions, [Count, RecipientNodes, ConnState]).
%% Calls emqx_node_rebalance:connection_count() on every node in `Nodes'
%% via rpc:multicall/4 and returns the multicall result unchanged.
-spec connection_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}).
connection_counts(Nodes) ->
rpc:multicall(Nodes, emqx_node_rebalance, connection_count, []).
%% Calls emqx_node_rebalance:session_count() on every node in `Nodes'
%% via rpc:multicall/4 and returns the multicall result unchanged.
-spec session_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}).
session_counts(Nodes) ->
rpc:multicall(Nodes, emqx_node_rebalance, session_count, []).
%% Calls emqx_node_rebalance_agent:enable(OwnerPid) on every node in `Nodes'
%% via rpc:multicall/4 and returns the multicall result unchanged.
-spec enable_rebalance_agent([node()], pid()) ->
emqx_rpc:multicall_result(ok_or_error(already_enabled | eviction_agent_busy)).
enable_rebalance_agent(Nodes, OwnerPid) ->
rpc:multicall(Nodes, emqx_node_rebalance_agent, enable, [OwnerPid]).
%% Calls emqx_node_rebalance_agent:disable(OwnerPid) on every node in `Nodes'
%% via rpc:multicall/4 and returns the multicall result unchanged.
-spec disable_rebalance_agent([node()], pid()) ->
emqx_rpc:multicall_result(ok_or_error(already_disabled | invalid_coordinator)).
disable_rebalance_agent(Nodes, OwnerPid) ->
rpc:multicall(Nodes, emqx_node_rebalance_agent, disable, [OwnerPid]).
%% Calls emqx_node_rebalance:disconnected_session_count() on every node in
%% `Nodes' via rpc:multicall/4 and returns the multicall result unchanged.
-spec disconnected_session_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}).
disconnected_session_counts(Nodes) ->
rpc:multicall(Nodes, emqx_node_rebalance, disconnected_session_count, []).

View File

@ -0,0 +1,36 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_status_proto_v1).
-behaviour(emqx_bpapi).
-export([
introduced_in/0,
local_status/1,
rebalance_status/1,
evacuation_status/1
]).
-include_lib("emqx/include/bpapi.hrl").
-include_lib("emqx/include/types.hrl").
%% Returns the version string "5.0.22".
%% NOTE(review): presumably the release in which this protocol version first
%% appeared (emqx_bpapi behaviour callback) - confirm against emqx_bpapi.
introduced_in() ->
"5.0.22".
%% Calls emqx_node_rebalance_status:local_status() on `Node' via rpc:call/4.
%% Returns the remote result, or a badrpc term if the call itself fails.
-spec local_status(node()) ->
emqx_rpc:badrpc() | disabled | {evacuation, map()} | {rebalance, map()}.
local_status(Node) ->
rpc:call(Node, emqx_node_rebalance_status, local_status, []).
%% Calls emqx_node_rebalance_status:rebalance_status() on every node in
%% `Nodes' via rpc:multicall/4 and returns the multicall result unchanged.
-spec rebalance_status([node()]) ->
emqx_rpc:multicall_result({node(), map()}).
rebalance_status(Nodes) ->
rpc:multicall(Nodes, emqx_node_rebalance_status, rebalance_status, []).
%% Calls emqx_node_rebalance_status:evacuation_status() on every node in
%% `Nodes' via rpc:multicall/4 and returns the multicall result unchanged.
-spec evacuation_status([node()]) ->
emqx_rpc:multicall_result({node(), map()}).
evacuation_status(Nodes) ->
rpc:multicall(Nodes, emqx_node_rebalance_status, evacuation_status, []).

View File

@ -0,0 +1,229 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("emqx/include/emqx.hrl").
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("emqx/include/asserts.hrl").
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-import(
emqx_eviction_agent_test_helpers,
[emqtt_connect_many/1, emqtt_connect_many/2, stop_many/1, case_specific_node_name/3]
).
-define(START_APPS, [emqx_eviction_agent, emqx_node_rebalance]).
%% Common Test callback: the test case list is computed by
%% emqx_common_test_helpers:all/1 for this suite module.
all() ->
    Suite = ?MODULE,
    emqx_common_test_helpers:all(Suite).
%% Suite setup: run the common start_apps helper with no extra applications
%% and pass the CT config through untouched.
init_per_suite(SuiteConfig) ->
    ExtraApps = [],
    ok = emqx_common_test_helpers:start_apps(ExtraApps),
    SuiteConfig.
%% Suite teardown: mirror of init_per_suite/1 (stop_apps with no extra apps).
end_per_suite(_SuiteConfig) ->
    ExtraApps = [],
    ok = emqx_common_test_helpers:stop_apps(ExtraApps),
    ok.
%% Per-case setup: start a two-node cluster (a "donor" on port 2883 and a
%% "recipient" on port 3883) with ?START_APPS, start snabbkaffe tracing and
%% store the cluster description under `cluster_nodes'.
init_per_testcase(Case, Config) ->
    NodeSpecs = [
        {case_specific_node_name(?MODULE, Case, '_donor'), 2883},
        {case_specific_node_name(?MODULE, Case, '_recipient'), 3883}
    ],
    Cluster = emqx_eviction_agent_test_helpers:start_cluster(NodeSpecs, ?START_APPS),
    ok = snabbkaffe:start_trace(),
    [{cluster_nodes, Cluster} | Config].
%% Per-case teardown: stop snabbkaffe first, then the cluster that
%% init_per_testcase/2 stored under `cluster_nodes'.
end_per_testcase(_Case, Config) ->
    ok = snabbkaffe:stop(),
    Cluster = ?config(cluster_nodes, Config),
    ok = emqx_eviction_agent_test_helpers:stop_cluster(Cluster, ?START_APPS).
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% Happy-path rebalance: connects 500 clients to the donor node, starts a
%% rebalance on it and waits for the `emqx_node_rebalance_evict_sess_over'
%% trace event. Afterwards the donor may exceed the recipient by at most 50
%% in both connection count and disconnected-session count.
t_rebalance(Config) ->
process_flag(trap_exit, true),
[{DonorNode, DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
Nodes = [DonorNode, RecipientNode],
Conns = emqtt_connect_many(DonorPort, 500),
Opts = #{
conn_evict_rate => 10,
sess_evict_rate => 10,
evict_interval => 10,
abs_conn_threshold => 50,
abs_sess_threshold => 50,
rel_conn_threshold => 1.0,
rel_sess_threshold => 1.0,
wait_health_check => 0.01,
wait_takeover => 0.01,
nodes => Nodes
},
%% Start the rebalance and wait for the end-of-session-eviction event
%% (timeout argument 10000, presumably milliseconds).
?assertWaitEvent(
ok = rpc:call(DonorNode, emqx_node_rebalance, start, [Opts]),
#{?snk_kind := emqx_node_rebalance_evict_sess_over},
10000
),
%% Collect the resulting counters from both nodes.
DonorConnCount = rpc:call(DonorNode, emqx_eviction_agent, connection_count, []),
DonorSessCount = rpc:call(DonorNode, emqx_eviction_agent, session_count, []),
DonorDSessCount = rpc:call(DonorNode, emqx_eviction_agent, session_count, [disconnected]),
RecipientConnCount = rpc:call(RecipientNode, emqx_eviction_agent, connection_count, []),
RecipientSessCount = rpc:call(RecipientNode, emqx_eviction_agent, session_count, []),
RecipientDSessCount = rpc:call(RecipientNode, emqx_eviction_agent, session_count, [disconnected]),
ct:pal(
"Donor: conn=~p, sess=~p, dsess=~p",
[DonorConnCount, DonorSessCount, DonorDSessCount]
),
ct:pal(
"Recipient: conn=~p, sess=~p, dsess=~p",
[RecipientConnCount, RecipientSessCount, RecipientDSessCount]
),
%% 50 is the abs_conn_threshold / abs_sess_threshold configured above.
?assert(DonorConnCount - 50 =< RecipientConnCount),
?assert(DonorDSessCount - 50 =< RecipientDSessCount),
ok = stop_many(Conns).
%% Starts a rebalance on the donor node and stops the recipient node right
%% after; once the `emqx_node_rebalance_started' trace event has been seen,
%% the donor's rebalance status is expected to be `disabled'.
t_rebalance_node_crash(Config) ->
process_flag(trap_exit, true),
[{DonorNode, DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
Nodes = [DonorNode, RecipientNode],
Conns = emqtt_connect_many(DonorPort, 500),
Opts = #{
conn_evict_rate => 10,
sess_evict_rate => 10,
evict_interval => 10,
abs_conn_threshold => 50,
abs_sess_threshold => 50,
rel_conn_threshold => 1.0,
rel_sess_threshold => 1.0,
wait_health_check => 0.01,
wait_takeover => 0.01,
nodes => Nodes
},
?assertWaitEvent(
begin
ok = rpc:call(DonorNode, emqx_node_rebalance, start, [Opts]),
%% Take the recipient down while the rebalance is in progress.
emqx_common_test_helpers:stop_slave(RecipientNode)
end,
#{?snk_kind := emqx_node_rebalance_started},
1000
),
?assertEqual(
disabled,
rpc:call(DonorNode, emqx_node_rebalance, status, [])
),
ok = stop_many(Conns).
%% Starting a rebalance must be refused with `{error, nothing_to_balance}'
%% both with no clients at all and with 50 clients on the donor
%% (50 equals the abs_conn_threshold used in Opts).
t_no_need_to_rebalance(Config) ->
process_flag(trap_exit, true),
[{DonorNode, DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
Nodes = [DonorNode, RecipientNode],
Opts = #{
conn_evict_rate => 10,
sess_evict_rate => 10,
evict_interval => 10,
abs_conn_threshold => 50,
abs_sess_threshold => 50,
rel_conn_threshold => 1.0,
rel_sess_threshold => 1.0,
wait_health_check => 0.01,
wait_takeover => 0.01,
nodes => Nodes
},
%% No clients connected yet.
?assertEqual(
{error, nothing_to_balance},
rpc:call(DonorNode, emqx_node_rebalance, start, [Opts])
),
%% 50 clients on the donor: still nothing to balance.
Conns = emqtt_connect_many(DonorPort, 50),
?assertEqual(
{error, nothing_to_balance},
rpc:call(DonorNode, emqx_node_rebalance, start, [Opts])
),
ok = stop_many(Conns).
%% Sends an unknown info message, cast and call to the emqx_node_rebalance
%% process on the donor node, first before and then after a rebalance has
%% been started. In both states the call must be answered with `ignored'.
t_unknown_mesages(Config) ->
process_flag(trap_exit, true),
[{DonorNode, DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
Nodes = [DonorNode, RecipientNode],
Conns = emqtt_connect_many(DonorPort, 500),
Opts = #{
wait_health_check => 100,
abs_conn_threshold => 50,
nodes => Nodes
},
%% Pid of the registered emqx_node_rebalance process on the donor node.
Pid = rpc:call(DonorNode, erlang, whereis, [emqx_node_rebalance]),
Pid ! unknown,
ok = gen_server:cast(Pid, unknown),
?assertEqual(
ignored,
gen_server:call(Pid, unknown)
),
%% Same three probes again, now with a rebalance started.
ok = rpc:call(DonorNode, emqx_node_rebalance, start, [Opts]),
Pid ! unknown,
ok = gen_server:cast(Pid, unknown),
?assertEqual(
ignored,
gen_server:call(Pid, unknown)
),
ok = stop_many(Conns).
%% A node whose eviction agent is already enabled must be left out of the
%% list returned by emqx_node_rebalance:available_nodes/1.
t_available_nodes(Config) ->
    [{DonorNode, _DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config),
    %% Enable the eviction agent on RecipientNode so that the node is
    %% "occupied" and therefore not available for rebalance.
    ok = rpc:call(RecipientNode, emqx_eviction_agent, enable, [test_rebalance, undefined]),
    Candidates = [DonorNode, RecipientNode],
    Available = rpc:call(DonorNode, emqx_node_rebalance, available_nodes, [Candidates]),
    %% Only DonorNode should be available, since RecipientNode is "occupied".
    ?assertEqual([DonorNode], Available).

View File

@ -0,0 +1,214 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_agent_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("emqx/include/emqx.hrl").
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-import(
emqx_eviction_agent_test_helpers,
[case_specific_node_name/2]
).
%% Common Test callback: the suite consists of the `local' and `cluster'
%% groups, in that order.
all() ->
    [{group, GroupName} || GroupName <- [local, cluster]].
%% Common Test callback: `local' cases run against the current node only,
%% `cluster' cases get an extra node from init_per_testcase/2.
groups() ->
    LocalCases = [
        t_enable_disable,
        t_enable_egent_busy,
        t_unknown_messages
    ],
    ClusterCases = [
        t_rebalance_agent_coordinator_fail,
        t_rebalance_agent_fail
    ],
    [
        {local, [], LocalCases},
        {cluster, [], ClusterCases}
    ].
%% Suite setup: start the eviction agent and node rebalance applications
%% through the common test helper.
init_per_suite(SuiteConfig) ->
    Apps = [emqx_eviction_agent, emqx_node_rebalance],
    ok = emqx_common_test_helpers:start_apps(Apps),
    SuiteConfig.
%% Suite teardown: stop the applications started by init_per_suite/1.
end_per_suite(_SuiteConfig) ->
    Apps = [emqx_eviction_agent, emqx_node_rebalance],
    ok = emqx_common_test_helpers:stop_apps(Apps),
    ok.
%% Group setup: record under the `cluster' key whether the cases of this
%% group need an extra cluster node (`true' only for the `cluster' group).
init_per_group(Group, Config) when Group =:= local; Group =:= cluster ->
    NeedsCluster = (Group =:= cluster),
    [{cluster, NeedsCluster} | Config].
%% Group teardown: nothing to clean up.
end_per_group(_GroupName, _GroupConfig) ->
    ok.
%% Per-case setup: for cases of the `cluster' group start a one-node cluster
%% (port 2883) and store it under `cluster_nodes'; for local cases leave the
%% config as it is.
init_per_testcase(Case, Config) ->
    NeedsCluster = ?config(cluster, Config),
    case NeedsCluster of
        false ->
            Config;
        true ->
            NodeSpecs = [{case_specific_node_name(?MODULE, Case), 2883}],
            Apps = [emqx_eviction_agent, emqx_node_rebalance],
            Cluster = emqx_eviction_agent_test_helpers:start_cluster(NodeSpecs, Apps),
            [{cluster_nodes, Cluster} | Config]
    end.
%% Per-case teardown: stop the cluster started by init_per_testcase/2 for
%% cases of the `cluster' group; local cases need no cleanup.
end_per_testcase(_Case, Config) ->
    NeedsCluster = ?config(cluster, Config),
    case NeedsCluster of
        false ->
            ok;
        true ->
            Cluster = ?config(cluster_nodes, Config),
            Apps = [emqx_eviction_agent, emqx_node_rebalance],
            emqx_eviction_agent_test_helpers:stop_cluster(Cluster, Apps)
    end.
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% Local tests
%% Walks emqx_node_rebalance_agent through its enable/disable life cycle on
%% the local node and checks the reply and status at every step, including
%% the error replies for a second enable, a disable from a different pid and
%% a second disable.
t_enable_disable(_Config) ->
?assertEqual(
disabled,
emqx_node_rebalance_agent:status()
),
?assertEqual(
ok,
emqx_node_rebalance_agent:enable(self())
),
%% Enabling twice is rejected.
?assertEqual(
{error, already_enabled},
emqx_node_rebalance_agent:enable(self())
),
?assertEqual(
{enabled, self()},
emqx_node_rebalance_agent:status()
),
%% Only the pid that enabled the agent may disable it.
?assertEqual(
{error, invalid_coordinator},
emqx_node_rebalance_agent:disable(spawn_link(fun() -> ok end))
),
?assertEqual(
ok,
emqx_node_rebalance_agent:disable(self())
),
%% Disabling twice is rejected.
?assertEqual(
{error, already_disabled},
emqx_node_rebalance_agent:disable(self())
),
?assertEqual(
disabled,
emqx_node_rebalance_agent:status()
).
%% While the eviction agent is enabled under another kind (`rebalance_test'),
%% enabling the rebalance agent must fail with `eviction_agent_busy'.
t_enable_egent_busy(_Config) ->
    ok = emqx_eviction_agent:enable(rebalance_test, undefined),
    EnableResult = emqx_node_rebalance_agent:enable(self()),
    ?assertEqual({error, eviction_agent_busy}, EnableResult),
    ok = emqx_eviction_agent:disable(rebalance_test).
%% The agent must cope with an unknown cast and info message, and must
%% answer an unknown call with `ignored'.
t_unknown_messages(_Config) ->
    AgentPid = whereis(emqx_node_rebalance_agent),
    ok = gen_server:cast(AgentPid, unknown),
    AgentPid ! unknown,
    ignored = gen_server:call(AgentPid, unknown).
%% Cluster tests
% The following tests verify that emqx_node_rebalance_agent correctly links
% coordinator process with emqx_eviction_agent-s.
%% Enables the rebalance agent on the cluster node on behalf of a local
%% coordinator process, then kills the coordinator and expects the remote
%% emqx_eviction_agent process to exit within 1000 ms.
t_rebalance_agent_coordinator_fail(Config) ->
process_flag(trap_exit, true),
[{Node, _}] = ?config(cluster_nodes, Config),
%% Coordinator stand-in: blocks until it receives `done' (never sent here).
CoordinatorPid = spawn_link(
fun() ->
receive
done -> ok
end
end
),
?assertEqual(
disabled,
rpc:call(Node, emqx_eviction_agent, status, [])
),
?assertEqual(
ok,
rpc:call(Node, emqx_node_rebalance_agent, enable, [CoordinatorPid])
),
?assertMatch(
{enabled, _},
rpc:call(Node, emqx_eviction_agent, status, [])
),
%% Link to the remote eviction agent; with trap_exit set above, its exit
%% arrives here as an 'EXIT' message.
EvictionAgentPid = rpc:call(Node, erlang, whereis, [emqx_eviction_agent]),
true = link(EvictionAgentPid),
true = exit(CoordinatorPid, kill),
receive
{'EXIT', EvictionAgentPid, _} -> true
after 1000 ->
ct:fail("emqx_eviction_agent did not exit")
end.
%% Enables the rebalance agent on the cluster node on behalf of a local
%% coordinator process, then kills the remote emqx_eviction_agent and
%% expects the coordinator process to exit within 1000 ms.
t_rebalance_agent_fail(Config) ->
process_flag(trap_exit, true),
[{Node, _}] = ?config(cluster_nodes, Config),
%% Coordinator stand-in: blocks until it receives `done' (never sent here).
%% It is linked to the test process, so its exit arrives as an 'EXIT' message.
CoordinatorPid = spawn_link(
fun() ->
receive
done -> ok
end
end
),
?assertEqual(
ok,
rpc:call(Node, emqx_node_rebalance_agent, enable, [CoordinatorPid])
),
EvictionAgentPid = rpc:call(Node, erlang, whereis, [emqx_eviction_agent]),
true = exit(EvictionAgentPid, kill),
receive
{'EXIT', CoordinatorPid, _} -> true
after 1000 ->
ct:fail("emqx_node_rebalance_agent did not exit")
end.

View File

@ -0,0 +1,444 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_api_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-import(
emqx_mgmt_api_test_util,
[
request/2,
request/3,
uri/1
]
).
-import(
emqx_eviction_agent_test_helpers,
[emqtt_connect_many/2, stop_many/1, case_specific_node_name/3]
).
-define(START_APPS, [emqx_eviction_agent, emqx_node_rebalance]).
%% Common Test callback: the test case list is computed by
%% emqx_common_test_helpers:all/1 for this suite module.
all() ->
    Suite = ?MODULE,
    emqx_common_test_helpers:all(Suite).
%% Suite setup: start ?START_APPS through the common test helper and pass
%% the CT config through untouched.
init_per_suite(SuiteConfig) ->
    Apps = ?START_APPS,
    ok = emqx_common_test_helpers:start_apps(Apps),
    SuiteConfig.
%% Suite teardown: stop the applications started by init_per_suite/1.
end_per_suite(_SuiteConfig) ->
    Apps = ?START_APPS,
    ok = emqx_common_test_helpers:stop_apps(Apps),
    ok.
%% Per-case setup: start a donor (port 2883) / recipient (port 3883) cluster
%% with a case-specific emqx data_dir, initialise the management API test
%% utilities on the donor and make local requests use the donor's auth header.
init_per_testcase(Case, Config) ->
    NodeSpecs = [
        {case_specific_node_name(?MODULE, Case, '_donor'), 2883},
        {case_specific_node_name(?MODULE, Case, '_recipient'), 3883}
    ],
    Env = [{emqx, data_dir, case_specific_data_dir(Case, Config)}],
    ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster(NodeSpecs, ?START_APPS, Env),
    [{DonorNode, _} | _] = ClusterNodes,
    ok = rpc:call(DonorNode, emqx_mgmt_api_test_util, init_suite, []),
    ok = take_auth_header_from(DonorNode),
    [{cluster_nodes, ClusterNodes} | Config].
%% Per-case teardown: stop the cluster stored under `cluster_nodes'.
%% The result of stop_cluster/2 is returned as is.
end_per_testcase(_Case, Config) ->
    Cluster = ?config(cluster_nodes, Config),
    _ = emqx_eviction_agent_test_helpers:stop_cluster(Cluster, ?START_APPS).
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% Request validation of POST load_rebalance/<node>/evacuation/start:
%% every malformed option set must yield HTTP 400, an unknown node 404 and
%% a valid request 200, after which the donor shows up in the global
%% status list of evacuations.
t_start_evacuation_validation(Config) ->
[{DonorNode, _}, {RecipientNode, _}] = ?config(cluster_nodes, Config),
%% Each map is posted on its own and is expected to be rejected.
BadOpts = [
#{conn_evict_rate => <<"conn">>},
#{sess_evict_rate => <<"sess">>},
#{redirect_to => 123},
#{wait_takeover => <<"wait">>},
#{migrate_to => []},
#{migrate_to => <<"migrate_to">>},
#{migrate_to => [<<"bad_node">>]},
#{migrate_to => [<<"bad_node">>, atom_to_binary(DonorNode)]},
#{unknown => <<"Value">>}
],
lists:foreach(
fun(Opts) ->
?assertMatch(
{ok, 400, #{}},
api_post(
["load_rebalance", atom_to_list(DonorNode), "evacuation", "start"],
Opts
)
)
end,
BadOpts
),
%% Unknown node in the path.
?assertMatch(
{ok, 404, #{}},
api_post(
["load_rebalance", "bad@node", "evacuation", "start"],
#{}
)
),
%% Fully valid request.
?assertMatch(
{ok, 200, #{}},
api_post(
["load_rebalance", atom_to_list(DonorNode), "evacuation", "start"],
#{
conn_evict_rate => 10,
sess_evict_rate => 10,
wait_takeover => 10,
redirect_to => <<"srv">>,
migrate_to => [atom_to_binary(RecipientNode)]
}
)
),
%% DonorNodeBin is bound here so that the pattern below matches on its value.
DonorNodeBin = atom_to_binary(DonorNode),
?assertMatch(
{ok, 200, #{<<"evacuations">> := [#{<<"node">> := DonorNodeBin}]}},
api_get(["load_rebalance", "global_status"])
).
%% Request validation of POST load_rebalance/<node>/start: every malformed
%% option set must yield HTTP 400, an unknown node 404 and a valid request
%% (issued with 50 clients connected to the donor) 200, after which the
%% donor shows up in the global status list of rebalances.
t_start_rebalance_validation(Config) ->
process_flag(trap_exit, true),
[{DonorNode, DonorPort}, {RecipientNode, _}] = ?config(cluster_nodes, Config),
%% Each map is posted on its own and is expected to be rejected.
BadOpts = [
#{conn_evict_rate => <<"conn">>},
#{sess_evict_rate => <<"sess">>},
#{abs_conn_threshold => <<"act">>},
#{rel_conn_threshold => <<"rct">>},
#{abs_sess_threshold => <<"act">>},
#{rel_sess_threshold => <<"rct">>},
#{wait_takeover => <<"wait">>},
#{wait_health_check => <<"wait">>},
#{nodes => <<"nodes">>},
#{nodes => []},
#{nodes => [<<"bad_node">>]},
#{nodes => [<<"bad_node">>, atom_to_binary(DonorNode)]},
#{unknown => <<"Value">>}
],
lists:foreach(
fun(Opts) ->
?assertMatch(
{ok, 400, #{}},
api_post(
["load_rebalance", atom_to_list(DonorNode), "start"],
Opts
)
)
end,
BadOpts
),
%% Unknown node in the path.
?assertMatch(
{ok, 404, #{}},
api_post(
["load_rebalance", "bad@node", "start"],
#{}
)
),
%% Put some load on the donor before issuing the valid request.
Conns = emqtt_connect_many(DonorPort, 50),
?assertMatch(
{ok, 200, #{}},
api_post(
["load_rebalance", atom_to_list(DonorNode), "start"],
#{
conn_evict_rate => 10,
sess_evict_rate => 10,
wait_takeover => 10,
wait_health_check => 10,
abs_conn_threshold => 10,
rel_conn_threshold => 1.001,
abs_sess_threshold => 10,
rel_sess_threshold => 1.001,
nodes => [
atom_to_binary(DonorNode),
atom_to_binary(RecipientNode)
]
}
)
),
%% DonorNodeBin is bound here so that the pattern below matches on its value.
DonorNodeBin = atom_to_binary(DonorNode),
?assertMatch(
{ok, 200, #{<<"rebalances">> := [#{<<"node">> := DonorNodeBin}]}},
api_get(["load_rebalance", "global_status"])
),
ok = stop_many(Conns).
%% Full evacuation round trip over the HTTP API: start an evacuation on the
%% donor (example options, migrating to the recipient), check the local and
%% global status responses, stop the evacuation and check that both status
%% endpoints report nothing running.
t_start_stop_evacuation(Config) ->
[{DonorNode, _}, {RecipientNode, _}] = ?config(cluster_nodes, Config),
%% The API example options with `migrate_to' pointed at the recipient node.
StartOpts = maps:merge(
emqx_node_rebalance_api:rebalance_evacuation_example(),
#{migrate_to => [atom_to_binary(RecipientNode)]}
),
?assertMatch(
{ok, 200, #{}},
api_post(
["load_rebalance", atom_to_list(DonorNode), "evacuation", "start"],
StartOpts
)
),
%% Local status, checked after translation by emqx_node_rebalance_api.
StatusResponse = api_get(["load_rebalance", "status"]),
?assertMatch(
{ok, 200, _},
StatusResponse
),
{ok, 200, Status} = StatusResponse,
?assertMatch(
#{
process := evacuation,
connection_eviction_rate := 100,
session_eviction_rate := 100,
connection_goal := 0,
session_goal := 0,
stats := #{
initial_connected := _,
current_connected := _,
initial_sessions := _,
current_sessions := _
}
},
emqx_node_rebalance_api:translate(local_status_enabled, Status)
),
%% Global status: exactly one evacuation (the donor) and no rebalances.
DonorNodeBin = atom_to_binary(DonorNode),
GlobalStatusResponse = api_get(["load_rebalance", "global_status"]),
?assertMatch(
{ok, 200, _},
GlobalStatusResponse
),
{ok, 200, GlobalStatus} = GlobalStatusResponse,
?assertMatch(
#{
rebalances := [],
evacuations := [
#{
node := DonorNodeBin,
connection_eviction_rate := 100,
session_eviction_rate := 100,
connection_goal := 0,
session_goal := 0,
stats := #{
initial_connected := _,
current_connected := _,
initial_sessions := _,
current_sessions := _
}
}
]
},
emqx_node_rebalance_api:translate(global_status, GlobalStatus)
),
%% Stop the evacuation and verify that nothing is reported as running.
?assertMatch(
{ok, 200, #{}},
api_post(
["load_rebalance", atom_to_list(DonorNode), "evacuation", "stop"],
#{}
)
),
?assertMatch(
{ok, 200, #{<<"status">> := <<"disabled">>}},
api_get(["load_rebalance", "status"])
),
?assertMatch(
{ok, 200, #{<<"evacuations">> := [], <<"rebalances">> := []}},
api_get(["load_rebalance", "global_status"])
).
%% Full rebalance round trip over the HTTP API: with 100 clients on the
%% donor, start a rebalance (example options without `nodes'), check the
%% local and global status responses (raw JSON and translated), stop the
%% rebalance and check that both status endpoints report nothing running.
t_start_stop_rebalance(Config) ->
process_flag(trap_exit, true),
[{DonorNode, DonorPort}, {RecipientNode, _}] = ?config(cluster_nodes, Config),
?assertMatch(
{ok, 200, #{<<"status">> := <<"disabled">>}},
api_get(["load_rebalance", "status"])
),
Conns = emqtt_connect_many(DonorPort, 100),
%% The API example options with the `nodes' key removed.
StartOpts = maps:without(
[nodes],
emqx_node_rebalance_api:rebalance_example()
),
?assertMatch(
{ok, 200, #{}},
api_post(
["load_rebalance", atom_to_list(DonorNode), "start"],
StartOpts
)
),
%% Local status, checked after translation by emqx_node_rebalance_api.
StatusResponse = api_get(["load_rebalance", "status"]),
?assertMatch(
{ok, 200, _},
StatusResponse
),
{ok, 200, Status} = StatusResponse,
?assertMatch(
#{process := rebalance, connection_eviction_rate := 10, session_eviction_rate := 20},
emqx_node_rebalance_api:translate(local_status_enabled, Status)
),
%% Bound here so that the patterns below match on their values.
DonorNodeBin = atom_to_binary(DonorNode),
RecipientNodeBin = atom_to_binary(RecipientNode),
GlobalStatusResponse = api_get(["load_rebalance", "global_status"]),
?assertMatch(
{ok, 200, _},
GlobalStatusResponse
),
{ok, 200, GlobalStatus} = GlobalStatusResponse,
%% Global status as raw decoded JSON (binary keys).
?assertMatch(
{ok, 200, #{
<<"evacuations">> := [],
<<"rebalances">> :=
[
#{
<<"state">> := _,
<<"node">> := DonorNodeBin,
<<"coordinator_node">> := _,
<<"connection_eviction_rate">> := 10,
<<"session_eviction_rate">> := 20,
<<"donors">> := [DonorNodeBin],
<<"recipients">> := [RecipientNodeBin]
}
]
}},
GlobalStatusResponse
),
%% The same global status after translation (atom keys).
?assertMatch(
#{
evacuations := [],
rebalances := [
#{
state := _,
node := DonorNodeBin,
coordinator_node := _,
connection_eviction_rate := 10,
session_eviction_rate := 20,
donors := [DonorNodeBin],
recipients := [RecipientNodeBin]
}
]
},
emqx_node_rebalance_api:translate(global_status, GlobalStatus)
),
%% Stop the rebalance and verify that nothing is reported as running.
?assertMatch(
{ok, 200, #{}},
api_post(
["load_rebalance", atom_to_list(DonorNode), "stop"],
#{}
)
),
?assertMatch(
{ok, 200, #{<<"status">> := <<"disabled">>}},
api_get(["load_rebalance", "status"])
),
?assertMatch(
{ok, 200, #{<<"evacuations">> := [], <<"rebalances">> := []}},
api_get(["load_rebalance", "global_status"])
),
ok = stop_many(Conns).
%% The availability_check endpoint must answer 200 while no evacuation is
%% running on the donor node, and 503 while one is.
t_availability_check(Config) ->
    [{DonorNode, _} | _] = ?config(cluster_nodes, Config),
    CheckPath = ["load_rebalance", "availability_check"],
    ?assertMatch({ok, 200, #{}}, api_get(CheckPath)),
    ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [#{}]),
    ?assertMatch({ok, 503, _}, api_get(CheckPath)),
    ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, stop, []),
    ?assertMatch({ok, 200, #{}}, api_get(CheckPath)).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% Issue a GET request for `Path' and decode the response body from JSON
%% into maps. Request errors are returned unchanged.
api_get(Path) ->
    Url = uri(Path),
    case request(get, Url) of
        {error, _} = Failure ->
            Failure;
        {ok, Status, RawBody} ->
            {ok, Status, jiffy:decode(RawBody, [return_maps])}
    end.
%% Issue a POST request with body `Data' for `Path' and decode the response
%% body from JSON into maps. Request errors are returned unchanged.
api_post(Path, Data) ->
    Url = uri(Path),
    case request(post, Url, Data) of
        {error, _} = Failure ->
            Failure;
        {ok, Status, RawBody} ->
            {ok, Status, jiffy:decode(RawBody, [return_maps])}
    end.
%% Mock emqx_common_test_http (passthrough) so that default_auth_header/0
%% is answered by asking `Node' for its default auth header over rpc.
take_auth_header_from(Node) ->
    RemoteAuthHeader =
        fun() ->
            rpc:call(Node, emqx_common_test_http, default_auth_header, [])
        end,
    meck:new(emqx_common_test_http, [passthrough]),
    meck:expect(emqx_common_test_http, default_auth_header, RemoteAuthHeader),
    ok.
%% Directory named after the test case inside the CT `priv_dir', or
%% `undefined' when the config carries no `priv_dir'.
case_specific_data_dir(Case, Config) ->
    PrivDir = ?config(priv_dir, Config),
    case PrivDir of
        undefined ->
            undefined;
        _ ->
            CaseDirName = atom_to_list(Case),
            filename:join(PrivDir, CaseDirName)
    end.

View File

@ -0,0 +1,291 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_cli_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-import(
emqx_eviction_agent_test_helpers,
[emqtt_connect_many/2, stop_many/1, case_specific_node_name/3]
).
-define(START_APPS, [emqx_eviction_agent, emqx_node_rebalance]).
%% Common Test callback: the test case list is computed by
%% emqx_common_test_helpers:all/1 for this suite module.
all() ->
    Suite = ?MODULE,
    emqx_common_test_helpers:all(Suite).
%% Suite setup: start ?START_APPS through the common test helper (its
%% result is not checked) and pass the CT config through untouched.
init_per_suite(SuiteConfig) ->
    Apps = ?START_APPS,
    emqx_common_test_helpers:start_apps(Apps),
    SuiteConfig.
%% Suite teardown: stop ?START_APPS in reverse start order (the result is
%% not checked) and return the CT config.
end_per_suite(SuiteConfig) ->
    AppsInStopOrder = lists:reverse(?START_APPS),
    emqx_common_test_helpers:stop_apps(AppsInStopOrder),
    SuiteConfig.
%% Per-case setup. `t_rebalance' stops any local evacuation and gets a
%% donor (port 2883) / recipient (port 3883) cluster stored under
%% `cluster_nodes'; every other case only stops any local evacuation and
%% rebalance left over from earlier cases.
init_per_testcase(t_rebalance = Case, Config) ->
    _ = emqx_node_rebalance_evacuation:stop(),
    NodeSpecs = [
        {case_specific_node_name(?MODULE, Case, '_donor'), 2883},
        {case_specific_node_name(?MODULE, Case, '_recipient'), 3883}
    ],
    Cluster = emqx_eviction_agent_test_helpers:start_cluster(NodeSpecs, ?START_APPS),
    [{cluster_nodes, Cluster} | Config];
init_per_testcase(_OtherCase, Config) ->
    _ = emqx_node_rebalance_evacuation:stop(),
    _ = emqx_node_rebalance:stop(),
    Config.
%% Per-case teardown: always stop any local evacuation and rebalance;
%% `t_rebalance' additionally stops the cluster stored under `cluster_nodes'.
end_per_testcase(t_rebalance, Config) ->
    _ = emqx_node_rebalance_evacuation:stop(),
    _ = emqx_node_rebalance:stop(),
    Cluster = ?config(cluster_nodes, Config),
    _ = emqx_eviction_agent_test_helpers:stop_cluster(Cluster, ?START_APPS);
end_per_testcase(_OtherCase, _Config) ->
    _ = emqx_node_rebalance_evacuation:stop(),
    _ = emqx_node_rebalance:stop().
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% Drives the evacuation part of emqx_node_rebalance_cli:cli/1 on the local
%% node: usage and status commands return `ok', malformed `start
%% --evacuation' invocations return `false', a valid one returns `true'
%% and enables the evacuation, a second start is refused, and `stop'
%% succeeds exactly once.
t_evacuation(_Config) ->
%% usage
ok = emqx_node_rebalance_cli:cli(["foobar"]),
%% status
ok = emqx_node_rebalance_cli:cli(["status"]),
ok = emqx_node_rebalance_cli:cli(["node-status"]),
ok = emqx_node_rebalance_cli:cli(["node-status", atom_to_list(node())]),
%% start with invalid args
?assertNot(
emqx_node_rebalance_cli:cli(["start", "--evacuation", "--foo-bar"])
),
?assertNot(
emqx_node_rebalance_cli:cli(["start", "--evacuation", "--conn-evict-rate", "foobar"])
),
?assertNot(
emqx_node_rebalance_cli:cli(["start", "--evacuation", "--sess-evict-rate", "foobar"])
),
?assertNot(
emqx_node_rebalance_cli:cli(["start", "--evacuation", "--wait-takeover", "foobar"])
),
?assertNot(
emqx_node_rebalance_cli:cli([
"start",
"--evacuation",
"--migrate-to",
"nonexistent@node"
])
),
?assertNot(
emqx_node_rebalance_cli:cli([
"start",
"--evacuation",
"--migrate-to",
""
])
),
?assertNot(
emqx_node_rebalance_cli:cli([
"start",
"--evacuation",
"--unknown-arg"
])
),
%% start with valid args
?assert(
emqx_node_rebalance_cli:cli([
"start",
"--evacuation",
"--conn-evict-rate",
"10",
"--sess-evict-rate",
"10",
"--wait-takeover",
"10",
"--migrate-to",
atom_to_list(node()),
"--redirect-to",
"srv"
])
),
%% status
ok = emqx_node_rebalance_cli:cli(["status"]),
ok = emqx_node_rebalance_cli:cli(["node-status"]),
ok = emqx_node_rebalance_cli:cli(["node-status", atom_to_list(node())]),
?assertMatch(
{enabled, #{}},
emqx_node_rebalance_evacuation:status()
),
%% already enabled
?assertNot(
emqx_node_rebalance_cli:cli([
"start",
"--evacuation",
"--conn-evict-rate",
"10",
"--redirect-to",
"srv"
])
),
%% stop
%% (the second stop returns false: nothing is running any more)
true = emqx_node_rebalance_cli:cli(["stop"]),
false = emqx_node_rebalance_cli:cli(["stop"]),
?assertEqual(
disabled,
emqx_node_rebalance_evacuation:status()
).
%% Drives the rebalance part of the CLI on the donor node (over rpc through
%% emqx_node_rebalance_cli/2): malformed `start' invocations return `false',
%% a valid one (issued with 20 clients connected to the donor) returns
%% `true' and enables the rebalance, status commands return `ok', a second
%% start is refused, and `stop' succeeds exactly once.
t_rebalance(Config) ->
process_flag(trap_exit, true),
[{DonorNode, DonorPort}, {RecipientNode, _}] = ?config(cluster_nodes, Config),
%% start with invalid args
?assertNot(
emqx_node_rebalance_cli(DonorNode, ["start", "--foo-bar"])
),
?assertNot(
emqx_node_rebalance_cli(DonorNode, ["start", "--conn-evict-rate", "foobar"])
),
?assertNot(
emqx_node_rebalance_cli(DonorNode, ["start", "--abs-conn-threshold", "foobar"])
),
?assertNot(
emqx_node_rebalance_cli(DonorNode, ["start", "--rel-conn-threshold", "foobar"])
),
?assertNot(
emqx_node_rebalance_cli(DonorNode, ["start", "--sess-evict-rate", "foobar"])
),
?assertNot(
emqx_node_rebalance_cli(DonorNode, ["start", "--abs-sess-threshold", "foobar"])
),
?assertNot(
emqx_node_rebalance_cli(DonorNode, ["start", "--rel-sess-threshold", "foobar"])
),
?assertNot(
emqx_node_rebalance_cli(DonorNode, ["start", "--wait-takeover", "foobar"])
),
?assertNot(
emqx_node_rebalance_cli(DonorNode, ["start", "--wait-health-check", "foobar"])
),
?assertNot(
emqx_node_rebalance_cli(DonorNode, [
"start",
"--nodes",
"nonexistent@node"
])
),
?assertNot(
emqx_node_rebalance_cli(DonorNode, [
"start",
"--nodes",
""
])
),
%% a node list naming only the recipient is rejected as well
?assertNot(
emqx_node_rebalance_cli(DonorNode, [
"start",
"--nodes",
atom_to_list(RecipientNode)
])
),
?assertNot(
emqx_node_rebalance_cli(DonorNode, [
"start",
"--unknown-arg"
])
),
%% start with valid args, with some load on the donor
Conns = emqtt_connect_many(DonorPort, 20),
?assert(
emqx_node_rebalance_cli(DonorNode, [
"start",
"--conn-evict-rate",
"10",
"--abs-conn-threshold",
"10",
"--rel-conn-threshold",
"1.1",
"--sess-evict-rate",
"10",
"--abs-sess-threshold",
"10",
"--rel-sess-threshold",
"1.1",
"--wait-takeover",
"10",
"--nodes",
atom_to_list(DonorNode) ++ "," ++
atom_to_list(RecipientNode)
])
),
%% status
ok = emqx_node_rebalance_cli(DonorNode, ["status"]),
ok = emqx_node_rebalance_cli(DonorNode, ["node-status"]),
ok = emqx_node_rebalance_cli(DonorNode, ["node-status", atom_to_list(DonorNode)]),
?assertMatch(
{enabled, #{}},
rpc:call(DonorNode, emqx_node_rebalance, status, [])
),
%% already enabled
?assertNot(
emqx_node_rebalance_cli(DonorNode, ["start"])
),
%% stop
%% (the second stop returns false: nothing is running any more)
true = emqx_node_rebalance_cli(DonorNode, ["stop"]),
false = emqx_node_rebalance_cli(DonorNode, ["stop"]),
?assertEqual(
disabled,
rpc:call(DonorNode, emqx_node_rebalance, status, [])
),
ok = stop_many(Conns).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% Run emqx_node_rebalance_cli:cli(Args) on `Node' over rpc and return its
%% result. A failed rpc (`{badrpc, Reason}') is raised as error(Reason).
emqx_node_rebalance_cli(Node, Args) ->
    Reply = rpc:call(Node, emqx_node_rebalance_cli, cli, [Args]),
    case Reply of
        {badrpc, Reason} ->
            error(Reason);
        _ ->
            Reply
    end.

View File

@ -0,0 +1,270 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_evacuation_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("emqx/include/asserts.hrl").
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-import(
emqx_eviction_agent_test_helpers,
[emqtt_connect/1, emqtt_try_connect/1, case_specific_node_name/3]
).
%% Common Test entry point: the suite is made of the `one_node' and
%% `two_node' groups, in that order.
all() ->
    [{group, Group} || Group <- [one_node, two_node]].
%% Group definitions: each group has no properties (`[]') and takes its
%% cases from one_node_cases/0 and two_node_cases/0 respectively.
groups() ->
    OneNode = {one_node, [], one_node_cases()},
    TwoNode = {two_node, [], two_node_cases()},
    [OneNode, TwoNode].
%% Test cases that belong to the `two_node' group.
two_node_cases() ->
    [t_conn_evicted, t_migrate_to, t_session_evicted].
%% Test cases of the `one_node' group: everything returned by
%% emqx_common_test_helpers:all/1 for this module, minus two_node_cases/0.
one_node_cases() ->
    SuiteCases = emqx_common_test_helpers:all(?MODULE),
    lists:subtract(SuiteCases, two_node_cases()).
%% Suite setup: calls emqx_common_test_helpers:start_apps/1 with an empty
%% application list and returns Config unchanged.
init_per_suite(Config) ->
    NoExtraApps = [],
    ok = emqx_common_test_helpers:start_apps(NoExtraApps),
    Config.
%% Suite teardown: the counterpart of init_per_suite/1. Returns `ok'.
end_per_suite(_Config) ->
    NoExtraApps = [],
    ok = emqx_common_test_helpers:stop_apps(NoExtraApps).
%% Group setup: records the group name under `cluster_type' in Config so
%% that per-testcase setup and helpers can tell which layout is in use.
%% Only the `one_node' and `two_node' groups are accepted.
init_per_group(Group, Config) when Group =:= one_node; Group =:= two_node ->
    [{cluster_type, Group} | Config].
%% Group teardown: nothing to clean up.
end_per_group(_Group, _Config) -> ok.
%% Per-testcase setup.
%%
%% Builds the list of `{NodeName, Port}' pairs for the current cluster type
%% (an `_evacuated' node on 2883, plus a `_recipient' node on 3883 for
%% two-node cases), starts the cluster with the eviction agent and node
%% rebalance applications, starts snabbkaffe tracing, and stores the value
%% returned by start_cluster/3 under `cluster_nodes' in Config.
init_per_testcase(Case, Config) ->
    Roles =
        case ?config(cluster_type, Config) of
            one_node ->
                [{'_evacuated', 2883}];
            two_node ->
                [{'_evacuated', 2883}, {'_recipient', 3883}]
        end,
    NodesWithPorts = [
        {case_specific_node_name(?MODULE, Case, Suffix), Port}
     || {Suffix, Port} <- Roles
    ],
    Apps = [emqx_eviction_agent, emqx_node_rebalance],
    Env = [{emqx, data_dir, case_specific_data_dir(Case, Config)}],
    ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster(NodesWithPorts, Apps, Env),
    ok = snabbkaffe:start_trace(),
    [{cluster_nodes, ClusterNodes} | Config].
%% Per-testcase teardown: stops snabbkaffe, then stops the cluster that
%% init_per_testcase/2 stored under `cluster_nodes'.
end_per_testcase(_Case, Config) ->
    ok = snabbkaffe:stop(),
    ClusterNodes = ?config(cluster_nodes, Config),
    Apps = [emqx_eviction_agent, emqx_node_rebalance],
    ok = emqx_eviction_agent_test_helpers:stop_cluster(ClusterNodes, Apps).
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% One node tests
%% Evacuation must refuse to start with `{error, eviction_agent_busy}' when
%% the eviction agent on the node has already been enabled with
%% `other_rebalance'.
t_agent_busy(Config) ->
    [{Node, _Port}] = ?config(cluster_nodes, Config),
    ok = rpc:call(Node, emqx_eviction_agent, enable, [other_rebalance, undefined]),
    StartResult = rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)]),
    ?assertEqual({error, eviction_agent_busy}, StartResult).
%% Starting evacuation a second time on the same node must return
%% `{error, already_started}'.
t_already_started(Config) ->
    [{Node, _Port}] = ?config(cluster_nodes, Config),
    ok = rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)]),
    SecondStart = rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)]),
    ?assertEqual({error, already_started}, SecondStart).
%% Stopping evacuation that was never started must return
%% `{error, not_started}'.
t_not_started(Config) ->
    [{Node, _Port}] = ?config(cluster_nodes, Config),
    StopResult = rpc:call(Node, emqx_node_rebalance_evacuation, stop, []),
    ?assertEqual({error, not_started}, StopResult).
%% Once evacuation is started on a node, a new connection attempt to that
%% node must fail with `use_another_server'.
t_start(Config) ->
    process_flag(trap_exit, true),
    [{Node, Port}] = ?config(cluster_nodes, Config),
    ok = rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)]),
    ConnectResult = emqtt_try_connect([{port, Port}]),
    ?assertMatch({error, {use_another_server, #{}}}, ConnectResult).
%% Evacuation state must survive a restart of the evacuation process.
%%
%% Starts evacuation, checks that new connections are refused, terminates
%% and restarts the `emqx_node_rebalance_evacuation' child of
%% `emqx_node_rebalance_sup', then checks that connections are still refused
%% and that status/0 still reports `enabled' with the original
%% `conn_evict_rate'.
t_persistence(Config) ->
    process_flag(trap_exit, true),
    [{Node, Port}] = ?config(cluster_nodes, Config),
    ok = rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)]),
    ?assertMatch(
        {error, {use_another_server, #{}}},
        emqtt_try_connect([{port, Port}])
    ),
    %% Same [SupRef, ChildId] argument list for both supervisor calls.
    ChildRef = [emqx_node_rebalance_sup, emqx_node_rebalance_evacuation],
    ok = rpc:call(Node, supervisor, terminate_child, ChildRef),
    {ok, _} = rpc:call(Node, supervisor, restart_child, ChildRef),
    ?assertMatch(
        {error, {use_another_server, #{}}},
        emqtt_try_connect([{port, Port}])
    ),
    ?assertMatch(
        {enabled, #{conn_evict_rate := 10}},
        rpc:call(Node, emqx_node_rebalance_evacuation, status, [])
    ).
%% The evacuation process must tolerate unexpected input: a raw message and
%% a cast of `unknown' are sent to it, and a call with `unknown' must be
%% answered with `ignored'.
t_unknown_messages(Config) ->
    process_flag(trap_exit, true),
    [{Node, _Port}] = ?config(cluster_nodes, Config),
    ok = rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)]),
    EvacuationPid = rpc:call(Node, erlang, whereis, [emqx_node_rebalance_evacuation]),
    EvacuationPid ! unknown,
    ok = gen_server:cast(EvacuationPid, unknown),
    CallReply = gen_server:call(EvacuationPid, unknown),
    ?assertEqual(ignored, CallReply).
%% Two node tests
%% Starting evacuation on the donor node must evict an already connected
%% client and refuse new connections.
%%
%% Steps:
%%   1. connect a client to the donor node;
%%   2. start evacuation and wait for the `node_evacuation_evict_conn'
%%      trace event (timeout argument 1000);
%%   3. check that a new connection attempt fails with `use_another_server';
%%   4. check that the first client exits with a `disconnected' reason
%%      carrying ?RC_USE_ANOTHER_SERVER.
t_conn_evicted(Config) ->
    %% Trap exits so the client's termination is delivered to this process
    %% as an 'EXIT' message, which is matched in the receive below.
    process_flag(trap_exit, true),
    [{DonorNode, DonorPort}, _] = ?config(cluster_nodes, Config),
    {ok, C} = emqtt_connect([{clientid, <<"evacuated">>}, {port, DonorPort}]),
    ?assertWaitEvent(
        ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [opts(Config)]),
        #{?snk_kind := node_evacuation_evict_conn},
        1000
    ),
    ?assertMatch(
        {error, {use_another_server, #{}}},
        emqtt_try_connect([{clientid, <<"connecting">>}, {port, DonorPort}])
    ),
    receive
        %% Use the named reason code (as t_session_evicted does) instead of
        %% the bare number 156.
        {'EXIT', C, {disconnected, ?RC_USE_ANOTHER_SERVER, _}} -> ok
    after 1000 ->
        ct:fail("Connection not evicted")
    end.
%% Checks emqx_node_rebalance_evacuation:migrate_to/1 as seen from the donor:
%%   * with `undefined' it returns the recipient node;
%%   * with a list holding only an unknown node it returns [];
%%   * after the eviction agent is enabled on the recipient, `undefined'
%%     yields [] as well.
t_migrate_to(Config) ->
    [{Donor, _}, {Recipient, _}] = ?config(cluster_nodes, Config),
    MigrateTo = fun(Requested) ->
        rpc:call(Donor, emqx_node_rebalance_evacuation, migrate_to, [Requested])
    end,
    ?assertEqual([Recipient], MigrateTo(undefined)),
    ?assertEqual([], MigrateTo(['unknown@node'])),
    ok = rpc:call(Recipient, emqx_eviction_agent, enable, [test_rebalance, undefined]),
    ?assertEqual([], MigrateTo(undefined)).
%% A client connected with `clean_start = false' must be disconnected with
%% ?RC_USE_ANOTHER_SERVER when the donor is evacuated, and afterwards the
%% only channel registered for its client id must live on the recipient node.
t_session_evicted(Config) ->
    process_flag(trap_exit, true),
    [{Donor, DonorPort}, {Recipient, _}] = ?config(cluster_nodes, Config),
    ConnectOpts = [
        {port, DonorPort}, {clientid, <<"client_with_sess">>}, {clean_start, false}
    ],
    {ok, Client} = emqtt_connect(ConnectOpts),
    ?assertWaitEvent(
        ok = rpc:call(Donor, emqx_node_rebalance_evacuation, start, [opts(Config)]),
        #{?snk_kind := node_evacuation_evict_sess_over},
        5000
    ),
    receive
        {'EXIT', Client, {disconnected, ?RC_USE_ANOTHER_SERVER, _}} -> ok
    after 1000 ->
        ct:fail("Connection not evicted")
    end,
    [ChanPid] = rpc:call(Donor, emqx_cm_registry, lookup_channels, [<<"client_with_sess">>]),
    ChanNode = node(ChanPid),
    ?assertEqual(Recipient, ChanNode).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% Evacuation options shared by the test cases; `migrate_to' is derived from
%% the cluster layout via migrate_to/1.
opts(Config) ->
    Recipients = migrate_to(Config),
    #{
        migrate_to => Recipients,
        wait_takeover => 1,
        sess_evict_rate => 10,
        conn_evict_rate => 10,
        server_reference => <<"srv">>
    }.
%% Nodes to pass as the `migrate_to' option: none for a one-node cluster,
%% the second node of `cluster_nodes' for a two-node cluster.
migrate_to(Config) ->
    ClusterType = ?config(cluster_type, Config),
    case ClusterType of
        one_node ->
            [];
        two_node ->
            [_Donor, {Recipient, _Port}] = ?config(cluster_nodes, Config),
            [Recipient]
    end.
%% Data directory for a test case: `<priv_dir>/<Case>', or `undefined' when
%% Config has no `priv_dir'.
case_specific_data_dir(Case, Config) ->
    PrivDir = ?config(priv_dir, Config),
    if
        PrivDir =:= undefined -> undefined;
        true -> filename:join(PrivDir, atom_to_list(Case))
    end.

View File

@ -0,0 +1,108 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%--------------------------------------------------------------------
-module(emqx_node_rebalance_evacuation_persist_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
%% Common Test entry point: the case list is whatever
%% emqx_common_test_helpers:all/1 returns for this module.
all() ->
    Suite = ?MODULE,
    emqx_common_test_helpers:all(Suite).
%% Suite setup: nothing to prepare, Config is returned unchanged.
init_per_suite(Config) -> Config.
%% Suite teardown: nothing to clean up.
end_per_suite(_Config) -> ok.
%% Per-testcase setup: clears any persisted evacuation state (the result of
%% clear/0 is deliberately ignored) and returns Config unchanged.
init_per_testcase(_Case, Config) ->
    _Ignored = emqx_node_rebalance_evacuation_persist:clear(),
    Config.
%% Per-testcase teardown: clears persisted evacuation state. The value
%% returned by clear/0 is not checked and becomes the function's result.
end_per_testcase(_Case, _Config) ->
    _ClearResult = emqx_node_rebalance_evacuation_persist:clear().
%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------
%% Options written with save/1 must be returned unchanged by read/1, taking
%% precedence over the defaults passed to read/1. This is checked twice:
%% with a binary `server_reference' and with `server_reference' set to
%% `undefined'.
t_save_read(_Config) ->
    Defaults = #{
        server_reference => <<"default_ref">>,
        conn_evict_rate => 2001,
        sess_evict_rate => 2002,
        wait_takeover => 2003
    },
    Saved = #{
        server_reference => <<"ref">>,
        conn_evict_rate => 1001,
        sess_evict_rate => 1002,
        wait_takeover => 1003
    },
    ok = emqx_node_rebalance_evacuation_persist:save(Saved),
    {ok, Loaded} = emqx_node_rebalance_evacuation_persist:read(Defaults),
    ?assertEqual(Saved, Loaded),
    SavedNoRef = Saved#{server_reference := undefined},
    ok = emqx_node_rebalance_evacuation_persist:save(SavedNoRef),
    {ok, LoadedNoRef} = emqx_node_rebalance_evacuation_persist:read(Defaults),
    ?assertEqual(SavedNoRef, LoadedNoRef).
%% When the evacuation file contains an empty JSON object, read/1 must
%% return exactly the defaults it was given.
t_read_default(_Config) ->
    ok = write_evacuation_file(<<"{}">>),
    Defaults = #{
        wait_takeover => 1003,
        sess_evict_rate => 1002,
        conn_evict_rate => 1001,
        server_reference => <<"ref">>
    },
    {ok, Loaded} = emqx_node_rebalance_evacuation_persist:read(Defaults),
    ?assertEqual(Defaults, Loaded).
%% When the evacuation file contains malformed JSON, read/1 must still
%% succeed and return exactly the defaults it was given.
t_read_bad_data(_Config) ->
    ok = write_evacuation_file(<<"{bad json">>),
    Defaults = #{
        wait_takeover => 1003,
        sess_evict_rate => 1002,
        conn_evict_rate => 1001,
        server_reference => <<"ref">>
    },
    {ok, Loaded} = emqx_node_rebalance_evacuation_persist:read(Defaults),
    ?assertEqual(Defaults, Loaded).
%% read/1 returns `{ok, _}' while an evacuation file exists and `none' after
%% clear/0 has been called.
t_clear(_Config) ->
    ok = write_evacuation_file(<<"{}">>),
    BeforeClear = emqx_node_rebalance_evacuation_persist:read(#{}),
    ?assertMatch({ok, _}, BeforeClear),
    ok = emqx_node_rebalance_evacuation_persist:clear(),
    AfterClear = emqx_node_rebalance_evacuation_persist:read(#{}),
    ?assertEqual(none, AfterClear).
%%--------------------------------------------------------------------
%% Helpers
%%--------------------------------------------------------------------
%% Write `Json' (raw file content) to the evacuation file used by
%% emqx_node_rebalance_evacuation_persist, creating parent directories first.
%% Returns `ok'; any failure crashes the test with a badmatch.
write_evacuation_file(Json) ->
    %% Resolve the path once so that the directory that is ensured and the
    %% file that is written are guaranteed to refer to the same location.
    Filepath = emqx_node_rebalance_evacuation_persist:evacuation_filepath(),
    ok = filelib:ensure_dir(Filepath),
    ok = file:write_file(Filepath, Json).

View File

@ -9,18 +9,7 @@
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-define(assertWaitEvent(Code, EventMatch, Timeout),
?assertMatch(
{_, {ok, EventMatch}},
?wait_async_action(
Code,
EventMatch,
Timeout
)
)
).
-include_lib("emqx/include/asserts.hrl").
all() -> emqx_common_test_helpers:all(?MODULE).

View File

@ -72,4 +72,6 @@ is_running_node(Node) ->
handle_result({ok, Result}) ->
?OK(Result);
handle_result({error, Reason}) ->
?BAD_REQUEST(Reason).
?BAD_REQUEST(Reason);
handle_result({HTTPCode, Content}) when is_integer(HTTPCode) ->
{HTTPCode, Content}.

View File

@ -0,0 +1,2 @@
Add node rebalance/node evacuation functionality.
See also: [design doc](https://github.com/emqx/eip/blob/main/active/0020-node-rebalance.md)

View File

@ -412,6 +412,8 @@ defmodule EMQXUmbrella.MixProject do
emqx_bridge_oracle: :permanent,
emqx_bridge_rabbitmq: :permanent,
emqx_ee_schema_registry: :permanent,
emqx_eviction_agent: :permanent,
emqx_node_rebalance: :permanent,
emqx_ft: :permanent
],
else: []

View File

@ -481,6 +481,8 @@ relx_apps_per_edition(ee) ->
emqx_bridge_oracle,
emqx_bridge_rabbitmq,
emqx_ee_schema_registry,
emqx_eviction_agent,
emqx_node_rebalance,
emqx_ft
];
relx_apps_per_edition(ce) ->

View File

@ -0,0 +1,9 @@
emqx_eviction_agent_api {
node_eviction_status_get.desc:
"""Get the node eviction status"""
node_eviction_status_get.label:
"""Node Eviction Status"""
}

View File

@ -0,0 +1,267 @@
emqx_node_rebalance_api {
load_rebalance_status.desc:
"""Get rebalance status of the current node"""
load_rebalance_status.label:
"""Get rebalance status"""
load_rebalance_global_status.desc:
"""Get status of all rebalance/evacuation processes across the cluster"""
load_rebalance_global_status.label:
"""Get global rebalance status"""
load_rebalance_availability_check.desc:
"""Check if the node is being evacuated or rebalanced"""
load_rebalance_availability_check.label:
"""Availability check"""
load_rebalance_start.desc:
"""Start rebalance process"""
load_rebalance_start.label:
"""Start rebalance"""
load_rebalance_stop.desc:
"""Stop rebalance process"""
load_rebalance_stop.label:
"""Stop rebalance"""
load_rebalance_evacuation_start.desc:
"""Start evacuation process"""
load_rebalance_evacuation_start.label:
"""Start evacuation"""
load_rebalance_evacuation_stop.desc:
"""Stop evacuation process"""
load_rebalance_evacuation_stop.label:
"""Stop evacuation"""
param_node.desc:
"""Node name"""
param_node.label:
"""Node name"""
wait_health_check.desc:
"""Time to wait before starting the rebalance process, in seconds"""
wait_health_check.label:
"""Wait health check"""
conn_evict_rate.desc:
"""The rate of evicting connections, in connections per second"""
conn_evict_rate.label:
"""Connection eviction rate"""
sess_evict_rate.desc:
"""The rate of evicting sessions, in sessions per second"""
sess_evict_rate.label:
"""Session eviction rate"""
abs_conn_threshold.desc:
"""Maximum desired difference between the number of connections on the node and the average number of connections on the recipient nodes. Difference lower than this is the goal of the rebalance process."""
abs_conn_threshold.label:
"""Absolute connection threshold"""
rel_conn_threshold.desc:
"""Maximum desired fraction between the number of connections on the node and the average number of connections on the recipient nodes. Fraction lower than this is the goal of the rebalance process."""
rel_conn_threshold.label:
"""Relative connection threshold"""
abs_sess_threshold.desc:
"""Maximum desired difference between the number of sessions on the node and the average number of sessions on the recipient nodes. Difference lower than this is the goal of the rebalance process."""
abs_sess_threshold.label:
"""Absolute session threshold"""
rel_sess_threshold.desc:
"""Maximum desired fraction between the number of sessions on the node and the average number of sessions on the recipient nodes. Fraction lower than this is the goal of the rebalance process."""
rel_sess_threshold.label:
"""Relative session threshold"""
wait_takeover.desc:
"""Time to wait before starting session evacuation process, in seconds"""
wait_takeover.label:
"""Wait takeover"""
redirect_to.desc:
"""Server reference to redirect clients to (MQTTv5 Server redirection)"""
redirect_to.label:
"""Redirect to"""
migrate_to.desc:
"""Nodes to migrate sessions to"""
migrate_to.label:
"""Migrate to"""
rebalance_nodes.desc:
"""Nodes to participate in rebalance"""
rebalance_nodes.label:
"""Rebalance nodes"""
local_status_enabled.desc:
"""Whether the node is being evacuated"""
local_status_enabled.label:
"""Local evacuation status"""
local_status_process.desc:
"""The type of the task that is being performed on the node: 'evacuation' or 'rebalance'"""
local_status_process.label:
"""Task Type"""
local_status_state.desc:
"""The state of the process that is being performed on the node"""
local_status_state.label:
"""Rebalance/evacuation current state"""
local_status_coordinator_node.desc:
"""The node that is coordinating rebalance process"""
local_status_coordinator_node.label:
"""Coordinator node"""
local_status_connection_eviction_rate.desc:
"""The rate of evicting connections, in connections per second"""
local_status_connection_eviction_rate.label:
"""Connection eviction rate"""
local_status_session_eviction_rate.desc:
"""The rate of evicting sessions, in sessions per second"""
local_status_session_eviction_rate.label:
"""Session eviction rate"""
local_status_connection_goal.desc:
"""The number of connections that the node should have after the rebalance/evacuation process"""
local_status_connection_goal.label:
"""Connection goal"""
local_status_session_goal.desc:
"""The number of sessions that the node should have after the evacuation process"""
local_status_session_goal.label:
"""Session goal"""
local_status_disconnected_session_goal.desc:
"""The number of disconnected sessions that the node should have after the rebalance process"""
local_status_disconnected_session_goal.label:
"""Disconnected session goal"""
local_status_session_recipients.desc:
"""List of nodes to which sessions are being evacuated"""
local_status_session_recipients.label:
"""Session recipients"""
local_status_recipients.desc:
"""List of nodes to which connections/sessions are being evacuated during rebalance"""
local_status_recipients.label:
"""Recipients"""
local_status_stats.desc:
"""Statistics of the evacuation/rebalance process"""
local_status_stats.label:
"""Statistics"""
status_stats_initial_connected.desc:
"""The number of connections on the node before the evacuation/rebalance process"""
status_stats_initial_connected.label:
"""Initial connected"""
status_stats_current_connected.desc:
"""Current number of connections on the node"""
status_stats_current_connected.label:
"""Current connections"""
status_stats_initial_sessions.desc:
"""The number of sessions on the node before the evacuation/rebalance process"""
status_stats_initial_sessions.label:
"""Initial sessions"""
status_stats_current_sessions.desc:
"""Current number of sessions on the node"""
status_stats_current_sessions.label:
"""Current sessions"""
status_stats_current_disconnected_sessions.desc:
"""Current number of disconnected sessions on the node"""
status_stats_current_disconnected_sessions.label:
"""Current disconnected sessions"""
coordinator_status_donors.desc:
"""List of nodes from which connections/sessions are being evacuated"""
coordinator_status_donors.label:
"""Donors"""
coordinator_status_donor_conn_avg.desc:
"""Average number of connections per donor node"""
coordinator_status_donor_conn_avg.label:
"""Donor connections average"""
coordinator_status_donor_sess_avg.desc:
"""Average number of sessions per donor node"""
coordinator_status_donor_sess_avg.label:
"""Donor sessions average"""
coordinator_status_node.desc:
"""The node that is coordinating the evacuation/rebalance process"""
coordinator_status_node.label:
"""Coordinator node"""
evacuation_status_node.desc:
"""The node that is being evacuated"""
evacuation_status_node.label:
"""Evacuated node"""
global_status_evacuations.desc:
"""List of nodes that are being evacuated"""
global_status_evacuations.label:
"""Evacuations"""
global_status_rebalances.desc:
"""List of nodes that coordinate a rebalance"""
global_status_rebalances.label:
"""Rebalances"""
empty_response.desc:
"""The response is empty"""
empty_response.label:
"""Empty response"""
}

View File

@ -0,0 +1,9 @@
emqx_eviction_agent_api {
node_eviction_status_get.desc:
"""获取节点驱逐状态"""
node_eviction_status_get.label:
"""节点驱逐状态"""
}

View File

@ -0,0 +1,266 @@
emqx_node_rebalance_api {
load_rebalance_status.desc:
"""获取当前节点的重平衡状态"""
load_rebalance_status.label:
"""获取重平衡状态"""
load_rebalance_global_status.desc:
"""获取集群中所有重平衡/疏散任务的状态"""
load_rebalance_global_status.label:
"""获取全局重平衡状态"""
load_rebalance_availability_check.desc:
"""检查节点是否正在被执行重平衡或疏散"""
load_rebalance_availability_check.label:
"""可用性检查"""
load_rebalance_start.desc:
"""启动重平衡任务"""
load_rebalance_start.label:
"""启动重平衡"""
load_rebalance_stop.desc:
"""停止重平衡任务"""
load_rebalance_stop.label:
"""停止重平衡"""
load_rebalance_evacuation_start.desc:
"""启动疏散任务"""
load_rebalance_evacuation_start.label:
"""启动疏散"""
load_rebalance_evacuation_stop.desc:
"""停止疏散任务"""
load_rebalance_evacuation_stop.label:
"""停止疏散"""
param_node.desc:
"""节点名称"""
param_node.label:
"""节点名称"""
wait_health_check.desc:
"""启动重平衡任务前等待的时间,单位为秒"""
wait_health_check.label:
"""等待健康检查"""
conn_evict_rate.desc:
"""每秒迁出连接数"""
conn_evict_rate.label:
"""迁出速率"""
sess_evict_rate.desc:
"""每秒迁出会话数"""
sess_evict_rate.label:
"""会话迁出速率"""
abs_conn_threshold.desc:
"""当前节点上的连接数与迁入节点上的平均连接数的差值(绝对值)上限,低于该差值时停止迁移连接。"""
abs_conn_threshold.label:
"""连接数差值"""
rel_conn_threshold.desc:
"""当前节点上的连接数与迁入节点上的平均连接数的比值上限,低于该比值时停止迁移连接。"""
rel_conn_threshold.label:
"""连接数比值"""
abs_sess_threshold.desc:
"""当前节点上的会话数与迁入节点上的平均会话数之间的差值(绝对值)上限,低于该差值时停止迁移会话。"""
abs_sess_threshold.label:
"""会话数差值"""
rel_sess_threshold.desc:
"""当前节点上的会话数与迁入节点上的平均会话数的比值上限,低于该比值时停止迁移会话。"""
rel_sess_threshold.label:
"""会话数比值"""
wait_takeover.desc:
"""开始会话疏散任务之前的等待时间,以秒为单位"""
wait_takeover.label:
"""等待接管"""
redirect_to.desc:
"""将客户端重定向到的服务器参考(MQTTv5 服务器重定向)"""
redirect_to.label:
"""重定向至"""
migrate_to.desc:
"""接受会话迁入的节点"""
migrate_to.label:
"""迁入节点"""
rebalance_nodes.desc:
"""参与重平衡的节点"""
rebalance_nodes.label:
"""重新平衡节点"""
local_status_enabled.desc:
"""节点是否正在执行重平衡疏散任务"""
local_status_enabled.label:
"""运行状态"""
local_status_process.desc:
"""正在节点上执行的任务:'evacuation' 或 'rebalance'"""
local_status_process.label:
"""节点任务"""
local_status_state.desc:
"""正在节点上执行的任务的状态"""
local_status_state.label:
"""重新平衡/疏散当前状态"""
local_status_coordinator_node.desc:
"""协调分配重平衡任务的节点"""
local_status_coordinator_node.label:
"""协调节点"""
local_status_connection_eviction_rate.desc:
"""每秒迁出的连接数"""
local_status_connection_eviction_rate.label:
"""连接迁出速率"""
local_status_session_eviction_rate.desc:
"""每秒迁出的会话数"""
local_status_session_eviction_rate.label:
"""会话迁出速率"""
local_status_connection_goal.desc:
"""节点在重新平衡/疏散任务完成后预期拥有的连接数"""
local_status_connection_goal.label:
"""连接数目标"""
local_status_session_goal.desc:
"""疏散任务完成后节点预期的会话数"""
local_status_session_goal.label:
"""会话数目标"""
local_status_disconnected_session_goal.desc:
"""重新平衡任务完成后节点预期的无连接的会话数"""
local_status_disconnected_session_goal.label:
"""预期无连接会话数"""
local_status_session_recipients.desc:
"""会话被迁入的节点列表"""
local_status_session_recipients.label:
"""会话迁入节点"""
local_status_recipients.desc:
"""在重新平衡期间接受连接/会话迁入的节点列表"""
local_status_recipients.label:
"""接受迁入节点"""
local_status_stats.desc:
"""疏散/重平衡的统计"""
local_status_stats.label:
"""统计数据"""
status_stats_initial_connected.desc:
"""疏散/重新平衡任务开始之前节点上的连接数"""
status_stats_initial_connected.label:
"""初始连接"""
status_stats_current_connected.desc:
"""节点上的当前连接数"""
status_stats_current_connected.label:
"""当前连接"""
status_stats_initial_sessions.desc:
"""疏散/重新平衡任务开始之前节点上的会话数"""
status_stats_initial_sessions.label:
"""初始会话"""
status_stats_current_sessions.desc:
"""节点上的当前会话数"""
status_stats_current_sessions.label:
"""当前会话"""
status_stats_current_disconnected_sessions.desc:
"""节点上当前无连接的会话数"""
status_stats_current_disconnected_sessions.label:
"""当前无连接会话"""
coordinator_status_donors.desc:
"""正在迁出连接/会话的节点列表"""
coordinator_status_donors.label:
"""迁出节点"""
coordinator_status_donor_conn_avg.desc:
"""每个迁出节点的平均连接数"""
coordinator_status_donor_conn_avg.label:
"""迁出节点连接平均值"""
coordinator_status_donor_sess_avg.desc:
"""每个迁出节点的平均会话数"""
coordinator_status_donor_sess_avg.label:
"""迁出节点会话平均数"""
coordinator_status_node.desc:
"""协调分配疏散/重平衡任务的节点"""
coordinator_status_node.label:
"""协调节点"""
evacuation_status_node.desc:
"""正在迁出的节点"""
evacuation_status_node.label:
"""疏散节点"""
global_status_evacuations.desc:
"""正在迁出的节点列表"""
global_status_evacuations.label:
"""疏散"""
global_status_rebalances.desc:
"""协调重平衡的节点列表"""
global_status_rebalances.label:
"""重平衡"""
empty_response.desc:
"""响应为空"""
empty_response.label:
"""空响应"""
}