feat: add emqx_cluster_rescue
This commit is contained in:
parent
265d3200c2
commit
a6f14c255d
|
@ -0,0 +1,187 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
# ==================================
|
||||||
|
# RESCUE THE UNBOOTABLE EMQX CLUSTER
|
||||||
|
# ==================================
|
||||||
|
|
||||||
|
## Global Vars
|
||||||
|
# Borrowed from emqx_ctl: locate the directory holding this script, following a symlink if "$0" is one
|
||||||
|
THIS_DIR="$(cd "$(dirname "$(readlink "$0" || echo "$0")")" || true; pwd -P)"
|
||||||
|
|
||||||
|
# Print the help text to stdout: warnings, the intended use case and one
# entry per sub-command. "$Script" in the text is the base name of "$0".
usage() {
    local Script
    Script=$(basename "$0")

    echo "
RESCUE THE UNBOOTABLE EMQX CLUSTER

Use this script only when the entire cluster is stuck at booting & loading.

This script provides a list of methods to *hack* the DB of EMQX to bring
the cluster back to service but MAY come with some side effects including:

- Data loss
- Inconsistent data in the cluster
- Other undefined behaviors

*DO NOT* use this script unless you understand the consequences.
*DO NOT* use this script when EMQX cluster is partitioned.

Use Case:

- Lost one node due to unrecoverable failures (hardware, cloud resource outage)
and this node prevents other nodes in the cluster from starting.

Usage:

# For troubleshooting, find out all the tables that are pending at loading
$Script pending-tables

# For troubleshooting, debug print detailed table info that is pending at loading.
$Script table-details

# Force load one [Tab] or all pending tables from node local storage to bring this node up
# Use local data as the data source for the pending tables, should bring up the node immediately and
# spread the data to other nodes in the cluster.
#
# * Take effect immediately
# * This is a node local change but the change will be lost after restart.
$Script force-load [Tab]

# Remove Node from mnesia cluster.
# Most likely will fail if the remote Node is unreachable.
#
# * This is a cluster wide schema change.
$Script remove-node Node

# Set master node for distributed DB
# The master node will be the data source for pending tables.
#
# * This is a node local change
# * Node could be a remote Erlang node in the cluster or local erlang node
# * Use command: 'unset-master' to rollback
$Script set-master Node

# Unset master node for distributed DB, this is a node local change
$Script unset-master

# Cheat the local node that RemoteNode is down so that it will not wait for it to come up.
# Local node will take local data as the data source for pending tables and spread the data
# to the other pending nodes.
#
# * Check EMQX logs to find out which remote node(s) the local node is waiting for
# * To take effect, restart this EMQX node
# * This is a node local setting

$Script lie-node-down RemoteNode

Tips:
- Override local node name with envvar: \$EMQX_NODE__NAME
"
}
|
||||||
|
|
||||||
|
# Functions
|
||||||
|
#
|
||||||
|
# Print, for every local table whose load_node is still 'unknown', the table
# name followed by a selected subset of its mnesia:table_info/2 properties.
# The Erlang expression is handed to "$THIS_DIR/emqx eval"; exec replaces the
# current shell, so this function does not return on success.
print_pending_tables() {
    local expr
    expr='[ io:format("~p :: ~p~n", [T, maps:with([all_nodes, load_order, storage_type,
active_replicas, local_content, load_by_force,
load_node, load_reason, master_nodes]
, maps:from_list(mnesia:table_info(T, all)))])
|| T <- mnesia:system_info(local_tables), unknown =:= mnesia:table_info(T, load_node) ],
ok
'
    exec "$THIS_DIR/emqx" eval "$expr"
}
|
||||||
|
|
||||||
|
# Print the complete mnesia:table_info(T, all) output for every local table
# whose load_node is still 'unknown'. Evaluated through "$THIS_DIR/emqx eval";
# exec replaces the current shell.
print_details_per_table() {
    local expr
    expr='[ io:format("~p :: ~p~n", [T, mnesia:table_info(T, all)])
|| T <- mnesia:system_info(local_tables), unknown =:= mnesia:table_info(T, load_node) ],
ok
'
    exec "$THIS_DIR/emqx" eval "$expr"
}
|
||||||
|
|
||||||
|
# Force-load tables from node-local storage.
#
# Arguments:
#   (none)  force-load every local table whose load_node is still 'unknown'
#   Tab     force-load only the table named Tab
#
# Any other argument count prints the usage text and returns 1 instead of
# silently falling back to "all pending tables", which is what extra
# arguments used to trigger.
force-load() {
    local erl_cmd
    case $# in
        0)
            erl_cmd='[ {T, mnesia:force_load_table(T)}
|| T <- mnesia:system_info(local_tables),
unknown =:= mnesia:table_info(T, load_node)
]
'
            ;;
        1)
            # Quote the table name so it is always read as an Erlang atom,
            # the same way the node-name commands quote their argument.
            erl_cmd="mnesia:force_load_table('${1}')"
            ;;
        *)
            usage
            return 1
            ;;
    esac
    # exec replaces the current shell with the emqx command.
    exec "$THIS_DIR/emqx" eval "$erl_cmd"
}
|
||||||
|
|
||||||
|
# Remove a node from the schema of every pending table.
#
# Arguments:
#   Node  name of the Erlang node to remove
#
# The evaluated Erlang expression collects the local tables whose load_node
# is 'unknown', shows them, and asks for confirmation on stdin ("yes.").
# On confirmation each table is force-loaded and the copy on Node is deleted;
# on Ctrl-D or any other answer nothing is changed.
#
# Without exactly one argument the usage text is printed and 1 is returned.
# Previously a missing argument aborted with a bare "unbound variable" error
# because the script runs under "set -u".
remove-node() {
    if [ $# -ne 1 ]; then
        usage
        return 1
    fi
    local target_node=$1
    local erl_cmd="
case [T || T <- mnesia:system_info(local_tables), unknown =:= mnesia:table_info(T, load_node)] of
[] ->
io:format(\"No table need to load\\n\"),
skipped;
TargetTables ->
io:format(\"Going to remove node ${target_node} from schema of the tables:~n~p~n\", [TargetTables]),
case io:read(\"confirm? [yes.] OR Ctrl-D to skip: \") of
{ok, yes} ->
lists:map(fun(T) ->
mnesia:force_load_table(T),
{T, mnesia:del_table_copy(T, '${target_node}') }
end, TargetTables);
eof -> skipped;
R -> {skipped, R}
end
end
"
    # exec replaces the current shell with the emqx command.
    exec "$THIS_DIR/emqx" eval "$erl_cmd"
}
|
||||||
|
|
||||||
|
# Set or clear the mnesia master nodes on this node.
#
# With exactly one argument that node becomes the only master node; with any
# other argument count the master node list is set to empty. In both cases
# mnesia_recover:dump_decision_tab() is evaluated afterwards. exec replaces
# the current shell with the emqx command.
set-master-node() {
    local master_list
    case $# in
        1) master_list="['${1}']" ;;
        *) master_list="[]" ;;
    esac

    exec "$THIS_DIR/emqx" eval "mnesia:set_master_nodes(${master_list}), mnesia_recover:dump_decision_tab()"
}
|
||||||
|
|
||||||
|
# Record on the local node that RemoteNode is down.
#
# Arguments:
#   RemoteNode  name of the Erlang node to mark as down
#
# Evaluates mnesia_recover:log_mnesia_down/1 for that node followed by
# mnesia_recover:dump_decision_tab(). With any other argument count only the
# usage text is printed.
lie-node-down() {
    if [ $# -ne 1 ]; then
        usage
        return
    fi

    local down_expr
    down_expr="mnesia_recover:log_mnesia_down('${1}'), mnesia_recover:dump_decision_tab()"
    exec "$THIS_DIR/emqx" eval "$down_expr"
}
|
||||||
|
|
||||||
|
|
||||||
|
# Entry point: the first argument selects the sub-command (default: usage);
# the remaining arguments are passed on to it.
CMD=${1:-usage}
if [ $# -gt 0 ]; then
    shift 1
fi

case "$CMD" in
    force-load)
        force-load "$@"
        ;;
    remove-node)
        remove-node "$@"
        ;;
    pending-tables)
        print_pending_tables
        ;;
    table-details)
        print_details_per_table
        ;;
    set-master)
        # set-master-node clears the master nodes unless it gets exactly one
        # argument, so a forgotten Node must be rejected here; clearing is
        # what 'unset-master' is for.
        if [ $# -ne 1 ]; then
            usage
            exit 1
        fi
        set-master-node "$1"
        ;;
    unset-master)
        set-master-node
        ;;
    lie-node-down)
        lie-node-down "$@"
        ;;
    *)
        usage
        ;;
esac
|
8
mix.exs
8
mix.exs
|
@ -408,6 +408,14 @@ defmodule EMQXUmbrella.MixProject do
|
||||||
|
|
||||||
File.chmod!(Path.join(bin, "node_dump"), 0o755)
|
File.chmod!(Path.join(bin, "node_dump"), 0o755)
|
||||||
|
|
||||||
|
Mix.Generator.copy_file(
|
||||||
|
"bin/emqx_cluster_rescue",
|
||||||
|
Path.join(bin, "emqx_cluster_rescue"),
|
||||||
|
force: overwrite?
|
||||||
|
)
|
||||||
|
|
||||||
|
File.chmod!(Path.join(bin, "emqx_cluster_rescue"), 0o755)
|
||||||
|
|
||||||
render_template(
|
render_template(
|
||||||
"rel/BUILD_INFO",
|
"rel/BUILD_INFO",
|
||||||
assigns,
|
assigns,
|
||||||
|
|
|
@ -380,6 +380,7 @@ relx_overlay(ReleaseType, Edition) ->
|
||||||
{template, "rel/BUILD_INFO", "releases/{{release_version}}/BUILD_INFO"},
|
{template, "rel/BUILD_INFO", "releases/{{release_version}}/BUILD_INFO"},
|
||||||
{copy, "bin/emqx", "bin/emqx"},
|
{copy, "bin/emqx", "bin/emqx"},
|
||||||
{copy, "bin/emqx_ctl", "bin/emqx_ctl"},
|
{copy, "bin/emqx_ctl", "bin/emqx_ctl"},
|
||||||
|
{copy, "bin/emqx_cluster_rescue", "bin/emqx_cluster_rescue"},
|
||||||
{copy, "bin/node_dump", "bin/node_dump"},
|
{copy, "bin/node_dump", "bin/node_dump"},
|
||||||
{copy, "bin/install_upgrade.escript", "bin/install_upgrade.escript"},
|
{copy, "bin/install_upgrade.escript", "bin/install_upgrade.escript"},
|
||||||
%% for relup
|
%% for relup
|
||||||
|
|
Loading…
Reference in New Issue