#!/usr/bin/env bash
set -euo pipefail

# ==================================
# RESCUE THE UNBOOTABLE EMQX CLUSTER
# ==================================
#
# Thin command-line wrapper that builds small Erlang expressions and runs them
# on the local node via "<this dir>/emqx eval <expression>".
#
# NOTE: Tab / Node arguments are spliced verbatim into Erlang source code.
# Only pass trusted values.

## Global Vars
# Borrowed from emqx_ctl: the directory containing this script (following one
# level of symlink), used to locate the sibling 'emqx' executable.
THIS_DIR="$(cd "$(dirname "$(readlink "$0" || echo "$0")")" || true; pwd -P)"

#######################################
# Print the help text to stdout.
# Arguments: none
# Returns:   0
#######################################
usage() {
    local Script=${0##*/}
    cat <<EOF

    RESCUE THE UNBOOTABLE EMQX CLUSTER

    Use this script only when the entire cluster is stuck at booting & loading.

    This script provides a list of methods to *hack* the DB of EMQX to bring back the cluster back to service but MAY come with some side effects including:

    - Data loss
    - Inconsistent data in the cluster
    - Other undefined behaviors

    *DO NOT* use this script unless you understand the consequences.
    *DO NOT* use this script when EMQX cluster is partitioned.

    Use Case:
    - Lost one node due to unrecoverable failures (hardware, cloud resource outage)
      and this node prevents other nodes in the cluster from starting.

    Usage:

    # For troubleshooting, find out all the tables that are pending at loading
    $Script pending-tables

    # For troubleshooting, debug print detailed table info that is pending at loading.
    $Script table-details

    # Force load one [Tab] or all pending tables from node local storage to bring this node up
    # Use local data as the data source for the pending tables, should bring up the node immediately and
    # spread the data to other nodes in the cluster.
    #
    # * Take effect immediately
    # * This is a node local change but the change will be lost after restart.
    $Script force-load [Tab]

    # Remove Node from mnesia cluster.
    # Most likely will fail if the remote Node is unreachable.
    #
    # * This is a cluster wide schema change.
    $Script remove-node Node

    # Set master node for distributed DB
    # The master node will be the data source for pending tables.
    #
    # * This is a node local change
    # * Node could be a remote Erlang node in the cluster or local erlang node
    # * Use command: 'unset-master' to rollback
    $Script set-master Node

    # Unset master node for distributed DB, this is a node local change
    $Script unset-master

    # Cheat the local node that RemoteNode is down so that it will not wait for it to come up.
    # Local node will take local data as the data source for pending tables and spread the data
    # to the other pending nodes.
    #
    # * Check EMQX logs to find out which remote node(s) the local node is waiting for
    # * To take effect, restart this EMQX node
    # * This is a node local setting
    $Script lie-node-down RemoteNode

    Tips:
    - Override local node name with envvar: \$EMQX_NODE__NAME

EOF
}

# Functions
#

#######################################
# Print a summary (selected table_info keys) of every local table whose
# load_node is still 'unknown', i.e. tables pending at loading.
# Globals:   THIS_DIR (read)
# Arguments: none
# Returns:   does not return; the shell is replaced by 'emqx eval'.
#######################################
print_pending_tables() {
    local erl_cmd='[ io:format("~p :: ~p~n", [T, maps:with([all_nodes, load_order, storage_type,
                                                    active_replicas, local_content, load_by_force,
                                                    load_node, load_reason, master_nodes]
                                                   , maps:from_list(mnesia:table_info(T, all)))])
                     || T <- mnesia:system_info(local_tables),
                        unknown =:= mnesia:table_info(T, load_node)
                   ], ok
                  '
    exec "$THIS_DIR/emqx" eval "$erl_cmd"
}

#######################################
# Print the complete table_info of every local table whose load_node is
# still 'unknown'.
# Globals:   THIS_DIR (read)
# Arguments: none
# Returns:   does not return; the shell is replaced by 'emqx eval'.
#######################################
print_details_per_table() {
    local erl_cmd='[ io:format("~p :: ~p~n", [T, mnesia:table_info(T, all)])
                     || T <- mnesia:system_info(local_tables),
                        unknown =:= mnesia:table_info(T, load_node)
                   ], ok
                  '
    exec "$THIS_DIR/emqx" eval "$erl_cmd"
}

#######################################
# Force-load one table, or every pending table when no table is given.
# Globals:   THIS_DIR (read)
# Arguments: [$1] - table name (spliced verbatim into the Erlang expression)
# Returns:   1 (after printing usage) when more than one argument is given;
#            otherwise does not return, the shell is replaced by 'emqx eval'.
#######################################
force-load() {
    local erl_cmd
    case $# in
        0)
            erl_cmd='[ {T, mnesia:force_load_table(T)}
                       || T <- mnesia:system_info(local_tables),
                          unknown =:= mnesia:table_info(T, load_node)
                     ]
                    '
            ;;
        1)
            erl_cmd="mnesia:force_load_table(${1})"
            ;;
        *)
            # Extra arguments used to fall through to "load ALL pending
            # tables"; refuse instead of guessing what the operator meant.
            usage
            return 1
            ;;
    esac
    exec "$THIS_DIR/emqx" eval "$erl_cmd"
}

#######################################
# Interactively remove a node from the schema of all pending tables:
# each pending table is force-loaded, then its copy on the node is deleted.
# The Erlang side asks for confirmation ("yes.") before changing anything.
# Globals:   THIS_DIR (read)
# Arguments: $1 - node name (spliced verbatim into the Erlang expression)
# Returns:   1 (after printing usage) unless exactly one argument is given;
#            otherwise does not return, the shell is replaced by 'emqx eval'.
#######################################
remove-node() {
    # Without this guard a missing Node aborts with a bare
    # "unbound variable" error because of 'set -u'.
    if [ $# -ne 1 ]; then
        usage
        return 1
    fi
    local target_node=$1
    local erl_cmd="
    case [T || T <- mnesia:system_info(local_tables),
               unknown =:= mnesia:table_info(T, load_node)] of
        [] ->
            io:format(\"No table need to load\\n\"),
            skipped;
        TargetTables ->
            io:format(\"Going to remove node ${target_node} from schema of the tables:~n~p~n\", [TargetTables]),
            case io:read(\"confirm? [yes.] OR Ctrl-D to skip: \") of
                {ok, yes} ->
                    lists:map(fun(T) ->
                                  mnesia:force_load_table(T),
                                  {T, mnesia:del_table_copy(T, '${target_node}') }
                              end, TargetTables);
                eof ->
                    skipped;
                R ->
                    {skipped, R}
            end
    end
    "
    exec "$THIS_DIR/emqx" eval "$erl_cmd"
}

#######################################
# Set (one argument) or clear (no argument) the mnesia master nodes, then
# dump the decision table.
# Globals:   THIS_DIR (read)
# Arguments: [$1] - node name to use as master; omit to unset
# Returns:   1 (after printing usage) when more than one argument is given;
#            otherwise does not return, the shell is replaced by 'emqx eval'.
#######################################
set-master-node() {
    local erl_cmd
    case $# in
        0)
            erl_cmd="mnesia:set_master_nodes([]), mnesia_recover:dump_decision_tab()"
            ;;
        1)
            erl_cmd="mnesia:set_master_nodes(['${1}']), mnesia_recover:dump_decision_tab()"
            ;;
        *)
            # Extra arguments used to silently *clear* the master nodes.
            usage
            return 1
            ;;
    esac
    exec "$THIS_DIR/emqx" eval "$erl_cmd"
}

#######################################
# Record in the local node that a remote node is down, then dump the
# decision table.
# Globals:   THIS_DIR (read)
# Arguments: $1 - remote node name
# Returns:   1 (after printing usage) unless exactly one argument is given;
#            otherwise does not return, the shell is replaced by 'emqx eval'.
#######################################
lie-node-down() {
    if [ $# -ne 1 ]; then
        usage
        return 1
    fi
    local erl_cmd="mnesia_recover:log_mnesia_down('${1}'), mnesia_recover:dump_decision_tab()"
    exec "$THIS_DIR/emqx" eval "$erl_cmd"
}

#######################################
# Command dispatcher.
# Arguments: $1 - sub-command (defaults to showing usage); rest - its arguments
# Returns:   0 after printing usage for no/unknown command, 1 on a bad
#            argument count; otherwise the shell is replaced by 'emqx eval'.
#######################################
main() {
    local cmd=${1:-usage}
    if [ $# -gt 0 ]; then
        shift 1
    fi

    case "$cmd" in
        force-load)
            force-load "$@"
            ;;
        remove-node)
            remove-node "$@"
            ;;
        pending-tables)
            print_pending_tables
            ;;
        table-details)
            print_details_per_table
            ;;
        set-master)
            # 'set-master' without a Node must not degrade into
            # 'unset-master', which is what the zero-argument form does.
            if [ $# -ne 1 ]; then
                usage
                return 1
            fi
            set-master-node "$@"
            ;;
        unset-master)
            set-master-node
            ;;
        lie-node-down)
            lie-node-down "$@"
            ;;
        *)
            usage
            ;;
    esac
}

main "$@"