fix(route schema): allow boot if all peer nodes agree on one version

This commit is contained in:
zmstone 2024-03-25 16:15:07 +01:00
parent fe3cc25855
commit 3aff9eb2a4
1 changed files with 65 additions and 56 deletions

View File

@ -656,16 +656,16 @@ init_schema() ->
ok = mria:wait_for_tables([?ROUTE_TAB, ?ROUTE_TAB_FILTERS]), ok = mria:wait_for_tables([?ROUTE_TAB, ?ROUTE_TAB_FILTERS]),
ok = emqx_trie:wait_for_tables(), ok = emqx_trie:wait_for_tables(),
ConfSchema = emqx_config:get([broker, routing, storage_schema]), ConfSchema = emqx_config:get([broker, routing, storage_schema]),
ClusterState = discover_cluster_schema_vsn(), {ClusterSchema, ClusterState} = discover_cluster_schema_vsn(),
Schema = choose_schema_vsn(ConfSchema, ClusterState), Schema = choose_schema_vsn(ConfSchema, ClusterSchema, ClusterState),
ok = persistent_term:put(?PT_SCHEMA_VSN, Schema), ok = persistent_term:put(?PT_SCHEMA_VSN, Schema),
case Schema of case Schema =:= ConfSchema of
ConfSchema -> true ->
?SLOG(info, #{ ?SLOG(info, #{
msg => "routing_schema_used", msg => "routing_schema_used",
schema => Schema schema => Schema
}); });
_ -> false ->
?SLOG(notice, #{ ?SLOG(notice, #{
msg => "configured_routing_schema_ignored", msg => "configured_routing_schema_ignored",
schema_in_use => Schema, schema_in_use => Schema,
@ -687,6 +687,7 @@ discover_cluster_schema_vsn() ->
discover_cluster_schema_vsn(emqx:running_nodes() -- [node()]). discover_cluster_schema_vsn(emqx:running_nodes() -- [node()]).
discover_cluster_schema_vsn([]) -> discover_cluster_schema_vsn([]) ->
%% single node
undefined; undefined;
discover_cluster_schema_vsn(Nodes) -> discover_cluster_schema_vsn(Nodes) ->
Responses = lists:zipwith( Responses = lists:zipwith(
@ -709,68 +710,72 @@ discover_cluster_schema_vsn(Nodes) ->
[Vsn] when Vsn =:= v1; Vsn =:= v2 -> [Vsn] when Vsn =:= v1; Vsn =:= v2 ->
{Vsn, Responses}; {Vsn, Responses};
[] -> [] ->
?SLOG(notice, #{ ?SLOG(warning, #{
msg => "cluster_routing_schema_discovery_failed", msg => "cluster_routing_schema_discovery_failed",
responses => Responses, responses => Responses,
reason => reason =>
"Could not determine configured routing storage schema in the cluster." "Could not determine configured routing storage schema in peer nodes."
}), }),
{undefined, Responses}; {undefined, Responses};
[_ | _] -> [_ | _] ->
Desc = schema_conflict_reason(config, Responses),
io:format(standard_error, "Error: ~ts~n", [Desc]),
?SLOG(critical, #{ ?SLOG(critical, #{
msg => "conflicting_routing_schemas_configured_in_cluster", msg => "conflicting_routing_schemas_in_cluster",
responses => Responses, responses => Responses,
reason => description => Desc
"There are nodes in the cluster with different configured routing "
"storage schemas. This probably means that some nodes use v1 schema "
"and some use v2, independently of each other. The routing is likely "
"broken. Manual intervention required.",
action => mk_conflict_resolution_action(Responses)
}), }),
error(conflicting_routing_schemas_configured_in_cluster) error(conflicting_routing_schemas_configured_in_cluster)
end. end.
-spec choose_schema_vsn( -spec choose_schema_vsn(
schemavsn(), schemavsn(),
_ClusterState :: {schemavsn() | undefined, [{node(), schemavsn() | undefined, _Details}]} _ClusterSchema :: schemavsn() | undefined,
_ClusterState :: [{node(), schemavsn() | undefined, _Details}]
) -> schemavsn(). ) -> schemavsn().
choose_schema_vsn(ConfSchema, {ClusterSchema, State}) -> choose_schema_vsn(ConfSchema, ClusterSchema, State) ->
case detect_table_schema_vsn() of case detect_table_schema_vsn() of
[ClusterSchema] ->
%% Table contents match configured schema in the cluster.
ClusterSchema;
[Schema] when ClusterSchema =:= undefined ->
%% There are existing records following some schema, we have to use it.
Schema;
[] -> [] ->
%% No records in the tables, use schema configured in the cluster if any, %% No records in the tables, use schema configured in the cluster if any,
%% otherwise use configured. %% otherwise use configured.
emqx_maybe:define(ClusterSchema, ConfSchema); emqx_maybe:define(ClusterSchema, ConfSchema);
ConlictingSchemas -> [Schema] when Schema =:= ClusterSchema ->
Reason = %% Table contents match configured schema in the cluster.
"There are records in the routing tables either related to both v1 " Schema;
"and v2 storage schemas, or conflicting with storage schema assumed " [Schema] when ClusterSchema =:= undefined ->
"by the cluster. This probably means that some nodes in the cluster " %% There are existing records following some schema, we have to use it.
"use v1 schema and some use v2, independently of each other. The " Schema;
"routing is likely broken. Manual intervention and full cluster " [v1, v2] when ClusterSchema =/= undefined ->
"restart is required. This node will shut down.", %% There are existing records in both v1 and v2 schema,
Action = mk_conflict_resolution_action(State), %% we have to use what the peer nodes agreed on.
%% because it could be HTIS node which caused cnoflict.
%%
%% The stale records will be left-over, but harmless
ClusterSchema;
[v1, v2] ->
Desc = schema_conflict_reason(records, State),
io:format(standard_error, "Error: ~ts~n", [Desc]),
?SLOG(critical, #{ ?SLOG(critical, #{
msg => "conflicting_routing_schemas_detected_in_cluster", msg => "conflicting_routing_storage_in_cluster",
detected => ConlictingSchemas, description => Desc
configured => ConfSchema,
configured_in_cluster => ClusterSchema,
reason => Reason,
action => Action
}), }),
io:format(
standard_error,
"Error: conflicting routing schemas detected in the cluster.\n~s\n~s\n",
[Reason, Action]
),
error(conflicting_routing_schemas_detected_in_cluster) error(conflicting_routing_schemas_detected_in_cluster)
end. end.
schema_conflict_reason(Type, State) ->
Observe =
case Type of
config ->
"Peer nodes have route storage schema resolved into conflicting versions.\n";
records ->
"There are conflicting routing records found.\n"
end,
Cause =
"\nThis was caused by a race-condition when the cluster was rolling upgraded "
"from an older version to 5.4.0, 5.4.1, 5.5.0 or 5.5.1."
"\nThis node cannot boot before the conflicts are resolved.\n",
Observe ++ Cause ++ mk_conflict_resolution_action(State).
detect_table_schema_vsn() -> detect_table_schema_vsn() ->
lists:flatten([ lists:flatten([
[v1 || _NonEmptyTrieIndex = not emqx_trie:empty()], [v1 || _NonEmptyTrieIndex = not emqx_trie:empty()],
@ -784,23 +789,27 @@ mk_conflict_resolution_action(State) ->
NodesV1 = [Node || {Node, v1, _} <- State], NodesV1 = [Node || {Node, v1, _} <- State],
NodesUnknown = [Node || {Node, unknown, _} <- State], NodesUnknown = [Node || {Node, unknown, _} <- State],
Format = Format =
"Following EMQX nodes are running with conflicting schema:" "There are two ways to resolve the conflict:"
"\n ~p" "\n"
"Please take the following steps to resolve the conflict:" "\nA: Full cluster restart: stop ALL running nodes one by one "
"\n 1. Stop listeners on those nodes: `$ emqx eval 'emqx_listener:stop()'`" "and restart them in the reversed order."
"\n 2. Wait until they are safe to restart." "\n"
"\n This could take some time, depending on the number of clients and their subscriptions." "\nB: Force v1 nodes to clean up their routes."
"\n The following conditions should be both true for each of the nodes in order to proceed:" "\n Following EMQX nodes are running with v1 schema: ~0p."
"\n * `$ emqx eval 'ets:info(emqx_subscriber, size)'` prints `0`." "\n 1. Stop listeners with command \"emqx eval 'emqx_listener:stop()'\" in all v1 nodes"
"\n * `$ emqx ctl topics list` prints `No topics.`" "\n 2. Wait until they are safe to restart."
"\n 3. Upgrade the nodes to version ~s." "\n This could take some time, depending on the number of clients and their subscriptions."
"\n 4. Restart the nodes.", "\n Below conditions should be true for each of the nodes in order to proceed:"
"\n a) Command 'ets:info(emqx_subscriber, size)' prints `0`."
"\n b) Command 'emqx ctl topics list' prints No topics.`"
"\n 3. Upgrade the nodes to 5.6.0 or newer"
"\n 4. Restart the node",
FormatUnkown = FormatUnkown =
"Additionally, the following nodes were unreachable during startup: ~p." "Additionally, the following nodes were unreachable during startup: ~0p."
"It is strongly advised to include them in the manual resolution procedure as well.", "It is strongly advised to include them in the manual resolution procedure as well.",
Message = io_lib:format(Format, [NodesV1, emqx_release:version_with_prefix()]), Message = io_lib:format(Format, [NodesV1]),
MessageUnknown = [io_lib:format(FormatUnkown, [NodesUnknown]) || NodesUnknown =/= []], MessageUnknown = [io_lib:format(FormatUnkown, [NodesUnknown]) || NodesUnknown =/= []],
unicode:characters_to_list(Message ++ "\n" ++ MessageUnknown). unicode:characters_to_list([Message, "\n", MessageUnknown]).
%%-------------------------------------------------------------------- %%--------------------------------------------------------------------
%% gen_server callbacks %% gen_server callbacks