chore: update instructions for routing schema conflict

This commit is contained in:
Ivan Dyachkov 2024-03-25 15:09:01 +01:00
parent 849fe0c2c8
commit fe3cc25855
2 changed files with 22 additions and 26 deletions

View File

@ -707,7 +707,7 @@ discover_cluster_schema_vsn(Nodes) ->
),
case lists:usort([Vsn || {_Node, Vsn, _} <- Responses, Vsn /= unknown]) of
[Vsn] when Vsn =:= v1; Vsn =:= v2 ->
Vsn;
{Vsn, Responses};
[] ->
?SLOG(notice, #{
msg => "cluster_routing_schema_discovery_failed",
@ -715,7 +715,7 @@ discover_cluster_schema_vsn(Nodes) ->
reason =>
"Could not determine configured routing storage schema in the cluster."
}),
undefined;
{undefined, Responses};
[_ | _] ->
?SLOG(critical, #{
msg => "conflicting_routing_schemas_configured_in_cluster",
@ -747,20 +747,27 @@ choose_schema_vsn(ConfSchema, {ClusterSchema, State}) ->
%% otherwise use configured.
emqx_maybe:define(ClusterSchema, ConfSchema);
ConlictingSchemas ->
Reason =
"There are records in the routing tables either related to both v1 "
"and v2 storage schemas, or conflicting with storage schema assumed "
"by the cluster. This probably means that some nodes in the cluster "
"use v1 schema and some use v2, independently of each other. The "
"routing is likely broken. Manual intervention and full cluster "
"restart is required. This node will shut down.",
Action = mk_conflict_resolution_action(State),
?SLOG(critical, #{
msg => "conflicting_routing_schemas_detected_in_cluster",
detected => ConlictingSchemas,
configured => ConfSchema,
configured_in_cluster => ClusterSchema,
reason =>
"There are records in the routing tables either related to both v1 "
"and v2 storage schemas, or conflicting with storage schema assumed "
"by the cluster. This probably means that some nodes in the cluster "
"use v1 schema and some use v2, independently of each other. The "
"routing is likely broken. Manual intervention and full cluster "
"restart is required. This node will shut down.",
action => mk_conflict_resolution_action(State)
reason => Reason,
action => Action
}),
io:format(
standard_error,
"Error: conflicting routing schemas detected in the cluster.\n~s\n~s\n",
[Reason, Action]
),
error(conflicting_routing_schemas_detected_in_cluster)
end.
@ -783,16 +790,15 @@ mk_conflict_resolution_action(State) ->
"\n 1. Stop listeners on those nodes: `$ emqx eval 'emqx_listener:stop()'`"
"\n 2. Wait until they are safe to restart."
"\n This could take some time, depending on the number of clients and their subscriptions."
"\n Those conditions should be true for each of the nodes in order to proceed:"
"\n The following conditions should be both true for each of the nodes in order to proceed:"
"\n * `$ emqx eval 'ets:info(emqx_subscriber, size)'` prints `0`."
"\n * `$ emqx ctl topics list` prints `No topics.`"
"\n 3. Upgrade the nodes to the latest version."
"\n 3. Upgrade the nodes to version ~s."
"\n 4. Restart the nodes.",
FormatUnkown =
"Additionally, following nodes were unreachable during startup:"
"\n ~p"
"It's strongly advised to include them in the manual resolution procedure as well.",
Message = io_lib:format(Format, [NodesV1]),
"Additionally, the following nodes were unreachable during startup: ~p."
"It is strongly advised to include them in the manual resolution procedure as well.",
Message = io_lib:format(Format, [NodesV1, emqx_release:version_with_prefix()]),
MessageUnknown = [io_lib:format(FormatUnkown, [NodesUnknown]) || NodesUnknown =/= []],
unicode:characters_to_list(Message ++ "\n" ++ MessageUnknown).

View File

@ -1,13 +1,3 @@
Fixed an issue that may occur when performing a rolling upgrade, especially when upgrading from a version earlier than 5.4.0.
When the cluster is empty (more precisely, when the routing tables are empty), the node now additionally asks the cluster nodes for the routing schema in use, in order to make a more informed decision about the routing storage schema upon startup. This should make the routing storage schema less likely to diverge across cluster nodes, especially when the cluster is composed of different versions of EMQX.
In case you get the following message about broken routing during a rolling upgrade: "There are records in the routing tables either related to both v1 and v2 storage schemas, or conflicting with storage schema assumed by the cluster. This probably means that some nodes in the cluster use v1 schema and some use v2, independently of each other. The routing is likely broken. Manual intervention and full cluster restart is required. This node will shut down.", please follow the steps below to resolve the issue.
1. Stop listeners on legacy nodes: `$ emqx eval 'emqx_listener:stop()'`
2. Wait until they are safe to restart.
This could take some time, depending on the number of clients and their subscriptions.
The following conditions should both be true for each of the nodes in order to proceed:
* `$ emqx eval 'ets:info(emqx_subscriber, size)'` prints `0`.
* `$ emqx ctl topics list` prints `No topics.`
3. Upgrade the nodes to the latest version.
4. Restart the nodes.