fix(ds): Perform read operations on the leader.
This commit is contained in:
parent
4580906405
commit
c6fc76e335
|
@ -124,6 +124,8 @@ The following application environment variables are available:
|
||||||
|
|
||||||
- `emqx_durable_storage.egress_flush_interval`: period at which the batches of messages are committed to the durable storage.
|
- `emqx_durable_storage.egress_flush_interval`: period at which the batches of messages are committed to the durable storage.
|
||||||
|
|
||||||
|
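- `emqx_durable_storage.reads`: `leader_preferred` | `local_preferred`. Selects which shard replica serves read operations: the last known leader (default) or the local replica, if one exists.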
|
||||||
|
|
||||||
Runtime settings for the durable storages can be modified via CLI as well as the REST API.
|
Runtime settings for the durable storages can be modified via CLI as well as the REST API.
|
||||||
The following CLI commands are available:
|
The following CLI commands are available:
|
||||||
|
|
||||||
|
|
|
@ -561,12 +561,27 @@ list_nodes() ->
|
||||||
%% Too large for normal operation, need better backpressure mechanism.
|
%% Too large for normal operation, need better backpressure mechanism.
|
||||||
-define(RA_TIMEOUT, 60 * 1000).
|
-define(RA_TIMEOUT, 60 * 1000).
|
||||||
|
|
||||||
-define(SAFERPC(EXPR),
|
-define(SAFE_ERPC(EXPR),
|
||||||
try
|
try
|
||||||
EXPR
|
EXPR
|
||||||
catch
|
catch
|
||||||
error:RPCError = {erpc, _} ->
|
error:RPCError__ = {erpc, _} ->
|
||||||
{error, recoverable, RPCError}
|
{error, recoverable, RPCError__}
|
||||||
|
end
|
||||||
|
).
|
||||||
|
|
||||||
|
-define(SHARD_RPC(DB, SHARD, NODE, BODY),
|
||||||
|
case
|
||||||
|
emqx_ds_replication_layer_shard:servers(
|
||||||
|
DB, SHARD, application:get_env(emqx_durable_storage, reads, leader_preferred)
|
||||||
|
)
|
||||||
|
of
|
||||||
|
[{_, NODE} | _] ->
|
||||||
|
begin
|
||||||
|
BODY
|
||||||
|
end;
|
||||||
|
[] ->
|
||||||
|
{error, recoverable, replica_offline}
|
||||||
end
|
end
|
||||||
).
|
).
|
||||||
|
|
||||||
|
@ -623,44 +638,79 @@ ra_drop_generation(DB, Shard, GenId) ->
|
||||||
end.
|
end.
|
||||||
|
|
||||||
ra_get_streams(DB, Shard, TopicFilter, Time) ->
|
ra_get_streams(DB, Shard, TopicFilter, Time) ->
|
||||||
{_, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred),
|
|
||||||
TimestampUs = timestamp_to_timeus(Time),
|
TimestampUs = timestamp_to_timeus(Time),
|
||||||
?SAFERPC(emqx_ds_proto_v4:get_streams(Node, DB, Shard, TopicFilter, TimestampUs)).
|
?SHARD_RPC(
|
||||||
|
DB,
|
||||||
|
Shard,
|
||||||
|
Node,
|
||||||
|
?SAFE_ERPC(emqx_ds_proto_v4:get_streams(Node, DB, Shard, TopicFilter, TimestampUs))
|
||||||
|
).
|
||||||
|
|
||||||
ra_get_delete_streams(DB, Shard, TopicFilter, Time) ->
|
ra_get_delete_streams(DB, Shard, TopicFilter, Time) ->
|
||||||
{_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred),
|
?SHARD_RPC(
|
||||||
?SAFERPC(emqx_ds_proto_v4:get_delete_streams(Node, DB, Shard, TopicFilter, Time)).
|
DB,
|
||||||
|
Shard,
|
||||||
|
Node,
|
||||||
|
?SAFE_ERPC(emqx_ds_proto_v4:get_delete_streams(Node, DB, Shard, TopicFilter, Time))
|
||||||
|
).
|
||||||
|
|
||||||
ra_make_iterator(DB, Shard, Stream, TopicFilter, StartTime) ->
|
ra_make_iterator(DB, Shard, Stream, TopicFilter, StartTime) ->
|
||||||
{_, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred),
|
|
||||||
TimeUs = timestamp_to_timeus(StartTime),
|
TimeUs = timestamp_to_timeus(StartTime),
|
||||||
?SAFERPC(emqx_ds_proto_v4:make_iterator(Node, DB, Shard, Stream, TopicFilter, TimeUs)).
|
?SHARD_RPC(
|
||||||
|
DB,
|
||||||
|
Shard,
|
||||||
|
Node,
|
||||||
|
?SAFE_ERPC(emqx_ds_proto_v4:make_iterator(Node, DB, Shard, Stream, TopicFilter, TimeUs))
|
||||||
|
).
|
||||||
|
|
||||||
ra_make_delete_iterator(DB, Shard, Stream, TopicFilter, StartTime) ->
|
ra_make_delete_iterator(DB, Shard, Stream, TopicFilter, StartTime) ->
|
||||||
{_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred),
|
|
||||||
TimeUs = timestamp_to_timeus(StartTime),
|
TimeUs = timestamp_to_timeus(StartTime),
|
||||||
?SAFERPC(emqx_ds_proto_v4:make_delete_iterator(Node, DB, Shard, Stream, TopicFilter, TimeUs)).
|
?SHARD_RPC(
|
||||||
|
DB,
|
||||||
|
Shard,
|
||||||
|
Node,
|
||||||
|
?SAFE_ERPC(
|
||||||
|
emqx_ds_proto_v4:make_delete_iterator(Node, DB, Shard, Stream, TopicFilter, TimeUs)
|
||||||
|
)
|
||||||
|
).
|
||||||
|
|
||||||
ra_update_iterator(DB, Shard, Iter, DSKey) ->
|
ra_update_iterator(DB, Shard, Iter, DSKey) ->
|
||||||
{_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred),
|
?SHARD_RPC(
|
||||||
?SAFERPC(emqx_ds_proto_v4:update_iterator(Node, DB, Shard, Iter, DSKey)).
|
DB,
|
||||||
|
Shard,
|
||||||
|
Node,
|
||||||
|
?SAFE_ERPC(emqx_ds_proto_v4:update_iterator(Node, DB, Shard, Iter, DSKey))
|
||||||
|
).
|
||||||
|
|
||||||
ra_next(DB, Shard, Iter, BatchSize) ->
|
ra_next(DB, Shard, Iter, BatchSize) ->
|
||||||
{_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred),
|
?SHARD_RPC(
|
||||||
case emqx_ds_proto_v4:next(Node, DB, Shard, Iter, BatchSize) of
|
DB,
|
||||||
RPCError = {badrpc, _} ->
|
Shard,
|
||||||
{error, recoverable, RPCError};
|
Node,
|
||||||
Other ->
|
case emqx_ds_proto_v4:next(Node, DB, Shard, Iter, BatchSize) of
|
||||||
Other
|
Err = {badrpc, _} ->
|
||||||
end.
|
{error, recoverable, Err};
|
||||||
|
Ret ->
|
||||||
|
Ret
|
||||||
|
end
|
||||||
|
).
|
||||||
|
|
||||||
ra_delete_next(DB, Shard, Iter, Selector, BatchSize) ->
|
ra_delete_next(DB, Shard, Iter, Selector, BatchSize) ->
|
||||||
{_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred),
|
?SHARD_RPC(
|
||||||
emqx_ds_proto_v4:delete_next(Node, DB, Shard, Iter, Selector, BatchSize).
|
DB,
|
||||||
|
Shard,
|
||||||
|
Node,
|
||||||
|
?SAFE_ERPC(emqx_ds_proto_v4:delete_next(Node, DB, Shard, Iter, Selector, BatchSize))
|
||||||
|
).
|
||||||
|
|
||||||
ra_list_generations_with_lifetimes(DB, Shard) ->
|
ra_list_generations_with_lifetimes(DB, Shard) ->
|
||||||
{_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred),
|
Reply = ?SHARD_RPC(
|
||||||
case ?SAFERPC(emqx_ds_proto_v4:list_generations_with_lifetimes(Node, DB, Shard)) of
|
DB,
|
||||||
|
Shard,
|
||||||
|
Node,
|
||||||
|
?SAFE_ERPC(emqx_ds_proto_v4:list_generations_with_lifetimes(Node, DB, Shard))
|
||||||
|
),
|
||||||
|
case Reply of
|
||||||
Gens = #{} ->
|
Gens = #{} ->
|
||||||
maps:map(
|
maps:map(
|
||||||
fun(_GenId, Data = #{since := Since, until := Until}) ->
|
fun(_GenId, Data = #{since := Since, until := Until}) ->
|
||||||
|
|
|
@ -28,8 +28,7 @@
|
||||||
|
|
||||||
%% Dynamic server location API
|
%% Dynamic server location API
|
||||||
-export([
|
-export([
|
||||||
servers/3,
|
servers/3
|
||||||
server/3
|
|
||||||
]).
|
]).
|
||||||
|
|
||||||
%% Membership
|
%% Membership
|
||||||
|
@ -83,16 +82,15 @@ server_name(DB, Shard, Site) ->
|
||||||
|
|
||||||
%%
|
%%
|
||||||
|
|
||||||
-spec servers(emqx_ds:db(), emqx_ds_replication_layer:shard_id(), Order) -> [server(), ...] when
|
-spec servers(emqx_ds:db(), emqx_ds_replication_layer:shard_id(), Order) -> [server()] when
|
||||||
Order :: leader_preferred | undefined.
|
Order :: leader_preferred | local_preferred | undefined.
|
||||||
servers(DB, Shard, _Order = leader_preferred) ->
|
servers(DB, Shard, leader_preferred) ->
|
||||||
get_servers_leader_preferred(DB, Shard);
|
get_servers_leader_preferred(DB, Shard);
|
||||||
|
servers(DB, Shard, local_preferred) ->
|
||||||
|
get_servers_local_preferred(DB, Shard);
|
||||||
servers(DB, Shard, _Order = undefined) ->
|
servers(DB, Shard, _Order = undefined) ->
|
||||||
get_shard_servers(DB, Shard).
|
get_shard_servers(DB, Shard).
|
||||||
|
|
||||||
server(DB, Shard, _Which = local_preferred) ->
|
|
||||||
get_server_local_preferred(DB, Shard).
|
|
||||||
|
|
||||||
get_servers_leader_preferred(DB, Shard) ->
|
get_servers_leader_preferred(DB, Shard) ->
|
||||||
%% NOTE: Contact last known leader first, then rest of shard servers.
|
%% NOTE: Contact last known leader first, then rest of shard servers.
|
||||||
ClusterName = get_cluster_name(DB, Shard),
|
ClusterName = get_cluster_name(DB, Shard),
|
||||||
|
@ -104,17 +102,24 @@ get_servers_leader_preferred(DB, Shard) ->
|
||||||
get_online_servers(DB, Shard)
|
get_online_servers(DB, Shard)
|
||||||
end.
|
end.
|
||||||
|
|
||||||
get_server_local_preferred(DB, Shard) ->
|
get_servers_local_preferred(DB, Shard) ->
|
||||||
%% NOTE: Contact either local server or a random replica.
|
%% Return the list of servers, where the local replica (if it exists) is
|
||||||
|
%% the first element. Note: result is _NOT_ shuffled. This can be
|
||||||
|
%% bad for the load balancing, but it makes results more
|
||||||
|
%% deterministic. A caller that doesn't care about that can shuffle
|
||||||
|
%% the results by itself.
|
||||||
ClusterName = get_cluster_name(DB, Shard),
|
ClusterName = get_cluster_name(DB, Shard),
|
||||||
case ra_leaderboard:lookup_members(ClusterName) of
|
case ra_leaderboard:lookup_members(ClusterName) of
|
||||||
Servers when is_list(Servers) ->
|
|
||||||
pick_local(Servers);
|
|
||||||
undefined ->
|
undefined ->
|
||||||
%% TODO
|
Servers = get_online_servers(DB, Shard);
|
||||||
%% Leader is unknown if there are no servers of this group on the
|
Servers when is_list(Servers) ->
|
||||||
%% local node. We want to pick a replica in that case as well.
|
ok
|
||||||
pick_random(get_online_servers(DB, Shard))
|
end,
|
||||||
|
case lists:keyfind(node(), 2, Servers) of
|
||||||
|
false ->
|
||||||
|
Servers;
|
||||||
|
Local when is_tuple(Local) ->
|
||||||
|
[Local | lists:delete(Local, Servers)]
|
||||||
end.
|
end.
|
||||||
|
|
||||||
lookup_leader(DB, Shard) ->
|
lookup_leader(DB, Shard) ->
|
||||||
|
@ -139,17 +144,6 @@ filter_online(Servers) ->
|
||||||
is_server_online({_Name, Node}) ->
|
is_server_online({_Name, Node}) ->
|
||||||
Node == node() orelse lists:member(Node, nodes()).
|
Node == node() orelse lists:member(Node, nodes()).
|
||||||
|
|
||||||
pick_local(Servers) ->
|
|
||||||
case lists:keyfind(node(), 2, Servers) of
|
|
||||||
Local when is_tuple(Local) ->
|
|
||||||
Local;
|
|
||||||
false ->
|
|
||||||
pick_random(Servers)
|
|
||||||
end.
|
|
||||||
|
|
||||||
pick_random(Servers) ->
|
|
||||||
lists:nth(rand:uniform(length(Servers)), Servers).
|
|
||||||
|
|
||||||
get_cluster_name(DB, Shard) ->
|
get_cluster_name(DB, Shard) ->
|
||||||
memoize(fun cluster_name/2, [DB, Shard]).
|
memoize(fun cluster_name/2, [DB, Shard]).
|
||||||
|
|
||||||
|
|
|
@ -479,11 +479,13 @@ t_rebalance_offline_restarts(Config) ->
|
||||||
%%
|
%%
|
||||||
|
|
||||||
shard_server_info(Node, DB, Shard, Site, Info) ->
|
shard_server_info(Node, DB, Shard, Site, Info) ->
|
||||||
Server = shard_server(Node, DB, Shard, Site),
|
?ON(
|
||||||
{Server, ds_repl_shard(Node, server_info, [Info, Server])}.
|
Node,
|
||||||
|
begin
|
||||||
shard_server(Node, DB, Shard, Site) ->
|
Server = emqx_ds_replication_layer_shard:shard_server(DB, Shard, Site),
|
||||||
ds_repl_shard(Node, shard_server, [DB, Shard, Site]).
|
{Server, emqx_ds_replication_layer_shard:server_info(Info, Server)}
|
||||||
|
end
|
||||||
|
).
|
||||||
|
|
||||||
ds_repl_meta(Node, Fun) ->
|
ds_repl_meta(Node, Fun) ->
|
||||||
ds_repl_meta(Node, Fun, []).
|
ds_repl_meta(Node, Fun, []).
|
||||||
|
@ -499,9 +501,6 @@ ds_repl_meta(Node, Fun, Args) ->
|
||||||
error(meta_op_failed)
|
error(meta_op_failed)
|
||||||
end.
|
end.
|
||||||
|
|
||||||
ds_repl_shard(Node, Fun, Args) ->
|
|
||||||
erpc:call(Node, emqx_ds_replication_layer_shard, Fun, Args).
|
|
||||||
|
|
||||||
shards(Node, DB) ->
|
shards(Node, DB) ->
|
||||||
erpc:call(Node, emqx_ds_replication_layer_meta, shards, [DB]).
|
erpc:call(Node, emqx_ds_replication_layer_meta, shards, [DB]).
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue