fix(dsrepl): split shard allocator into a separate module

This commit is contained in:
Andrew Mayorov 2024-03-08 18:48:16 +01:00
parent 1b647035d0
commit ac9700dd28
No known key found for this signature in database
GPG Key ID: 2837C62ACFBFED5D
4 changed files with 203 additions and 131 deletions

View File

@ -21,19 +21,15 @@
-behaviour(supervisor). -behaviour(supervisor).
%% API: %% API:
-export([start_db/2, start_shard/1, start_egress/1, stop_shard/1, ensure_shard/1]). -export([start_db/2, start_shard/1, start_egress/1, stop_shard/1, ensure_shard/1, ensure_egress/1]).
-export([status/1]). -export([which_shards/1]).
%% behaviour callbacks: %% behaviour callbacks:
-export([init/1]). -export([init/1]).
-export([handle_call/3, handle_cast/2, handle_info/2, terminate/2]).
%% internal exports: %% internal exports:
-export([start_link_sup/2]). -export([start_link_sup/2]).
%% FIXME
-export([lookup_shard_meta/2]).
%%================================================================================ %%================================================================================
%% Type declarations %% Type declarations
%%================================================================================ %%================================================================================
@ -41,14 +37,14 @@
-define(via(REC), {via, gproc, {n, l, REC}}). -define(via(REC), {via, gproc, {n, l, REC}}).
-define(db_sup, ?MODULE). -define(db_sup, ?MODULE).
-define(shard_sup, emqx_ds_builtin_db_shard_sup). -define(shards_sup, emqx_ds_builtin_db_shards_sup).
-define(egress_sup, emqx_ds_builtin_db_egress_sup). -define(egress_sup, emqx_ds_builtin_db_egress_sup).
-define(shard_sup, emqx_ds_builtin_db_shard_sup).
-record(?db_sup, {db}). -record(?db_sup, {db}).
-record(?shard_sup, {db}). -record(?shards_sup, {db}).
-record(?egress_sup, {db}). -record(?egress_sup, {db}).
-record(?shard_sup, {db, shard}).
-define(shard_meta(DB, SHARD), {?MODULE, DB, SHARD}).
%%================================================================================ %%================================================================================
%% API funcions %% API funcions
@ -60,8 +56,8 @@ start_db(DB, Opts) ->
-spec start_shard(emqx_ds_storage_layer:shard_id()) -> -spec start_shard(emqx_ds_storage_layer:shard_id()) ->
supervisor:startchild_ret(). supervisor:startchild_ret().
start_shard(Shard = {DB, _}) -> start_shard({DB, Shard}) ->
supervisor:start_child(?via(#?shard_sup{db = DB}), shard_spec(DB, Shard)). supervisor:start_child(?via(#?shards_sup{db = DB}), shard_spec(DB, Shard)).
-spec start_egress(emqx_ds_storage_layer:shard_id()) -> -spec start_egress(emqx_ds_storage_layer:shard_id()) ->
supervisor:startchild_ret(). supervisor:startchild_ret().
@ -70,28 +66,24 @@ start_egress({DB, Shard}) ->
-spec stop_shard(emqx_ds_storage_layer:shard_id()) -> ok | {error, _}. -spec stop_shard(emqx_ds_storage_layer:shard_id()) -> ok | {error, _}.
stop_shard(Shard = {DB, _}) -> stop_shard(Shard = {DB, _}) ->
Sup = ?via(#?shard_sup{db = DB}), Sup = ?via(#?shards_sup{db = DB}),
ok = supervisor:terminate_child(Sup, Shard), ok = supervisor:terminate_child(Sup, Shard),
ok = supervisor:delete_child(Sup, Shard). ok = supervisor:delete_child(Sup, Shard).
-spec ensure_shard(emqx_ds_storage_layer:shard_id()) -> -spec ensure_shard(emqx_ds_storage_layer:shard_id()) ->
ok | {error, _Reason}. ok | {error, _Reason}.
ensure_shard(Shard) -> ensure_shard(Shard) ->
case start_shard(Shard) of ensure_started(start_shard(Shard)).
{ok, _Pid} ->
ok;
{error, {already_started, _Pid}} ->
ok;
{error, Reason} ->
{error, Reason}
end.
status(DB) -> -spec ensure_egress(emqx_ds_storage_layer:shard_id()) ->
State = sys:get_state(?via({allocator, DB})), ok | {error, _Reason}.
maps:get(status, State). ensure_egress(Shard) ->
ensure_started(start_egress(Shard)).
lookup_shard_meta(DB, Shard) -> -spec which_shards(emqx_ds:db()) ->
persistent_term:get(?shard_meta(DB, Shard)). [_Child].
which_shards(DB) ->
supervisor:which_children(?via(#?shards_sup{db = DB})).
%%================================================================================ %%================================================================================
%% behaviour callbacks %% behaviour callbacks
@ -103,7 +95,7 @@ init({#?db_sup{db = DB}, DefaultOpts}) ->
Opts = emqx_ds_replication_layer_meta:open_db(DB, DefaultOpts), Opts = emqx_ds_replication_layer_meta:open_db(DB, DefaultOpts),
ok = start_ra_system(DB, Opts), ok = start_ra_system(DB, Opts),
Children = [ Children = [
sup_spec(#?shard_sup{db = DB}, []), sup_spec(#?shards_sup{db = DB}, []),
sup_spec(#?egress_sup{db = DB}, []), sup_spec(#?egress_sup{db = DB}, []),
shard_allocator_spec(DB, Opts) shard_allocator_spec(DB, Opts)
], ],
@ -113,8 +105,8 @@ init({#?db_sup{db = DB}, DefaultOpts}) ->
period => 1 period => 1
}, },
{ok, {SupFlags, Children}}; {ok, {SupFlags, Children}};
init({#?shard_sup{db = _DB}, _}) -> init({#?shards_sup{db = _DB}, _}) ->
%% Spec for the supervisor that manages the worker processes for %% Spec for the supervisor that manages the supervisors for
%% each local shard of the DB: %% each local shard of the DB:
SupFlags = #{ SupFlags = #{
strategy => one_for_one, strategy => one_for_one,
@ -131,10 +123,18 @@ init({#?egress_sup{db = _DB}, _}) ->
period => 1 period => 1
}, },
{ok, {SupFlags, []}}; {ok, {SupFlags, []}};
init({allocator, DB, Opts}) -> init({#?shard_sup{db = DB, shard = Shard}, _}) ->
_ = erlang:process_flag(trap_exit, true), SupFlags = #{
_ = logger:set_process_metadata(#{db => DB, domain => [ds, db, shard_allocator]}), strategy => rest_for_one,
init_allocator(DB, Opts). intensity => 10,
period => 100
},
Opts = emqx_ds_replication_layer_meta:get_options(DB),
Children = [
shard_storage_spec(DB, Shard, Opts),
shard_replication_spec(DB, Shard, Opts)
],
{ok, {SupFlags, Children}}.
start_ra_system(DB, #{replication_options := ReplicationOpts}) -> start_ra_system(DB, #{replication_options := ReplicationOpts}) ->
DataDir = filename:join([emqx:data_dir(), DB, dsrepl]), DataDir = filename:join([emqx:data_dir(), DB, dsrepl]),
@ -164,25 +164,6 @@ start_ra_system(DB, #{replication_options := ReplicationOpts}) ->
ok ok
end. end.
start_shards(DB, Shards, Opts) ->
SupRef = ?via(#?shard_sup{db = DB}),
lists:foreach(
fun(Shard) ->
{ok, _} = supervisor:start_child(SupRef, shard_spec(DB, Shard, Opts)),
{ok, _} = supervisor:start_child(SupRef, shard_replication_spec(DB, Shard, Opts))
end,
Shards
).
start_egresses(DB, Shards) ->
SupRef = ?via(#?egress_sup{db = DB}),
lists:foreach(
fun(Shard) ->
{ok, _} = supervisor:start_child(SupRef, egress_spec(DB, Shard))
end,
Shards
).
%%================================================================================ %%================================================================================
%% Internal exports %% Internal exports
%%================================================================================ %%================================================================================
@ -203,12 +184,18 @@ sup_spec(Id, Options) ->
}. }.
shard_spec(DB, Shard) -> shard_spec(DB, Shard) ->
shard_spec(DB, Shard, emqx_ds_replication_layer_meta:get_options(DB)). #{
id => {shard, Shard},
start => {?MODULE, start_link_sup, [#?shard_sup{db = DB, shard = Shard}, []]},
shutdown => infinity,
restart => permanent,
type => supervisor
}.
shard_spec(DB, Shard, Options) -> shard_storage_spec(DB, Shard, Opts) ->
#{ #{
id => {Shard, storage}, id => {Shard, storage},
start => {emqx_ds_storage_layer, start_link, [{DB, Shard}, Options]}, start => {emqx_ds_storage_layer, start_link, [{DB, Shard}, Opts]},
shutdown => 5_000, shutdown => 5_000,
restart => permanent, restart => permanent,
type => worker type => worker
@ -225,8 +212,7 @@ shard_replication_spec(DB, Shard, Opts) ->
shard_allocator_spec(DB, Opts) -> shard_allocator_spec(DB, Opts) ->
#{ #{
id => shard_allocator, id => shard_allocator,
start => start => {emqx_ds_replication_shard_allocator, start_link, [DB, Opts]},
{gen_server, start_link, [?via({allocator, DB}), ?MODULE, {allocator, DB, Opts}, []]},
restart => permanent, restart => permanent,
type => worker type => worker
}. }.
@ -240,78 +226,12 @@ egress_spec(DB, Shard) ->
type => worker type => worker
}. }.
%% Allocator ensure_started(Res) ->
case Res of
-define(ALLOCATE_RETRY_TIMEOUT, 1_000). {ok, _Pid} ->
ok;
init_allocator(DB, Opts) -> {error, {already_started, _Pid}} ->
State = #{db => DB, opts => Opts, status => allocating}, ok;
case allocate_shards(State) of
NState = #{} ->
{ok, NState};
{error, Data} ->
_ = logger:notice(
Data#{
msg => "Shard allocation still in progress",
retry_in => ?ALLOCATE_RETRY_TIMEOUT
}
),
{ok, State, ?ALLOCATE_RETRY_TIMEOUT}
end.
handle_call(_Call, _From, State) ->
{reply, ignored, State}.
handle_cast(_Cast, State) ->
{noreply, State}.
handle_info(timeout, State) ->
case allocate_shards(State) of
NState = #{} ->
{noreply, NState};
{error, Data} ->
_ = logger:notice(
Data#{
msg => "Shard allocation still in progress",
retry_in => ?ALLOCATE_RETRY_TIMEOUT
}
),
{noreply, State, ?ALLOCATE_RETRY_TIMEOUT}
end.
terminate(_Reason, #{db := DB, shards := Shards}) ->
%% FIXME
erase_shards_meta(DB, Shards).
%%
allocate_shards(State = #{db := DB, opts := Opts}) ->
case emqx_ds_replication_layer_meta:allocate_shards(DB, Opts) of
{ok, Shards} ->
logger:notice(#{msg => "Shards allocated", shards => Shards}),
ok = save_shards_meta(DB, Shards),
ok = start_shards(DB, emqx_ds_replication_layer_meta:my_shards(DB), Opts),
logger:notice(#{
msg => "Shards started", shards => emqx_ds_replication_layer_meta:my_shards(DB)
}),
ok = start_egresses(DB, Shards),
logger:notice(#{msg => "Egresses started", shards => Shards}),
State#{shards => Shards, status := ready};
{error, Reason} -> {error, Reason} ->
{error, Reason} {error, Reason}
end. end.
save_shards_meta(DB, Shards) ->
lists:foreach(fun(Shard) -> save_shard_meta(DB, Shard) end, Shards).
save_shard_meta(DB, Shard) ->
Servers = emqx_ds_replication_layer_shard:shard_servers(DB, Shard),
persistent_term:put(?shard_meta(DB, Shard), #{
servers => Servers
}).
erase_shards_meta(DB, Shards) ->
lists:foreach(fun(Shard) -> erase_shard_meta(DB, Shard) end, Shards).
erase_shard_meta(DB, Shard) ->
persistent_term:erase(?shard_meta(DB, Shard)).

View File

@ -332,7 +332,7 @@ delete_next(DB, Iter0, Selector, BatchSize) ->
-spec shard_of_message(emqx_ds:db(), emqx_types:message(), clientid | topic) -> -spec shard_of_message(emqx_ds:db(), emqx_types:message(), clientid | topic) ->
emqx_ds_replication_layer:shard_id(). emqx_ds_replication_layer:shard_id().
shard_of_message(DB, #message{from = From, topic = Topic}, SerializeBy) -> shard_of_message(DB, #message{from = From, topic = Topic}, SerializeBy) ->
N = emqx_ds_replication_layer_meta:n_shards(DB), N = emqx_ds_replication_shard_allocator:n_shards(DB),
Hash = Hash =
case SerializeBy of case SerializeBy of
clientid -> erlang:phash2(From, N); clientid -> erlang:phash2(From, N);

View File

@ -127,7 +127,7 @@ get_local_server(DB, Shard) ->
?MEMOIZE(DB, Shard, local_server(DB, Shard)). ?MEMOIZE(DB, Shard, local_server(DB, Shard)).
get_shard_servers(DB, Shard) -> get_shard_servers(DB, Shard) ->
maps:get(servers, emqx_ds_builtin_db_sup:lookup_shard_meta(DB, Shard)). maps:get(servers, emqx_ds_replication_shard_allocator:shard_meta(DB, Shard)).
%% %%

View File

@ -0,0 +1,152 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
%% Shard allocator for a DS database.
%%
%% A gen_server that asks `emqx_ds_replication_layer_meta' to allocate the
%% shards of a DB, starts the shards reported by `my_shards/1' and the egress
%% processes through `emqx_ds_builtin_db_sup', and then publishes DB-level and
%% shard-level metadata in `persistent_term' so that readers can look it up
%% without messaging this process. Allocation is retried on a timer until it
%% succeeds.
-module(emqx_ds_replication_shard_allocator).
%% API: process start.
-export([start_link/2]).
%% API: metadata lookups backed by `persistent_term'.
-export([n_shards/1]).
-export([shard_meta/2]).
-behaviour(gen_server).
%% gen_server callbacks.
-export([
init/1,
handle_call/3,
handle_cast/2,
handle_info/2,
terminate/2
]).
%% `persistent_term' key under which the DB-level metadata map is stored.
-define(db_meta(DB), {?MODULE, DB}).
%% `persistent_term' key under which the metadata map of one shard is stored.
-define(shard_meta(DB, SHARD), {?MODULE, DB, SHARD}).
%%
%% Start the allocator for `DB' and link it to the calling process.
%% The process is not registered under a name; `{DB, Opts}' is handed to
%% `init/1' as its sole argument.
start_link(DB, Opts) ->
    InitArg = {DB, Opts},
    gen_server:start_link(?MODULE, InitArg, []).
%% Return the number of shards recorded in the DB-level metadata.
%% `persistent_term:get/1' raises `badarg' when no metadata has been stored
%% for `DB' (i.e. before `save_db_meta/2' ran or after it was erased).
n_shards(DB) ->
    maps:get(n_shards, persistent_term:get(?db_meta(DB))).
%% Return the metadata map stored for `Shard' of `DB'.
%% Raises `badarg' (from `persistent_term:get/1') when nothing is stored
%% under that key.
shard_meta(DB, Shard) ->
    Key = ?shard_meta(DB, Shard),
    persistent_term:get(Key).
%%
%% Delay, in milliseconds, before shard allocation is attempted again.
%% Used as the gen_server timeout returned from `init/1' and `handle_info/2'.
-define(ALLOCATE_RETRY_TIMEOUT, 1_000).
%% gen_server callback.
%% Traps exits, tags the process' log metadata with the DB, and makes a first
%% attempt at shard allocation. When the attempt fails, the server still
%% starts (in `allocating' status) and schedules a retry through the
%% gen_server timeout.
init({DB, Opts}) ->
    _ = erlang:process_flag(trap_exit, true),
    _ = logger:set_process_metadata(#{db => DB, domain => [ds, db, shard_allocator]}),
    Initial = #{db => DB, opts => Opts, status => allocating},
    case allocate_shards(Initial) of
        {error, Details} ->
            %% `Details' is extended with map-update syntax, so it must be a map.
            RetryIn = ?ALLOCATE_RETRY_TIMEOUT,
            _ = logger:notice(
                Details#{
                    msg => "Shard allocation still in progress",
                    retry_in => RetryIn
                }
            ),
            {ok, Initial, RetryIn};
        #{} = Allocated ->
            {ok, Allocated}
    end.
%% gen_server callback. No calls are supported: every request is answered
%% with `ignored'.
%%
%% While allocation is still pending the retry is driven by the gen_server
%% timeout, and that timeout is cancelled as soon as any request arrives.
%% The first clause therefore re-arms it; without this a single stray call
%% would stop allocation from ever being retried.
handle_call(_Call, _From, State = #{status := allocating}) ->
    {reply, ignored, State, ?ALLOCATE_RETRY_TIMEOUT};
handle_call(_Call, _From, State) ->
    {reply, ignored, State}.
%% gen_server callback. No casts are supported: every message is dropped.
%%
%% A cast received while allocation is pending cancels the gen_server
%% timeout that drives the retry, so the first clause re-arms it to keep
%% the retry loop alive.
handle_cast(_Cast, State = #{status := allocating}) ->
    {noreply, State, ?ALLOCATE_RETRY_TIMEOUT};
handle_cast(_Cast, State) ->
    {noreply, State}.
%% gen_server callback.
%%
%% `timeout' is the retry tick: allocation is attempted again and, on
%% failure, the timeout is re-armed.
%%
%% Any other message is ignored instead of crashing the server with
%% `function_clause' (the process traps exits, so stray `EXIT' messages can
%% end up here). Because an incoming message cancels the pending gen_server
%% timeout, the retry timeout is re-armed while allocation is unfinished.
handle_info(timeout, State) ->
    case allocate_shards(State) of
        NState = #{} ->
            {noreply, NState};
        {error, Data} ->
            _ = logger:notice(
                Data#{
                    msg => "Shard allocation still in progress",
                    retry_in => ?ALLOCATE_RETRY_TIMEOUT
                }
            ),
            {noreply, State, ?ALLOCATE_RETRY_TIMEOUT}
    end;
handle_info(_Info, State = #{status := allocating}) ->
    {noreply, State, ?ALLOCATE_RETRY_TIMEOUT};
handle_info(_Info, State) ->
    {noreply, State}.
%% gen_server callback.
%% When shards were allocated (the state carries a `shards' key), remove the
%% DB-level and per-shard metadata from `persistent_term'. A state without
%% `shards' means nothing was published, so there is nothing to clean up.
terminate(_Reason, #{db := DB, shards := Allocated}) ->
    _ = erase_db_meta(DB),
    erase_shards_meta(DB, Allocated);
terminate(_Reason, #{}) ->
    ok.
%%
%% Attempt to allocate the shards of the DB.
%% On success: start the shards returned by `my_shards/1', start an egress
%% for every allocated shard, publish DB and shard metadata, and return the
%% state with `shards' set and `status' switched to `ready'.
%% On failure: return the `{error, Reason}' tuple unchanged so the caller
%% can schedule a retry.
allocate_shards(State = #{db := DB, opts := Opts}) ->
    case emqx_ds_replication_layer_meta:allocate_shards(DB, Opts) of
        {error, _Reason} = Error ->
            Error;
        {ok, Allocated} ->
            logger:notice(#{msg => "Shards allocated", shards => Allocated}),
            MyShards = emqx_ds_replication_layer_meta:my_shards(DB),
            ok = start_shards(DB, MyShards),
            ok = start_egresses(DB, Allocated),
            %% Metadata is published only after the processes are started.
            ok = save_db_meta(DB, Allocated),
            ok = save_shards_meta(DB, Allocated),
            State#{shards => Allocated, status := ready}
    end.
%% Ensure a shard process tree is running for each shard in `Shards'.
%% Crashes with `badmatch' on the first shard that fails to start.
start_shards(DB, Shards) ->
    EnsureShard = fun(ShardId) ->
        ok = emqx_ds_builtin_db_sup:ensure_shard({DB, ShardId})
    end,
    ok = lists:foreach(EnsureShard, Shards),
    ok = logger:info(#{msg => "Shards started", shards => Shards}),
    ok.
%% Ensure an egress process is running for each shard in `Shards'.
%% Crashes with `badmatch' on the first egress that fails to start.
start_egresses(DB, Shards) ->
    EnsureEgress = fun(ShardId) ->
        ok = emqx_ds_builtin_db_sup:ensure_egress({DB, ShardId})
    end,
    ok = lists:foreach(EnsureEgress, Shards),
    logger:info(#{msg => "Egresses started", shards => Shards}),
    ok.
%% Publish the DB-level metadata (the shard list and its length) in
%% `persistent_term'. Returns `ok'.
save_db_meta(DB, Shards) ->
    Meta = #{
        shards => Shards,
        n_shards => length(Shards)
    },
    persistent_term:put(?db_meta(DB), Meta).
%% Publish the metadata of every shard in `Shards'. Returns `ok'.
save_shards_meta(DB, Shards) ->
    SaveOne = fun(ShardId) -> save_shard_meta(DB, ShardId) end,
    lists:foreach(SaveOne, Shards).
%% Publish the metadata of a single shard in `persistent_term': a map whose
%% `servers' key holds the result of
%% `emqx_ds_replication_layer_shard:shard_servers/2'. Returns `ok'.
save_shard_meta(DB, Shard) ->
    Meta = #{servers => emqx_ds_replication_layer_shard:shard_servers(DB, Shard)},
    persistent_term:put(?shard_meta(DB, Shard), Meta).
%% Remove the DB-level metadata from `persistent_term'.
%% Returns the boolean produced by `persistent_term:erase/1'.
erase_db_meta(DB) ->
    Key = ?db_meta(DB),
    persistent_term:erase(Key).
%% Remove the metadata of every shard in `Shards'. Returns `ok'.
erase_shards_meta(DB, Shards) ->
    EraseOne = fun(ShardId) -> erase_shard_meta(DB, ShardId) end,
    lists:foreach(EraseOne, Shards).
%% Remove the metadata of a single shard from `persistent_term'.
%% Returns the boolean produced by `persistent_term:erase/1'.
erase_shard_meta(DB, Shard) ->
    Key = ?shard_meta(DB, Shard),
    persistent_term:erase(Key).