fix(sso): support for SSL update && ensure update is atomic

1. support updating SSL key and cert files
2. increase connection timeout
3. ensure the update is atomic, so that everything stays consistent
This commit is contained in:
firest 2023-09-27 15:48:11 +08:00
parent 1f8985d09e
commit b2699c687b
3 changed files with 123 additions and 29 deletions

View File

@ -43,7 +43,7 @@
-define(MOD_TAB, emqx_dashboard_sso). -define(MOD_TAB, emqx_dashboard_sso).
-define(MOD_KEY_PATH, [dashboard, sso]). -define(MOD_KEY_PATH, [dashboard, sso]).
-define(CALL_TIMEOUT, timer:seconds(10)). -define(CALL_TIMEOUT, timer:seconds(15)).
-define(MOD_KEY_PATH(Sub), [dashboard, sso, Sub]). -define(MOD_KEY_PATH(Sub), [dashboard, sso, Sub]).
-define(RESOURCE_GROUP, <<"emqx_dashboard_sso">>). -define(RESOURCE_GROUP, <<"emqx_dashboard_sso">>).
-define(DEFAULT_RESOURCE_OPTS, #{ -define(DEFAULT_RESOURCE_OPTS, #{
@ -98,7 +98,7 @@ create_resource(ResourceId, Module, Config) ->
Config, Config,
?DEFAULT_RESOURCE_OPTS ?DEFAULT_RESOURCE_OPTS
), ),
start_resource_if_enabled(ResourceId, Result, Config, fun clean_when_start_failed/1). start_resource_if_enabled(ResourceId, Result, Config).
update_resource(ResourceId, Module, Config) -> update_resource(ResourceId, Module, Config) ->
Result = emqx_resource:recreate_local( Result = emqx_resource:recreate_local(
@ -133,8 +133,8 @@ init([]) ->
start_backend_services(), start_backend_services(),
{ok, #{}}. {ok, #{}}.
handle_call({update_config, Req, NewConf}, _From, State) -> handle_call({update_config, Req, NewConf, OldConf}, _From, State) ->
Result = on_config_update(Req, NewConf), Result = on_config_update(Req, NewConf, OldConf),
{reply, Result, State}; {reply, Result, State};
handle_call(_Request, _From, State) -> handle_call(_Request, _From, State) ->
Reply = ok, Reply = ok,
@ -175,7 +175,7 @@ start_backend_services() ->
?SLOG(error, #{ ?SLOG(error, #{
msg => "start_sso_backend_failed", msg => "start_sso_backend_failed",
backend => Backend, backend => Backend,
reason => Reason reason => emqx_utils:redact(Reason)
}) })
end end
end, end,
@ -189,17 +189,18 @@ update_config(Backend, UpdateReq) ->
?SLOG(info, #{ ?SLOG(info, #{
msg => "update_sso_successfully", msg => "update_sso_successfully",
backend => Backend, backend => Backend,
result => Result result => emqx_utils:redact(Result)
}), }),
Result; Result;
{error, Reason} -> {error, Reason} ->
SafeReason = emqx_utils:redact(Reason),
?SLOG(error, #{ ?SLOG(error, #{
msg => "update_sso_failed", msg => "update_sso_failed",
backend => Backend, backend => Backend,
reason => Reason reason => SafeReason
}), }),
{error, {error,
case Reason of case SafeReason of
{_Stage, _Mod, Reason2} -> {_Stage, _Mod, Reason2} ->
Reason2; Reason2;
_ -> _ ->
@ -208,14 +209,14 @@ update_config(Backend, UpdateReq) ->
end. end.
pre_config_update(_, {update, _Backend, Config}, _OldConf) -> pre_config_update(_, {update, _Backend, Config}, _OldConf) ->
{ok, Config}; maybe_write_certs(Config);
pre_config_update(_, {delete, _Backend}, undefined) -> pre_config_update(_, {delete, _Backend}, undefined) ->
throw(not_exists); throw(not_exists);
pre_config_update(_, {delete, _Backend}, _OldConf) -> pre_config_update(_, {delete, _Backend}, _OldConf) ->
{ok, null}. {ok, null}.
post_config_update(_, UpdateReq, NewConf, _OldConf, _AppEnvs) -> post_config_update(_, UpdateReq, NewConf, OldConf, _AppEnvs) ->
call({update_config, UpdateReq, NewConf}). call({update_config, UpdateReq, NewConf, OldConf}).
propagated_post_config_update( propagated_post_config_update(
?MOD_KEY_PATH(BackendBin) = Path, _UpdateReq, undefined, OldConf, AppEnvs ?MOD_KEY_PATH(BackendBin) = Path, _UpdateReq, undefined, OldConf, AppEnvs
@ -236,7 +237,7 @@ propagated_post_config_update(
Error Error
end. end.
on_config_update({update, Backend, _RawConfig}, Config) -> on_config_update({update, Backend, _RawConfig}, Config, OldConfig) ->
Provider = provider(Backend), Provider = provider(Backend),
case lookup(Backend) of case lookup(Backend) of
undefined -> undefined ->
@ -247,14 +248,27 @@ on_config_update({update, Backend, _RawConfig}, Config) ->
end end
); );
Data -> Data ->
%% the steps for updating/recreating a resource are:
%% 1. destroy the old resource
%% 2. create a new resource
%% to keep consistency we need to follow those steps too,
%% however a failed update will not change the config, but will lose the resource
%% hence for consistency and atomicity, we should rollback when the update fails
ets:delete(?MOD_TAB, Backend),
UpdateState = fun(State) ->
ets:insert(?MOD_TAB, Data#?MOD_TAB{state = State})
end,
on_backend_updated( on_backend_updated(
emqx_dashboard_sso:update(Provider, Config, Data#?MOD_TAB.state), emqx_dashboard_sso:update(Provider, Config, Data#?MOD_TAB.state),
fun(State) -> UpdateState,
ets:insert(?MOD_TAB, Data#?MOD_TAB{state = State}) rollback(
end Backend,
OldConfig,
UpdateState
)
) )
end; end;
on_config_update({delete, Backend}, _NewConf) -> on_config_update({delete, Backend}, _NewConf, _OldConf) ->
case lookup(Backend) of case lookup(Backend) of
undefined -> undefined ->
{error, not_exists}; {error, not_exists};
@ -276,12 +290,7 @@ lookup(Backend) ->
undefined undefined
end. end.
start_resource_if_enabled(ResourceId, Result, Config) -> start_resource_if_enabled(ResourceId, {ok, _} = Result, #{enable := true}) ->
start_resource_if_enabled(ResourceId, Result, Config, undefined).
start_resource_if_enabled(
ResourceId, {ok, _} = Result, #{enable := true}, CleanWhenStartFailed
) ->
case emqx_resource:start(ResourceId) of case emqx_resource:start(ResourceId) of
ok -> ok ->
Result; Result;
@ -292,11 +301,10 @@ start_resource_if_enabled(
resource_id => ResourceId, resource_id => ResourceId,
reason => SafeReason reason => SafeReason
}), }),
erlang:is_function(CleanWhenStartFailed) andalso clean_when_start_failed(ResourceId),
CleanWhenStartFailed(ResourceId),
{error, emqx_dashboard_sso:format(["Start backend failed, Reason: ", SafeReason])} {error, emqx_dashboard_sso:format(["Start backend failed, Reason: ", SafeReason])}
end; end;
start_resource_if_enabled(_ResourceId, Result, _Config, _) -> start_resource_if_enabled(_ResourceId, Result, _Config) ->
Result. Result.
%% ensure the backend creation is atomic, clean the corresponding resource when necessary, %% ensure the backend creation is atomic, clean the corresponding resource when necessary,
@ -309,14 +317,18 @@ clean_when_start_failed(ResourceId) ->
_ = emqx_resource:remove_local(ResourceId), _ = emqx_resource:remove_local(ResourceId),
ok. ok.
on_backend_updated(Result, OkFun) ->
on_backend_updated(Result, OkFun, undefined).
%% this first level `ok` is for emqx_config_handler, and the second level is for the caller %% this first level `ok` is for emqx_config_handler, and the second level is for the caller
on_backend_updated({ok, State} = Ok, Fun) -> on_backend_updated({ok, State} = Ok, Fun, _ErrFun) ->
Fun(State), Fun(State),
{ok, Ok}; {ok, Ok};
on_backend_updated(ok, Fun) -> on_backend_updated(ok, Fun, _ErrFun) ->
Fun(), Fun(),
{ok, ok}; {ok, ok};
on_backend_updated(Error, _) -> on_backend_updated(Error, _, ErrFun) ->
erlang:is_function(ErrFun) andalso ErrFun(Error),
Error. Error.
bin(A) when is_atom(A) -> atom_to_binary(A, utf8); bin(A) when is_atom(A) -> atom_to_binary(A, utf8);
@ -331,3 +343,53 @@ add_handler() ->
remove_handler() -> remove_handler() ->
ok = emqx_conf:remove_handler(?MOD_KEY_PATH('?')). ok = emqx_conf:remove_handler(?MOD_KEY_PATH('?')).
%% Runs the optional `<<"ssl">>' section of a raw backend config through
%% `emqx_tls_lib:ensure_ssl_files/2', using a per-backend directory.
%% Returns `{ok, Conf}' where the `<<"ssl">>' entry is replaced by the value the
%% TLS library hands back (the config is returned untouched when that value is
%% `undefined'). On failure the error is logged and `{bad_ssl_config, Reason}'
%% is thrown.
%% NOTE(review): presumably `ensure_ssl_files/2' persists key/cert content to
%% files under the given directory and reports errors as maps -- confirm
%% against emqx_tls_lib.
maybe_write_certs(#{<<"backend">> := Backend} = Conf) ->
    CertDir = ssl_file_path(Backend),
    RawSSL = maps:get(<<"ssl">>, Conf, undefined),
    case emqx_tls_lib:ensure_ssl_files(CertDir, RawSSL) of
        {error, Reason} ->
            ?SLOG(error, Reason#{msg => "bad_ssl_config"}),
            throw({bad_ssl_config, Reason});
        {ok, SSL} ->
            {ok, new_ssl_source(Conf, SSL)}
    end.
%% Relative directory used for the SSL files of one SSO backend:
%% the backend name joined under "sso".
ssl_file_path(Backend) ->
    Segments = ["sso", Backend],
    filename:join(Segments).
%% Stores the processed SSL options under the `<<"ssl">>' key of the config.
%% When there are no SSL options (`undefined') the config is returned unchanged.
new_ssl_source(Conf, undefined) -> Conf;
new_ssl_source(Conf, SSLOpts) -> Conf#{<<"ssl">> => SSLOpts}.
%% Builds the error callback used when a backend update fails: the returned
%% fun ignores its argument and tries to recreate the backend from `OldConf',
%% invoking `OnSucc' if that recreation succeeds.
rollback(Backend, OldConf, OnSucc) ->
    fun(_FailedUpdate) -> try_recreate(Backend, OldConf, OnSucc) end.
%% Recreates a backend from its previous configuration after a failed update.
%% Nothing is done (and `ok' is returned) when there is no previous config or
%% the previous config was disabled. Otherwise the rollback is logged, the
%% backend is created again from `Config', and the result is handed to
%% `on_backend_updated/3': `OnSucc' is passed as the success callback, and a
%% failed recreation is only logged.
try_recreate(_Backend, undefined, _OnSucc) ->
    ok;
try_recreate(_Backend, #{enable := false}, _OnSucc) ->
    ok;
try_recreate(Backend, Config, OnSucc) ->
    Provider = provider(Backend),
    ?SLOG(info, #{
        msg => "backend_rollback",
        backend => Backend,
        reason => "update_sso_failed",
        config => emqx_utils:redact(Config)
    }),
    %% a failed rollback cannot be recovered from here, so it is only reported
    LogRollbackFailure = fun(Error) ->
        ?SLOG(error, #{
            msg => "backend_rollback_failed",
            backend => Backend,
            reason => emqx_utils:redact(Error)
        })
    end,
    Recreated = emqx_dashboard_sso:create(Provider, Config),
    on_backend_updated(Recreated, OnSucc, LogRollbackFailure).

View File

@ -27,6 +27,7 @@ all() ->
t_create_atomicly, t_create_atomicly,
t_create, t_create,
t_update, t_update,
t_update_atomicly,
t_get, t_get,
t_login_with_bad, t_login_with_bad,
t_first_login, t_first_login,
@ -111,6 +112,37 @@ t_update(_) ->
?assertNotEqual(undefined, emqx_dashboard_sso_manager:lookup_state(ldap)), ?assertNotEqual(undefined, emqx_dashboard_sso_manager:lookup_state(ldap)),
ok. ok.
%% A failed update must be rolled back: the request is rejected, the stored
%% config and the single ETS entry are still present afterwards, and the
%% resource group again holds exactly one resource.
t_update_atomicly(_) ->
    ResourcesBefore = emqx_resource_manager:list_group(?RESOURCE_GROUP),
    Path = uri(["sso", "ldap"]),
    InvalidConfig = ldap_config(#{
        <<"username">> => <<"invalid">>,
        <<"enable">> => true,
        <<"request_timeout">> => <<"1s">>
    }),
    %% the update with the invalid config is expected to be refused
    ?assertMatch({ok, 400, _}, request(put, Path, InvalidConfig)),
    %% the previous config and table entry must have survived the failure
    ?assertMatch(#{backend := ldap}, emqx:get_config(?MOD_KEY_PATH, undefined)),
    ?assertMatch([_], ets:tab2list(?MOD_TAB)),
    %% NOTE(review): the resource list is expected to differ from the one taken
    %% before the update, presumably because the rollback recreates the
    %% resource -- confirm against emqx_resource_manager:list_group/1
    ?retry(
        _Interval = 5,
        _NAttempts = 1000,
        begin
            ResourcesNow = emqx_resource_manager:list_group(?RESOURCE_GROUP),
            ?assertMatch([_], ResourcesNow),
            ?assertNotMatch(ResourcesBefore, ResourcesNow)
        end
    ),
    ok.
t_get(_) -> t_get(_) ->
Path = uri(["sso", "ldap"]), Path = uri(["sso", "ldap"]),
{ok, 200, Result} = request(get, Path), {ok, 200, Result} = request(get, Path),

View File

@ -74,7 +74,7 @@ fields(config) ->
{request_timeout, {request_timeout,
?HOCON(emqx_schema:timeout_duration_ms(), #{ ?HOCON(emqx_schema:timeout_duration_ms(), #{
desc => ?DESC(request_timeout), desc => ?DESC(request_timeout),
default => <<"5s">> default => <<"10s">>
})}, })},
{ssl, {ssl,
?HOCON(?R_REF(?MODULE, ssl), #{ ?HOCON(?R_REF(?MODULE, ssl), #{