408 lines
12 KiB
Erlang
408 lines
12 KiB
Erlang
%%--------------------------------------------------------------------
|
|
%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
|
|
%%
|
|
%% Licensed under the Apache License, Version 2.0 (the "License");
|
|
%% you may not use this file except in compliance with the License.
|
|
%% You may obtain a copy of the License at
|
|
%%
|
|
%% http://www.apache.org/licenses/LICENSE-2.0
|
|
%%
|
|
%% Unless required by applicable law or agreed to in writing, software
|
|
%% distributed under the License is distributed on an "AS IS" BASIS,
|
|
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
%% See the License for the specific language governing permissions and
|
|
%% limitations under the License.
|
|
%%--------------------------------------------------------------------
|
|
|
|
%% This module provides a loose parser for URIs.
|
|
%% The standard library's `uri_string' module is strict and cannot parse
%% invalid URIs, such as templates: `http://example.com/${username}'.
|
|
|
|
-module(emqx_utils_uri).
|
|
|
|
-export([parse/1, format/1]).
|
|
|
|
-export([
|
|
scheme/1,
|
|
userinfo/1,
|
|
host/1,
|
|
port/1,
|
|
path/1,
|
|
query/1,
|
|
fragment/1,
|
|
base_url/1,
|
|
request_base/1
|
|
]).
|
|
|
|
-type scheme() :: binary().
|
|
-type userinfo() :: binary().
|
|
-type host() :: binary().
|
|
-type port_number() :: inet:port_number().
|
|
-type path() :: binary().
|
|
-type query() :: binary().
|
|
-type fragment() :: binary().
|
|
-type request_base() :: #{
|
|
scheme := http | https,
|
|
host := iolist(),
|
|
port := inet:port_number()
|
|
}.
|
|
|
|
-type authority() :: #{
|
|
userinfo := emqx_maybe:t(userinfo()),
|
|
host := host(),
|
|
%% Types:
|
|
%% ipv6: `\[[a-z\d:\.]*\]` — bracketed "ipv6-like" address
|
|
%% regular: `example.com` — arbitrary host not containing `:` which is forbidden in hostnames other than ipv6
|
|
%% loose: non ipv6-like host containing `:`, probably invalid for a strictly valid URI
|
|
host_type := ipv6 | regular | loose,
|
|
port := emqx_maybe:t(port_number())
|
|
}.
|
|
|
|
-type uri() :: #{
|
|
scheme := emqx_maybe:t(scheme()),
|
|
authority := emqx_maybe:t(authority()),
|
|
path := path(),
|
|
query := emqx_maybe:t(query()),
|
|
fragment := emqx_maybe:t(fragment())
|
|
}.
|
|
|
|
-export_type([
|
|
scheme/0,
|
|
userinfo/0,
|
|
host/0,
|
|
port_number/0,
|
|
path/0,
|
|
query/0,
|
|
fragment/0,
|
|
authority/0,
|
|
uri/0,
|
|
request_base/0
|
|
]).
|
|
|
|
-on_load(init/0).
|
|
|
|
%% https://datatracker.ietf.org/doc/html/rfc3986#appendix-B
|
|
%%
|
|
%% > The following line is the regular expression for breaking-down a
|
|
%% > well-formed URI reference into its components.
|
|
%%
|
|
%% > ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
|
|
%%
|
|
%% We skip capturing some unused parts of the regex.
|
|
|
|
-define(URI_REGEX,
|
|
("^(?:(?<scheme>[^:/?#]+):)?(?<authority>//[^/?#]*)?"
|
|
"(?<path>[^?#]*)(?<query>\\?[^#]*)?(?<fragment>#.*)?")
|
|
).
|
|
|
|
-define(URI_REGEX_PT_KEY, {?MODULE, uri_re}).
|
|
|
|
-define(AUTHORITY_REGEX,
|
|
("^(?<userinfo>.*@)?"
|
|
"(?:(?:\\[(?<host_ipv6>[a-z\\d\\.:]*)\\])|(?<host_regular>[^:]*?)|(?<host_loose>.*?))"
|
|
"(?<port>:\\d+)?$")
|
|
).
|
|
|
|
-define(AUTHORITY_REGEX_PT_KEY, {?MODULE, authority_re}).
|
|
|
|
%%-------------------------------------------------------------------
|
|
%% Internal API
|
|
%%-------------------------------------------------------------------
|
|
|
|
%% `-on_load' hook: compiles both regular expressions once and caches the
%% compiled forms in `persistent_term' so that parsing never recompiles them.
%% `lists:foreach/2' returns `ok', which is what an on_load function must return.
init() ->
    lists:foreach(
        fun({Key, Regex, Opts}) ->
            {ok, Compiled} = re:compile(Regex, Opts),
            persistent_term:put(Key, Compiled)
        end,
        [
            {?URI_REGEX_PT_KEY, ?URI_REGEX, []},
            {?AUTHORITY_REGEX_PT_KEY, ?AUTHORITY_REGEX, [caseless]}
        ]
    ).
|
|
|
|
%%-------------------------------------------------------------------
|
|
%% API
|
|
%%-------------------------------------------------------------------
|
|
|
|
%% Loosely parses a URI reference into its components.
%%
%% The input is split with the (always matching) RFC 3986 appendix B regular
%% expression, so templates and other strictly invalid URIs are accepted.
%% Components that are absent from the input are reported as `undefined';
%% the path is always a binary and may be empty.
%%
%% The subject is handed to `re:run/3' as is, so both binaries and strings
%% (any iodata) are accepted.
-spec parse(iodata()) -> uri().
parse(URIString) ->
    {match, [SchemeMatch, AuthorityMatch, PathMatch, QueryMatch, FragmentMatch]} = re:run(
        URIString, uri_regexp(), [{capture, [scheme, authority, path, query, fragment], binary}]
    ),
    #{
        scheme => parse_scheme(SchemeMatch),
        authority => parse_authority(AuthorityMatch),
        %% The path capture has no delimiter to strip, so it is used verbatim.
        path => PathMatch,
        query => parse_query(QueryMatch),
        fragment => parse_fragment(FragmentMatch)
    }.
|
|
|
|
%% Renders the `scheme:' and `//authority' parts of a parsed URI as iodata.
%% Path, query and fragment are not included.
-spec base_url(uri()) -> iodata().
base_url(#{authority := Auth, scheme := Sch}) ->
    SchemePart = format_scheme(Sch),
    AuthorityPart = format_authority(Auth),
    [SchemePart, AuthorityPart].
|
|
|
|
%% Renders a parsed URI back to iodata: base URL, path, query, fragment.
-spec format(uri()) -> iodata().
format(URI = #{path := P, query := Q, fragment := F}) ->
    Base = base_url(URI),
    [Base, P, format_query(Q), format_fragment(F)].
|
|
|
|
%% Returns the `scheme' field of a parsed URI (`undefined' when absent).
-spec scheme(uri()) -> emqx_maybe:t(scheme()).
scheme(#{scheme := Value}) ->
    Value.
|
|
|
|
%% Returns the userinfo from the URI authority, or `undefined' when the URI
%% has no authority at all.
-spec userinfo(uri()) -> emqx_maybe:t(userinfo()).
userinfo(#{authority := #{userinfo := Value}}) ->
    Value;
userinfo(#{authority := undefined}) ->
    undefined.
|
|
|
|
%% Returns the host from the URI authority, or `undefined' when the URI
%% has no authority at all.
-spec host(uri()) -> emqx_maybe:t(host()).
host(#{authority := #{host := Value}}) ->
    Value;
host(#{authority := undefined}) ->
    undefined.
|
|
|
|
%% Returns the port from the URI authority, or `undefined' when the URI
%% has no authority at all.
-spec port(uri()) -> emqx_maybe:t(port_number()).
port(#{authority := #{port := Value}}) ->
    Value;
port(#{authority := undefined}) ->
    undefined.
|
|
|
|
%% Returns the `path' field of a parsed URI.
-spec path(uri()) -> path().
path(#{path := Value}) ->
    Value.
|
|
|
|
%% Returns the `query' field of a parsed URI (`undefined' when absent).
-spec query(uri()) -> emqx_maybe:t(query()).
query(#{query := Value}) ->
    Value.
|
|
|
|
%% Returns the `fragment' field of a parsed URI (`undefined' when absent).
-spec fragment(uri()) -> emqx_maybe:t(fragment()).
fragment(#{fragment := Value}) ->
    Value.
|
|
|
|
%% Extracts the `scheme', `host' and `port' needed to address a request.
%%
%% Accepts either an already parsed URI map or a raw URI (string or binary),
%% which is parsed first. The base URL is re-rendered and handed to
%% `emqx_http_lib:uri_parse/1'; its errors are returned unchanged, and on
%% success only the `scheme', `host' and `port' keys of its result are kept.
-spec request_base(uri() | iodata()) -> {ok, request_base()} | {error, term()}.
request_base(URI) when is_map(URI) ->
    case emqx_http_lib:uri_parse(iolist_to_binary(base_url(URI))) of
        {error, Reason} -> {error, Reason};
        {ok, URIMap} -> {ok, maps:with([scheme, host, port], URIMap)}
    end;
request_base(URIString) when is_list(URIString) orelse is_binary(URIString) ->
    request_base(parse(URIString)).
|
|
|
|
%%--------------------------------------------------------------------
|
|
%% Helper functions
|
|
%%--------------------------------------------------------------------
|
|
|
|
%% Maps an empty scheme capture to `undefined'; any other capture is the scheme.
parse_scheme(Match) ->
    case Match of
        <<>> -> undefined;
        _ -> Match
    end.
|
|
|
|
%% Strips the leading `?' from the query capture; an empty capture means the
%% URI had no query part.
parse_query(<<$?, Rest/binary>>) ->
    Rest;
parse_query(<<>>) ->
    undefined.
|
|
|
|
%% Strips the leading `#' from the fragment capture; an empty capture means
%% the URI had no fragment part.
parse_fragment(<<$#, Rest/binary>>) ->
    Rest;
parse_fragment(<<>>) ->
    undefined.
|
|
|
|
%% Fetches the compiled authority regexp that `init/0' cached.
authority_regexp() ->
    Key = ?AUTHORITY_REGEX_PT_KEY,
    persistent_term:get(Key).
|
|
|
|
%% Splits the authority capture (including its leading `//') into userinfo,
%% host, host type and port. An empty capture means the URI has no authority.
parse_authority(<<"//", Rest/binary>>) ->
    %% The authority regexp is expected to always match.
    Captures = [userinfo, host_ipv6, host_regular, host_loose, port],
    {match, [UserInfoBin, IPv6Bin, RegularBin, LooseBin, PortBin]} =
        re:run(Rest, authority_regexp(), [{capture, Captures, binary}]),
    ParsedUserInfo = parse_userinfo(UserInfoBin),
    {HostType, Host} = parse_host(IPv6Bin, RegularBin, LooseBin),
    ParsedPort = parse_port(PortBin),
    #{
        userinfo => ParsedUserInfo,
        host => Host,
        host_type => HostType,
        port => ParsedPort
    };
parse_authority(<<>>) ->
    undefined.
|
|
|
|
%% Drops the trailing `@' from the userinfo capture; an empty capture means
%% the authority had no userinfo.
parse_userinfo(Match) ->
    case Match of
        <<>> ->
            undefined;
        _ ->
            KeepLen = byte_size(Match) - 1,
            binary:part(Match, {0, KeepLen})
    end.
|
|
|
|
%% Selects the host and its type from the three mutually exclusive host
%% captures (ipv6, regular, loose); captures that did not match are `<<>>'.
%%
%% The `regular' clause comes first on purpose: when all three captures are
%% empty (e.g. `file:///path'), the host is empty and contains no `:', so it
%% is classified as `regular' rather than `loose'.
%% NOTE(review): an empty bracketed host (`[]') is indistinguishable from an
%% unset capture here and is also reported as `regular'.
parse_host(<<>>, Host, <<>>) -> {regular, Host};
parse_host(Host, <<>>, <<>>) -> {ipv6, Host};
parse_host(<<>>, <<>>, Host) -> {loose, Host}.
|
|
|
|
%% Converts the `:NNN' port capture to an integer; an empty capture means no
%% port was given. No range check is performed on the number.
parse_port(<<$:, Digits/binary>>) ->
    binary_to_integer(Digits);
parse_port(<<>>) ->
    undefined.
|
|
|
|
%% Fetches the compiled URI regexp that `init/0' cached.
uri_regexp() ->
    Key = ?URI_REGEX_PT_KEY,
    persistent_term:get(Key).
|
|
|
|
%% Renders the scheme followed by `:', or nothing when there is no scheme.
format_scheme(Scheme) ->
    case Scheme of
        undefined -> <<>>;
        _ -> [Scheme, $:]
    end.
|
|
|
|
%% Renders `//userinfo@host:port' from a parsed authority, or nothing when the
%% URI has no authority.
format_authority(#{userinfo := UserInfo, host := Host, host_type := HostType, port := Port}) ->
    UserInfoPart = format_userinfo(UserInfo),
    HostPart = format_host(HostType, Host),
    PortPart = format_port(Port),
    [$/, $/, UserInfoPart, HostPart, PortPart];
format_authority(undefined) ->
    <<>>.
|
|
|
|
%% Renders the userinfo followed by `@', or nothing when there is none.
format_userinfo(UserInfo) ->
    case UserInfo of
        undefined -> <<>>;
        _ -> [UserInfo, $@]
    end.
|
|
|
|
%% Renders the host. IPv6 hosts are stored without their brackets, so the
%% brackets are put back; every other host type is emitted verbatim.
format_host(HostType, Host) ->
    case HostType of
        ipv6 -> [$[, Host, $]];
        _ -> Host
    end.
|
|
|
|
%% Renders `:Port', or nothing when no port is set.
format_port(Port) ->
    case Port of
        undefined -> <<>>;
        _ -> [$:, integer_to_binary(Port)]
    end.
|
|
|
|
%% Renders `?Query', or nothing when there is no query part.
format_query(Query) ->
    case Query of
        undefined -> <<>>;
        _ -> [$?, Query]
    end.
|
|
|
|
%% Renders `#Fragment', or nothing when there is no fragment part.
format_fragment(Fragment) ->
    case Fragment of
        undefined -> <<>>;
        _ -> [$#, Fragment]
    end.
|
|
|
|
-ifdef(TEST).
|
|
-include_lib("eunit/include/eunit.hrl").
|
|
|
|
-define(URLS, [
|
|
"https://www.example.com/page",
|
|
"http://subdomain.example.com/path/to/page",
|
|
"https://www.example.com:8080/path/to/page",
|
|
"https://user:password@example.com/path/to/page",
|
|
"https://www.example.com/path%20with%20${spaces}",
|
|
"http://192.0.2.1/path/to/page",
|
|
"http://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]/${path}/to/page",
|
|
"http://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]/to/page",
|
|
"http://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:4444/to/page",
|
|
"ftp://ftp.example.com/${path}/to/file",
|
|
"ftps://ftp.example.com/path/to/file",
|
|
"mailto:user@example.com",
|
|
"tel:+1234567890",
|
|
"sms:+1234567890?body=Hello%20World",
|
|
"git://github.com/user/repo.git",
|
|
"a:b:c",
|
|
"svn://svn.example.com/project/trunk",
|
|
"https://www.${example}.com/path/to/page?query_param=value",
|
|
"https://www.example.com/path/to/page?query_param1=value1&query_param2=value2",
|
|
"https://www.example.com?query_param1=value1&query_param2=value2",
|
|
"https://www.example.com/path/to/page#section1",
|
|
"https://www.example.com/path/to/page?query_param=value#section1",
|
|
"https://www.example.com/path/to/page?query_param1=value1&query_param2=${value2}#section1",
|
|
"https://www.example.com?query_param1=value1&query_param2=value2#section1",
|
|
"file:///path/to/file.txt",
|
|
"localhost",
|
|
"localhost:8080",
|
|
"localhost:8080/path/to/page",
|
|
"localhost:8080/path/to/page?query_param=value",
|
|
"localhost:8080/path/to/page?query_param1=value1&query_param2=value2",
|
|
"/abc/${def}",
|
|
"/abc/def?query_param=value",
|
|
"?query_param=value",
|
|
"#section1"
|
|
]).
|
|
|
|
%% Round-trip check: formatting a parsed URL must reproduce the input exactly.
parse_format_test_() ->
    lists:map(
        fun(URI) ->
            {URI, ?_assertEqual(list_to_binary(URI), iolist_to_binary(format(parse(URI))))}
        end,
        ?URLS
    ).
|
|
|
|
%% The rendered base URL must be a prefix of the original URL.
base_url_test_() ->
    lists:map(
        fun(URI) ->
            {URI, ?_assert(is_prefix(iolist_to_binary(base_url(parse(URI))), list_to_binary(URI)))}
        end,
        ?URLS
    ).
|
|
|
|
%% For every URL `uri_string' can parse, the scheme must agree with it.
scheme_test_() ->
    Check = fun(Expected, Parsed) ->
        ?assertEqual(maybe_get_bin(scheme, Expected), scheme(Parsed))
    end,
    [if_parseable_by_uri_string(URI, Check) || URI <- ?URLS].
|
|
|
|
%% For every URL `uri_string' can parse, the host must agree with it.
host_test_() ->
    Check = fun(Expected, Parsed) ->
        ?assertEqual(maybe_get_bin(host, Expected), host(Parsed))
    end,
    [if_parseable_by_uri_string(URI, Check) || URI <- ?URLS].
|
|
|
|
%% For every URL `uri_string' can parse, the path must agree with it.
path_test_() ->
    Check = fun(Expected, Parsed) ->
        ?assertEqual(maybe_get_bin(path, Expected), path(Parsed))
    end,
    [if_parseable_by_uri_string(URI, Check) || URI <- ?URLS].
|
|
|
|
%% For every URL `uri_string' can parse, the query must agree with it.
query_test_() ->
    Check = fun(Expected, Parsed) ->
        ?assertEqual(maybe_get_bin(query, Expected), query(Parsed))
    end,
    [if_parseable_by_uri_string(URI, Check) || URI <- ?URLS].
|
|
|
|
%% For every URL `uri_string' can parse, the fragment must agree with it.
fragment_test_() ->
    Check = fun(Expected, Parsed) ->
        ?assertEqual(maybe_get_bin(fragment, Expected), fragment(Parsed))
    end,
    [if_parseable_by_uri_string(URI, Check) || URI <- ?URLS].
|
|
|
|
%% Template placeholders (`${...}') must survive parsing in path and query.
%% Each case is {Title, ExpectedValue, Accessor, URI}.
templates_test_() ->
    Cases = [
        {"template in path", <<"/${client_attrs.group}">>, fun path/1,
            "https://www.example.com/${client_attrs.group}"},
        {"template in query, no path", <<"group=${client_attrs.group}">>, fun query/1,
            "https://www.example.com?group=${client_attrs.group}"},
        {"template in query, path", <<"group=${client_attrs.group}">>, fun query/1,
            "https://www.example.com/path/?group=${client_attrs.group}"}
    ],
    [
        {Title, ?_assertEqual(Expected, Accessor(parse(URI)))}
     || {Title, Expected, Accessor, URI} <- Cases
    ].
|
|
|
|
%% Checks `request_base/1' on one accepted URL and two rejected ones.
%% Each case is {ExpectedResult, URI}.
request_target_test_() ->
    Cases = [
        {
            {ok, #{port => 443, scheme => https, host => "www.example.com"}},
            "https://www.example.com/path/to/page?query_param=value#fr"
        },
        {
            {error, empty_host_not_allowed},
            "localhost?query_param=value#fr"
        },
        {
            {error, {unsupported_scheme, <<"ftp">>}},
            "ftp://localhost"
        }
    ],
    [?_assertEqual(Expected, request_base(parse(URI))) || {Expected, URI} <- Cases].
|
|
|
|
%% Returns `true' when `Binary' starts with the bytes of `Prefix'.
is_prefix(Prefix, Binary) ->
    PrefixSize = byte_size(Prefix),
    case Binary of
        %% `Prefix' is already bound, so this compares the leading bytes.
        <<Prefix:PrefixSize/binary, _/binary>> -> true;
        _ -> false
    end.
|
|
|
|
%% Builds a titled EUnit test comparing this module's parse result with the
%% one from `uri_string:parse/1'. URLs that `uri_string' rejects yield a
%% trivially passing test titled "skipped".
if_parseable_by_uri_string(URI, Fun) ->
    case uri_string:parse(URI) of
        Expected when is_map(Expected) ->
            %% Parsed eagerly, while the test list is being generated.
            Actual = parse(URI),
            {URI, fun() -> Fun(Expected, Actual) end};
        {error, _, _} ->
            {"skipped", fun() -> true end}
    end.
|
|
|
|
%% Looks up `Key' in a `uri_string' result map and converts the value to a
%% binary; a missing key yields `undefined'.
maybe_get_bin(Key, Map) ->
    Value = maps:get(Key, Map, undefined),
    maybe_bin(Value).
|
|
|
|
%% Converts a string to a binary and passes `undefined' through unchanged.
maybe_bin(undefined) ->
    undefined;
maybe_bin(Chars) when is_list(Chars) ->
    list_to_binary(Chars).
|
|
|
|
-endif.
|