diff --git a/apps/emqx/include/bpapi.hrl b/apps/emqx/include/bpapi.hrl index 1373e0381..ed7693e78 100644 --- a/apps/emqx/include/bpapi.hrl +++ b/apps/emqx/include/bpapi.hrl @@ -14,9 +14,4 @@ %% limitations under the License. %%-------------------------------------------------------------------- --ifndef(EMQX_BPAPI_HRL). --define(EMQX_BPAPI_HRL, true). - --compile({parse_transform, emqx_bpapi_trans}). - --endif. +-include_lib("emqx_utils/include/bpapi.hrl"). diff --git a/apps/emqx/include/emqx.hrl b/apps/emqx/include/emqx.hrl index 664ec5803..86a64d8bb 100644 --- a/apps/emqx/include/emqx.hrl +++ b/apps/emqx/include/emqx.hrl @@ -55,29 +55,7 @@ -record(subscription, {topic, subid, subopts}). -%% See 'Application Message' in MQTT Version 5.0 --record(message, { - %% Global unique message ID - id :: binary(), - %% Message QoS - qos = 0, - %% Message from - from :: atom() | binary(), - %% Message flags - flags = #{} :: emqx_types:flags(), - %% Message headers. May contain any metadata. e.g. the - %% protocol version number, username, peerhost or - %% the PUBLISH properties (MQTT 5.0). - headers = #{} :: emqx_types:headers(), - %% Topic that the message is published to - topic :: emqx_types:topic(), - %% Message Payload - payload :: emqx_types:payload(), - %% Timestamp (Unit: millisecond) - timestamp :: integer(), - %% not used so far, for future extension - extra = [] :: term() -}). +-include_lib("emqx_utils/include/emqx_message.hrl"). -record(delivery, { %% Sender of the delivery diff --git a/apps/emqx/integration_test/emqx_ds_SUITE.erl b/apps/emqx/integration_test/emqx_persistent_session_ds_SUITE.erl similarity index 68% rename from apps/emqx/integration_test/emqx_ds_SUITE.erl rename to apps/emqx/integration_test/emqx_persistent_session_ds_SUITE.erl index 34c15b505..ee5d203e4 100644 --- a/apps/emqx/integration_test/emqx_ds_SUITE.erl +++ b/apps/emqx/integration_test/emqx_persistent_session_ds_SUITE.erl @@ -1,7 +1,7 @@ %%-------------------------------------------------------------------- %% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. %%-------------------------------------------------------------------- --module(emqx_ds_SUITE). +-module(emqx_persistent_session_ds_SUITE). -compile(export_all). -compile(nowarn_export_all). @@ -14,7 +14,6 @@ -define(DEFAULT_KEYSPACE, default). -define(DS_SHARD_ID, <<"local">>). -define(DS_SHARD, {?DEFAULT_KEYSPACE, ?DS_SHARD_ID}). --define(ITERATOR_REF_TAB, emqx_ds_iterator_ref). -import(emqx_common_test_helpers, [on_exit/1]). @@ -91,9 +90,6 @@ get_mqtt_port(Node, Type) -> {_IP, Port} = erpc:call(Node, emqx_config, get, [[listeners, Type, default, bind]]), Port. -get_all_iterator_refs(Node) -> - erpc:call(Node, mnesia, dirty_all_keys, [?ITERATOR_REF_TAB]). - get_all_iterator_ids(Node) -> Fn = fun(K, _V, Acc) -> [K | Acc] end, erpc:call(Node, fun() -> @@ -126,6 +122,32 @@ start_client(Opts0 = #{}) -> on_exit(fun() -> catch emqtt:stop(Client) end), Client. 
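+%% Stops a cluster node, waits for it to go down, and restarts its
+%% apps; shared by the idempotency testcases below so they can
+%% exercise recovery of session state across a broker restart.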
+restart_node(Node, NodeSpec) ->
+    ?tp(will_restart_node, #{}),
+    ?tp(notice, "restarting node", #{node => Node}),
+    true = monitor_node(Node, true),
+    ok = erpc:call(Node, init, restart, []),
+    receive
+        {nodedown, Node} ->
+            ok
+    after 10_000 ->
+        ct:fail("node ~p didn't stop", [Node])
+    end,
+    ?tp(notice, "waiting for nodeup", #{node => Node}),
+    wait_nodeup(Node),
+    wait_gen_rpc_down(NodeSpec),
+    ?tp(notice, "restarting apps", #{node => Node}),
+    Apps = maps:get(apps, NodeSpec),
+    ok = erpc:call(Node, emqx_cth_suite, load_apps, [Apps]),
+    _ = erpc:call(Node, emqx_cth_suite, start_apps, [Apps, NodeSpec]),
+    %% have to re-inject this so that we may stop the node successfully at the
+    %% end....
+    ok = emqx_cth_cluster:set_node_opts(Node, NodeSpec),
+    ok = snabbkaffe:forward_trace(Node),
+    ?tp(notice, "node restarted", #{node => Node}),
+    ?tp(restarted_node, #{}),
+    ok.
+
 %%------------------------------------------------------------------------------
 %% Testcases
 %%------------------------------------------------------------------------------
@@ -143,24 +165,14 @@ t_non_persistent_session_subscription(_Config) ->
             {ok, _} = emqtt:connect(Client),
             ?tp(notice, "subscribing", #{}),
             {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(Client, SubTopicFilter, qos2),
-            IteratorRefs = get_all_iterator_refs(node()),
-            IteratorIds = get_all_iterator_ids(node()),
 
             ok = emqtt:stop(Client),
 
-            #{
-                iterator_refs => IteratorRefs,
-                iterator_ids => IteratorIds
-            }
+            ok
         end,
-        fun(Res, Trace) ->
+        fun(Trace) ->
             ct:pal("trace:\n  ~p", [Trace]),
-            #{
-                iterator_refs := IteratorRefs,
-                iterator_ids := IteratorIds
-            } = Res,
-            ?assertEqual([], IteratorRefs),
-            ?assertEqual({ok, []}, IteratorIds),
+            ?assertEqual([], ?of_kind(ds_session_subscription_added, Trace)),
             ok
         end
     ),
@@ -175,7 +187,7 @@ t_session_subscription_idempotency(Config) ->
     ?check_trace(
         begin
            ?force_ordering(
-                #{?snk_kind := persistent_session_ds_iterator_added},
+                #{?snk_kind := persistent_session_ds_subscription_added},
                _NEvents0 = 1,
                #{?snk_kind := will_restart_node},
                _Guard0 = true
@@ -187,32 +199,7 @@ t_session_subscription_idempotency(Config) ->
                 _Guard1 = true
             ),
 
-            spawn_link(fun() ->
-                ?tp(will_restart_node, #{}),
-                ?tp(notice, "restarting node", #{node => Node1}),
-                true = monitor_node(Node1, true),
-                ok = erpc:call(Node1, init, restart, []),
-                receive
-                    {nodedown, Node1} ->
-                        ok
-                after 10_000 ->
-                    ct:fail("node ~p didn't stop", [Node1])
-                end,
-                ?tp(notice, "waiting for nodeup", #{node => Node1}),
-                wait_nodeup(Node1),
-                wait_gen_rpc_down(Node1Spec),
-                ?tp(notice, "restarting apps", #{node => Node1}),
-                Apps = maps:get(apps, Node1Spec),
-                ok = erpc:call(Node1, emqx_cth_suite, load_apps, [Apps]),
-                _ = erpc:call(Node1, emqx_cth_suite, start_apps, [Apps, Node1Spec]),
-                %% have to re-inject this so that we may stop the node succesfully at the
-                %% end....
-                ok = emqx_cth_cluster:set_node_opts(Node1, Node1Spec),
-                ok = snabbkaffe:forward_trace(Node1),
-                ?tp(notice, "node restarted", #{node => Node1}),
-                ?tp(restarted_node, #{}),
-                ok
-            end),
+            spawn_link(fun() -> restart_node(Node1, Node1Spec) end),
 
             ?tp(notice, "starting 1", #{}),
             Client0 = start_client(#{port => Port, clientid => ClientId}),
@@ -223,7 +210,7 @@ t_session_subscription_idempotency(Config) ->
             receive
                 {'EXIT', {shutdown, _}} ->
                     ok
-            after 0 -> ok
+            after 100 -> ok
             end,
             process_flag(trap_exit, false),
@@ -240,10 +227,7 @@ t_session_subscription_idempotency(Config) ->
         end,
         fun(Trace) ->
             ct:pal("trace:\n  ~p", [Trace]),
-            %% Exactly one iterator should have been opened.
SubTopicFilterWords = emqx_topic:words(SubTopicFilter), - ?assertEqual([{ClientId, SubTopicFilterWords}], get_all_iterator_refs(Node1)), - ?assertMatch({ok, [_]}, get_all_iterator_ids(Node1)), ?assertMatch( {ok, #{}, #{SubTopicFilterWords := #{}}}, erpc:call(Node1, emqx_persistent_session_ds, session_open, [ClientId]) @@ -262,7 +246,10 @@ t_session_unsubscription_idempotency(Config) -> ?check_trace( begin ?force_ordering( - #{?snk_kind := persistent_session_ds_close_iterators, ?snk_span := {complete, _}}, + #{ + ?snk_kind := persistent_session_ds_subscription_delete, + ?snk_span := {complete, _} + }, _NEvents0 = 1, #{?snk_kind := will_restart_node}, _Guard0 = true @@ -270,36 +257,11 @@ t_session_unsubscription_idempotency(Config) -> ?force_ordering( #{?snk_kind := restarted_node}, _NEvents1 = 1, - #{?snk_kind := persistent_session_ds_iterator_delete, ?snk_span := start}, + #{?snk_kind := persistent_session_ds_subscription_route_delete, ?snk_span := start}, _Guard1 = true ), - spawn_link(fun() -> - ?tp(will_restart_node, #{}), - ?tp(notice, "restarting node", #{node => Node1}), - true = monitor_node(Node1, true), - ok = erpc:call(Node1, init, restart, []), - receive - {nodedown, Node1} -> - ok - after 10_000 -> - ct:fail("node ~p didn't stop", [Node1]) - end, - ?tp(notice, "waiting for nodeup", #{node => Node1}), - wait_nodeup(Node1), - wait_gen_rpc_down(Node1Spec), - ?tp(notice, "restarting apps", #{node => Node1}), - Apps = maps:get(apps, Node1Spec), - ok = erpc:call(Node1, emqx_cth_suite, load_apps, [Apps]), - _ = erpc:call(Node1, emqx_cth_suite, start_apps, [Apps, Node1Spec]), - %% have to re-inject this so that we may stop the node succesfully at the - %% end.... - ok = emqx_cth_cluster:set_node_opts(Node1, Node1Spec), - ok = snabbkaffe:forward_trace(Node1), - ?tp(notice, "node restarted", #{node => Node1}), - ?tp(restarted_node, #{}), - ok - end), + spawn_link(fun() -> restart_node(Node1, Node1Spec) end), ?tp(notice, "starting 1", #{}), Client0 = start_client(#{port => Port, clientid => ClientId}), @@ -312,7 +274,7 @@ t_session_unsubscription_idempotency(Config) -> receive {'EXIT', {shutdown, _}} -> ok - after 0 -> ok + after 100 -> ok end, process_flag(trap_exit, false), @@ -327,7 +289,7 @@ t_session_unsubscription_idempotency(Config) -> ?wait_async_action( emqtt:unsubscribe(Client1, SubTopicFilter), #{ - ?snk_kind := persistent_session_ds_iterator_delete, + ?snk_kind := persistent_session_ds_subscription_route_delete, ?snk_span := {complete, _} }, 15_000 @@ -339,9 +301,10 @@ t_session_unsubscription_idempotency(Config) -> end, fun(Trace) -> ct:pal("trace:\n ~p", [Trace]), - %% No iterators remaining - ?assertEqual([], get_all_iterator_refs(Node1)), - ?assertEqual({ok, []}, get_all_iterator_ids(Node1)), + ?assertMatch( + {ok, #{}, Subs = #{}} when map_size(Subs) =:= 0, + erpc:call(Node1, emqx_persistent_session_ds, session_open, [ClientId]) + ), ok end ), diff --git a/apps/emqx/priv/bpapi.versions b/apps/emqx/priv/bpapi.versions index 47967cb1e..f647c660f 100644 --- a/apps/emqx/priv/bpapi.versions +++ b/apps/emqx/priv/bpapi.versions @@ -18,6 +18,7 @@ {emqx_dashboard,1}. {emqx_delayed,1}. {emqx_delayed,2}. +{emqx_ds,1}. {emqx_eviction_agent,1}. {emqx_eviction_agent,2}. {emqx_exhook,1}. diff --git a/apps/emqx/src/emqx_message.erl b/apps/emqx/src/emqx_message.erl index 509d4c90d..4ff36504d 100644 --- a/apps/emqx/src/emqx_message.erl +++ b/apps/emqx/src/emqx_message.erl @@ -66,7 +66,8 @@ -export([ is_expired/1, - update_expiry/1 + update_expiry/1, + timestamp_now/0 ]). 
-export([ @@ -113,14 +114,13 @@ make(From, Topic, Payload) -> emqx_types:payload() ) -> emqx_types:message(). make(From, QoS, Topic, Payload) when ?QOS_0 =< QoS, QoS =< ?QOS_2 -> - Now = erlang:system_time(millisecond), #message{ id = emqx_guid:gen(), qos = QoS, from = From, topic = Topic, payload = Payload, - timestamp = Now + timestamp = timestamp_now() }. -spec make( @@ -137,7 +137,6 @@ make(From, QoS, Topic, Payload, Flags, Headers) when is_map(Flags), is_map(Headers) -> - Now = erlang:system_time(millisecond), #message{ id = emqx_guid:gen(), qos = QoS, @@ -146,7 +145,7 @@ make(From, QoS, Topic, Payload, Flags, Headers) when headers = Headers, topic = Topic, payload = Payload, - timestamp = Now + timestamp = timestamp_now() }. -spec make( @@ -164,7 +163,6 @@ make(MsgId, From, QoS, Topic, Payload, Flags, Headers) when is_map(Flags), is_map(Headers) -> - Now = erlang:system_time(millisecond), #message{ id = MsgId, qos = QoS, @@ -173,7 +171,7 @@ make(MsgId, From, QoS, Topic, Payload, Flags, Headers) when headers = Headers, topic = Topic, payload = Payload, - timestamp = Now + timestamp = timestamp_now() }. %% optimistic esitmation of a message size after serialization @@ -403,6 +401,11 @@ from_map(#{ extra = Extra }. +%% @doc Get current timestamp in milliseconds. +-spec timestamp_now() -> integer(). +timestamp_now() -> + erlang:system_time(millisecond). + %% MilliSeconds elapsed(Since) -> - max(0, erlang:system_time(millisecond) - Since). + max(0, timestamp_now() - Since). diff --git a/apps/emqx/src/emqx_persistent_message.erl b/apps/emqx/src/emqx_persistent_message.erl index 609b0139d..632ff2a27 100644 --- a/apps/emqx/src/emqx_persistent_message.erl +++ b/apps/emqx/src/emqx_persistent_message.erl @@ -23,16 +23,12 @@ %% Message persistence -export([ - persist/1, - serialize/1, - deserialize/1 + persist/1 ]). -%% FIXME --define(DS_SHARD_ID, <<"local">>). --define(DEFAULT_KEYSPACE, default). --define(DS_SHARD, {?DEFAULT_KEYSPACE, ?DS_SHARD_ID}). +-define(PERSISTENT_MESSAGE_DB, emqx_persistent_message). +%% FIXME -define(WHEN_ENABLED(DO), case is_store_enabled() of true -> DO; @@ -44,18 +40,10 @@ init() -> ?WHEN_ENABLED(begin - ok = emqx_ds:ensure_shard( - ?DS_SHARD, - #{ - dir => filename:join([ - emqx:data_dir(), - ds, - messages, - ?DEFAULT_KEYSPACE, - ?DS_SHARD_ID - ]) - } - ), + ok = emqx_ds:open_db(?PERSISTENT_MESSAGE_DB, #{ + backend => builtin, + storage => {emqx_ds_storage_bitfield_lts, #{}} + }), ok = emqx_persistent_session_ds_router:init_tables(), ok = emqx_persistent_session_ds:create_tables(), ok @@ -82,19 +70,11 @@ persist(Msg) -> needs_persistence(Msg) -> not (emqx_message:get_flag(dup, Msg) orelse emqx_message:is_sys(Msg)). +-spec store_message(emqx_types:message()) -> emqx_ds:store_batch_result(). store_message(Msg) -> - ID = emqx_message:id(Msg), - Timestamp = emqx_guid:timestamp(ID), - Topic = emqx_topic:words(emqx_message:topic(Msg)), - emqx_ds_storage_layer:store(?DS_SHARD, ID, Timestamp, Topic, serialize(Msg)). + emqx_ds:store_batch(?PERSISTENT_MESSAGE_DB, [Msg]). has_subscribers(#message{topic = Topic}) -> emqx_persistent_session_ds_router:has_any_route(Topic). %% - -serialize(Msg) -> - term_to_binary(emqx_message:to_map(Msg)). - -deserialize(Bin) -> - emqx_message:from_map(binary_to_term(Bin)). 
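
Note on the API used above: persistence now goes through the generic `emqx_ds'
interface. A database is opened once with emqx_ds:open_db/2, and whole
#message{} records are handed to emqx_ds:store_batch/2; serialization is owned
by the storage layer, which is why serialize/1 and deserialize/1 are deleted.
A minimal sketch of the resulting call sequence (`my_db' and `persist_one/1'
are illustrative names, not part of this changeset):

    persist_one(Msg) ->
        %% Same options as emqx_persistent_message:init/0 above:
        ok = emqx_ds:open_db(my_db, #{
            backend => builtin,
            storage => {emqx_ds_storage_bitfield_lts, #{}}
        }),
        %% The builtin backend accepts #message{} records directly:
        emqx_ds:store_batch(my_db, [Msg]).
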
diff --git a/apps/emqx/src/emqx_persistent_message_ds_replayer.erl b/apps/emqx/src/emqx_persistent_message_ds_replayer.erl
new file mode 100644
index 000000000..d137891a2
--- /dev/null
+++ b/apps/emqx/src/emqx_persistent_message_ds_replayer.erl
@@ -0,0 +1,213 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+%% @doc This module implements the routines for replaying streams of
+%% messages.
+-module(emqx_persistent_message_ds_replayer).
+
+%% API:
+-export([new/0, next_packet_id/1, replay/2, commit_offset/3, poll/3]).
+
+%% internal exports:
+-export([]).
+
+-export_type([inflight/0]).
+
+-include("emqx_persistent_session_ds.hrl").
+
+%%================================================================================
+%% Type declarations
+%%================================================================================
+
+%% Note: sequence numbers are monotonic; they don't wrap around:
+-type seqno() :: non_neg_integer().
+
+-record(range, {
+    stream :: emqx_ds:stream(),
+    first :: seqno(),
+    last :: seqno(),
+    iterator_next :: emqx_ds:iterator() | undefined
+}).
+
+-type range() :: #range{}.
+
+-record(inflight, {
+    next_seqno = 0 :: seqno(),
+    acked_seqno = 0 :: seqno(),
+    offset_ranges = [] :: [range()]
+}).
+
+-opaque inflight() :: #inflight{}.
+
+%%================================================================================
+%% API functions
+%%================================================================================
+
+-spec new() -> inflight().
+new() ->
+    #inflight{}.
+
+-spec next_packet_id(inflight()) -> {emqx_types:packet_id(), inflight()}.
+next_packet_id(Inflight0 = #inflight{next_seqno = LastSeqno}) ->
+    Inflight = Inflight0#inflight{next_seqno = LastSeqno + 1},
+    {seqno_to_packet_id(LastSeqno), Inflight}.
+
+-spec replay(emqx_persistent_session_ds:id(), inflight()) ->
+    emqx_session:replies().
+replay(_SessionId, _Inflight = #inflight{offset_ranges = _Ranges}) ->
+    [].
+
+-spec commit_offset(emqx_persistent_session_ds:id(), emqx_types:packet_id(), inflight()) ->
+    {_IsValidOffset :: boolean(), inflight()}.
+commit_offset(
+    SessionId,
+    PacketId,
+    Inflight0 = #inflight{
+        acked_seqno = AckedSeqno0, next_seqno = NextSeqNo, offset_ranges = Ranges0
+    }
+) ->
+    AckedSeqno = packet_id_to_seqno(NextSeqNo, PacketId),
+    true = AckedSeqno0 < AckedSeqno,
+    Ranges = lists:filter(
+        fun(#range{stream = Stream, last = LastSeqno, iterator_next = ItNext}) ->
+            case LastSeqno =< AckedSeqno of
+                true ->
+                    %% This range has been fully
+                    %% acked. Remove it and replace saved
+                    %% iterator with the trailing iterator.
+                    update_iterator(SessionId, Stream, ItNext),
+                    false;
+                false ->
+                    %% This range still has unacked
+                    %% messages:
+                    true
+            end
+        end,
+        Ranges0
+    ),
+    Inflight = Inflight0#inflight{acked_seqno = AckedSeqno, offset_ranges = Ranges},
+    {true, Inflight}.
+
+-spec poll(emqx_persistent_session_ds:id(), inflight(), pos_integer()) ->
+    {emqx_session:replies(), inflight()}.
+poll(SessionId, Inflight0, WindowSize) when WindowSize > 0, WindowSize < 16#7fff ->
+    #inflight{next_seqno = NextSeqNo0, acked_seqno = AckedSeqno} =
+        Inflight0,
+    FetchThreshold = max(1, WindowSize div 2),
+    FreeSpace = AckedSeqno + WindowSize - NextSeqNo0,
+    case FreeSpace >= FetchThreshold of
+        false ->
+            %% TODO: this branch is meant to avoid fetching data from
+            %% the DB in chunks that are too small. However, this
+            %% logic is not exactly good for the latency. Can the
+            %% client even get stuck?
+            {[], Inflight0};
+        true ->
+            Streams = shuffle(get_streams(SessionId)),
+            fetch(SessionId, Inflight0, Streams, FreeSpace, [])
+    end.
+
+%%================================================================================
+%% Internal exports
+%%================================================================================
+
+%%================================================================================
+%% Internal functions
+%%================================================================================
+
+fetch(_SessionId, Inflight, _Streams = [], _N, Acc) ->
+    {lists:reverse(Acc), Inflight};
+fetch(_SessionId, Inflight, _Streams, 0, Acc) ->
+    {lists:reverse(Acc), Inflight};
+fetch(SessionId, Inflight0, [#ds_stream{stream = Stream} | Streams], N, Publishes0) ->
+    #inflight{next_seqno = FirstSeqNo, offset_ranges = Ranges0} = Inflight0,
+    ItBegin = get_last_iterator(SessionId, Stream, Ranges0),
+    {ok, ItEnd, Messages} = emqx_ds:next(ItBegin, N),
+    {Publishes, Inflight1} =
+        lists:foldl(
+            fun(Msg, {PubAcc0, InflightAcc0}) ->
+                {PacketId, InflightAcc} = next_packet_id(InflightAcc0),
+                PubAcc = [{PacketId, Msg} | PubAcc0],
+                {PubAcc, InflightAcc}
+            end,
+            {Publishes0, Inflight0},
+            Messages
+        ),
+    #inflight{next_seqno = LastSeqNo} = Inflight1,
+    NMessages = LastSeqNo - FirstSeqNo,
+    case NMessages > 0 of
+        true ->
+            Range = #range{
+                first = FirstSeqNo,
+                last = LastSeqNo - 1,
+                stream = Stream,
+                iterator_next = ItEnd
+            },
+            Inflight = Inflight1#inflight{offset_ranges = Ranges0 ++ [Range]},
+            fetch(SessionId, Inflight, Streams, N - NMessages, Publishes);
+        false ->
+            fetch(SessionId, Inflight1, Streams, N, Publishes)
+    end.
+
+update_iterator(SessionId, Stream, Iterator) ->
+    mria:dirty_write(?SESSION_ITER_TAB, #ds_iter{id = {SessionId, Stream}, iter = Iterator}).
+
+get_last_iterator(SessionId, Stream, Ranges) ->
+    case lists:keyfind(Stream, #range.stream, lists:reverse(Ranges)) of
+        false ->
+            get_iterator(SessionId, Stream);
+        #range{iterator_next = Next} ->
+            Next
+    end.
+
+get_iterator(SessionId, Stream) ->
+    Id = {SessionId, Stream},
+    [#ds_iter{iter = It}] = mnesia:dirty_read(?SESSION_ITER_TAB, Id),
+    It.
+
+get_streams(SessionId) ->
+    mnesia:dirty_read(?SESSION_STREAM_TAB, SessionId).
+
+%% Packet ID as defined by MQTT protocol is a 16-bit integer in range
+%% 1..FFFF. This function translates internal session sequence number
+%% to MQTT packet ID by chopping off most significant bits and adding
+%% 1. This assumes that there's never more than FFFF in-flight packets
+%% at any time:
+-spec seqno_to_packet_id(non_neg_integer()) -> emqx_types:packet_id().
+seqno_to_packet_id(Counter) ->
+    Counter rem 16#ffff + 1.
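+
+%% Illustration of the mapping above: seqno 0 maps to packet id 1,
+%% seqno 16#fffe maps to 16#ffff, and seqno 16#ffff wraps back around
+%% to packet id 1.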
+
+%% Reconstruct session counter by adding most significant bits from
+%% the current counter to the packet id.
+-spec packet_id_to_seqno(non_neg_integer(), emqx_types:packet_id()) -> non_neg_integer().
+packet_id_to_seqno(NextSeqNo, PacketId) ->
+    N = ((NextSeqNo bsr 16) bsl 16) + PacketId,
+    case N > NextSeqNo of
+        true -> N - 16#10000;
+        false -> N
+    end.
+
+-spec shuffle([A]) -> [A].
+shuffle(L0) ->
+    L1 = lists:map(
+        fun(A) ->
+            {rand:uniform(), A}
+        end,
+        L0
+    ),
+    L2 = lists:sort(L1),
+    {_, L} = lists:unzip(L2),
+    L.
diff --git a/apps/emqx/src/emqx_persistent_session_ds.erl b/apps/emqx/src/emqx_persistent_session_ds.erl
index e456211fc..f3027f500 100644
--- a/apps/emqx/src/emqx_persistent_session_ds.erl
+++ b/apps/emqx/src/emqx_persistent_session_ds.erl
@@ -18,9 +18,12 @@
 -include("emqx.hrl").
 -include_lib("snabbkaffe/include/snabbkaffe.hrl").
+-include_lib("stdlib/include/ms_transform.hrl").
 
 -include("emqx_mqtt.hrl").
 
+-include("emqx_persistent_session_ds.hrl").
+
 %% Session API
 -export([
     create/3,
@@ -50,7 +53,7 @@
 -export([
     deliver/3,
     replay/3,
-    % handle_timeout/3,
+    handle_timeout/3,
     disconnect/1,
     terminate/2
 ]).
@@ -58,33 +61,27 @@
 %% session table operations
 -export([create_tables/0]).
 
--ifdef(TEST).
--export([session_open/1]).
--endif.
-
-%% RPC
--export([
-    ensure_iterator_closed_on_all_shards/1,
-    ensure_all_iterators_closed/1
-]).
+%% Remove me later (satisfy checks for an unused BPAPI)
 -export([
     do_open_iterator/3,
     do_ensure_iterator_closed/1,
     do_ensure_all_iterators_closed/1
 ]).
 
-%% FIXME
--define(DS_SHARD_ID, <<"local">>).
--define(DEFAULT_KEYSPACE, default).
--define(DS_SHARD, {?DEFAULT_KEYSPACE, ?DS_SHARD_ID}).
+-ifdef(TEST).
+-export([session_open/1]).
+-endif.
 
 %% Currently, this is the clientid. We avoid `emqx_types:clientid()' because that can be
 %% an atom, in theory (?).
 -type id() :: binary().
--type iterator() :: emqx_ds:iterator().
--type iterator_id() :: emqx_ds:iterator_id().
 -type topic_filter() :: emqx_ds:topic_filter().
--type iterators() :: #{topic_filter() => iterator()}.
+-type subscription_id() :: {id(), topic_filter()}.
+-type subscription() :: #{
+    start_time := emqx_ds:time(),
+    props := map(),
+    extra := map()
+}.
 -type session() :: #{
     %% Client ID
     id := id(),
@@ -93,11 +90,15 @@
     %% When the session should expire
     expires_at := timestamp() | never,
     %% Client’s Subscriptions.
-    iterators := #{topic() => iterator()},
+    iterators := #{topic() => subscription()},
+    %% Inflight messages
+    inflight := emqx_persistent_message_ds_replayer:inflight(),
     %%
     props := map()
 }.
 
+%% -type session() :: #session{}.
+
 -type timestamp() :: emqx_utils_calendar:epoch_millisecond().
 -type topic() :: emqx_types:topic().
 -type clientinfo() :: emqx_types:clientinfo().
 -type conninfo() :: emqx_types:conninfo().
@@ -106,12 +107,15 @@
 -export_type([id/0]).
 
+-define(PERSISTENT_MESSAGE_DB, emqx_persistent_message).
+
 %%
 
 -spec create(clientinfo(), conninfo(), emqx_session:conf()) ->
     session().
 create(#{clientid := ClientID}, _ConnInfo, Conf) ->
     % TODO: expiration
+    ensure_timers(),
     ensure_session(ClientID, Conf).
-spec open(clientinfo(), conninfo()) -> @@ -126,6 +130,7 @@ open(#{clientid := ClientID}, _ConnInfo) -> ok = emqx_cm:discard_session(ClientID), case open_session(ClientID) of Session = #{} -> + ensure_timers(), {true, Session, []}; false -> false @@ -137,17 +142,17 @@ ensure_session(ClientID, Conf) -> open_session(ClientID) -> case session_open(ClientID) of - {ok, Session, Iterators} -> - Session#{iterators => prep_iterators(Iterators)}; + {ok, Session, Subscriptions} -> + Session#{iterators => prep_subscriptions(Subscriptions)}; false -> false end. -prep_iterators(Iterators) -> +prep_subscriptions(Subscriptions) -> maps:fold( - fun(Topic, Iterator, Acc) -> Acc#{emqx_topic:join(Topic) => Iterator} end, + fun(Topic, Subscription, Acc) -> Acc#{emqx_topic:join(Topic) => Subscription} end, #{}, - Iterators + Subscriptions ). -spec destroy(session() | clientinfo()) -> ok. @@ -157,7 +162,6 @@ destroy(#{clientid := ClientID}) -> destroy_session(ClientID). destroy_session(ClientID) -> - _ = ensure_all_iterators_closed(ClientID), session_drop(ClientID). %%-------------------------------------------------------------------- @@ -245,7 +249,7 @@ unsubscribe( ) when is_map_key(TopicFilter, Iters) -> Iterator = maps:get(TopicFilter, Iters), SubOpts = maps:get(props, Iterator), - ok = del_subscription(TopicFilter, Iterator, ID), + ok = del_subscription(TopicFilter, ID), {ok, Session#{iterators := maps:remove(TopicFilter, Iters)}, SubOpts}; unsubscribe( _TopicFilter, @@ -271,19 +275,29 @@ get_subscription(TopicFilter, #{iterators := Iters}) -> {ok, emqx_types:publish_result(), replies(), session()} | {error, emqx_types:reason_code()}. publish(_PacketId, Msg, Session) -> - % TODO: stub - {ok, emqx_broker:publish(Msg), [], Session}. + %% TODO: + Result = emqx_broker:publish(Msg), + {ok, Result, [], Session}. %%-------------------------------------------------------------------- %% Client -> Broker: PUBACK %%-------------------------------------------------------------------- +%% FIXME: parts of the commit offset function are mocked +-dialyzer({nowarn_function, puback/3}). + -spec puback(clientinfo(), emqx_types:packet_id(), session()) -> {ok, emqx_types:message(), replies(), session()} | {error, emqx_types:reason_code()}. -puback(_ClientInfo, _PacketId, _Session = #{}) -> - % TODO: stub - {error, ?RC_PACKET_IDENTIFIER_NOT_FOUND}. +puback(_ClientInfo, PacketId, Session = #{id := Id, inflight := Inflight0}) -> + case emqx_persistent_message_ds_replayer:commit_offset(Id, PacketId, Inflight0) of + {true, Inflight} -> + %% TODO + Msg = #message{}, + {ok, Msg, [], Session#{inflight => Inflight}}; + {false, _} -> + {error, ?RC_PACKET_IDENTIFIER_NOT_FOUND} + end. %%-------------------------------------------------------------------- %% Client -> Broker: PUBREC @@ -320,10 +334,22 @@ pubcomp(_ClientInfo, _PacketId, _Session = #{}) -> %%-------------------------------------------------------------------- -spec deliver(clientinfo(), [emqx_types:deliver()], session()) -> - no_return(). -deliver(_ClientInfo, _Delivers, _Session = #{}) -> - % TODO: ensure it's unreachable somehow - error(unexpected). + {ok, replies(), session()}. +deliver(_ClientInfo, _Delivers, Session) -> + %% TODO: QoS0 and system messages end up here. + {ok, [], Session}. + +-spec handle_timeout(clientinfo(), _Timeout, session()) -> + {ok, replies(), session()} | {ok, replies(), timeout(), session()}. 
+handle_timeout(_ClientInfo, pull, Session = #{id := Id, inflight := Inflight0}) ->
+    WindowSize = 100,
+    {Publishes, Inflight} = emqx_persistent_message_ds_replayer:poll(Id, Inflight0, WindowSize),
+    ensure_timer(pull),
+    {ok, Publishes, Session#{inflight => Inflight}};
+handle_timeout(_ClientInfo, get_streams, Session = #{id := Id}) ->
+    renew_streams(Id),
+    ensure_timer(get_streams),
+    {ok, [], Session}.
 
 -spec replay(clientinfo(), [], session()) ->
     {ok, replies(), session()}.
@@ -344,151 +370,69 @@ terminate(_Reason, _Session = #{}) ->
 %%--------------------------------------------------------------------
 
 -spec add_subscription(topic(), emqx_types:subopts(), id()) ->
-    emqx_ds:iterator().
+    subscription().
 add_subscription(TopicFilterBin, SubOpts, DSSessionID) ->
-    % N.B.: we chose to update the router before adding the subscription to the
-    % session/iterator table. The reasoning for this is as follows:
-    %
-    % Messages matching this topic filter should start to be persisted as soon as
-    % possible to avoid missing messages. If this is the first such persistent
-    % session subscription, it's important to do so early on.
-    %
-    % This could, in turn, lead to some inconsistency: if such a route gets
-    % created but the session/iterator data fails to be updated accordingly, we
-    % have a dangling route. To remove such dangling routes, we may have a
-    % periodic GC process that removes routes that do not have a matching
-    % persistent subscription. Also, route operations use dirty mnesia
-    % operations, which inherently have room for inconsistencies.
-    %
-    % In practice, we use the iterator reference table as a source of truth,
-    % since it is guarded by a transaction context: we consider a subscription
-    % operation to be successful if it ended up changing this table. Both router
-    % and iterator information can be reconstructed from this table, if needed.
+    %% N.B.: we chose to update the router before adding the subscription to the
+    %% session/subscription table. The reasoning for this is as follows:
+    %%
+    %% Messages matching this topic filter should start to be persisted as soon as
+    %% possible to avoid missing messages. If this is the first such persistent
+    %% session subscription, it's important to do so early on.
+    %%
+    %% This could, in turn, lead to some inconsistency: if such a route gets
+    %% created but the session/subscription data fails to be updated accordingly, we
+    %% have a dangling route. To remove such dangling routes, we may have a
+    %% periodic GC process that removes routes that do not have a matching
+    %% persistent subscription. Also, route operations use dirty mnesia
+    %% operations, which inherently have room for inconsistencies.
+    %%
+    %% In practice, we use the subscription table as a source of truth,
+    %% since it is guarded by a transaction context: we consider a subscription
+    %% operation to be successful if it ended up changing this table. Both router
+    %% and subscription information can be reconstructed from this table, if needed.
     ok = emqx_persistent_session_ds_router:do_add_route(TopicFilterBin, DSSessionID),
     TopicFilter = emqx_topic:words(TopicFilterBin),
-    {ok, Iterator, IsNew} = session_add_iterator(
+    {ok, DSSubExt, IsNew} = session_add_subscription(
         DSSessionID, TopicFilter, SubOpts
     ),
-    Ctx = #{iterator => Iterator, is_new => IsNew},
-    ?tp(persistent_session_ds_iterator_added, Ctx),
-    ?tp_span(
-        persistent_session_ds_open_iterators,
-        Ctx,
-        ok = open_iterator_on_all_shards(TopicFilter, Iterator)
-    ),
-    Iterator.
+ ?tp(persistent_session_ds_subscription_added, #{sub => DSSubExt, is_new => IsNew}), + %% we'll list streams and open iterators when implementing message replay. + DSSubExt. --spec update_subscription(topic(), iterator(), emqx_types:subopts(), id()) -> - iterator(). -update_subscription(TopicFilterBin, Iterator, SubOpts, DSSessionID) -> +-spec update_subscription(topic(), subscription(), emqx_types:subopts(), id()) -> + subscription(). +update_subscription(TopicFilterBin, DSSubExt, SubOpts, DSSessionID) -> TopicFilter = emqx_topic:words(TopicFilterBin), - {ok, NIterator, false} = session_add_iterator( + {ok, NDSSubExt, false} = session_add_subscription( DSSessionID, TopicFilter, SubOpts ), - ok = ?tp(persistent_session_ds_iterator_updated, #{iterator => Iterator}), - NIterator. + ok = ?tp(persistent_session_ds_iterator_updated, #{sub => DSSubExt}), + NDSSubExt. --spec open_iterator_on_all_shards(emqx_types:words(), emqx_ds:iterator()) -> ok. -open_iterator_on_all_shards(TopicFilter, Iterator) -> - ?tp(persistent_session_ds_will_open_iterators, #{iterator => Iterator}), - %% Note: currently, shards map 1:1 to nodes, but this will change in the future. - Nodes = emqx:running_nodes(), - Results = emqx_persistent_session_ds_proto_v1:open_iterator( - Nodes, - TopicFilter, - maps:get(start_time, Iterator), - maps:get(id, Iterator) - ), - %% TODO - %% 1. Handle errors. - %% 2. Iterator handles are rocksdb resources, it's doubtful they survive RPC. - %% Even if they do, we throw them away here anyway. All in all, we probably should - %% hold each of them in a process on the respective node. - true = lists:all(fun(Res) -> element(1, Res) =:= ok end, Results), +-spec del_subscription(topic(), id()) -> ok. - -%% RPC target. --spec do_open_iterator(emqx_types:words(), emqx_ds:time(), emqx_ds:iterator_id()) -> - {ok, emqx_ds_storage_layer:iterator()} | {error, _Reason}. -do_open_iterator(TopicFilter, StartMS, IteratorID) -> - Replay = {TopicFilter, StartMS}, - emqx_ds_storage_layer:ensure_iterator(?DS_SHARD, IteratorID, Replay). - --spec del_subscription(topic(), iterator(), id()) -> - ok. -del_subscription(TopicFilterBin, #{id := IteratorID}, DSSessionID) -> - % N.B.: see comments in `?MODULE:add_subscription' for a discussion about the - % order of operations here. +del_subscription(TopicFilterBin, DSSessionId) -> TopicFilter = emqx_topic:words(TopicFilterBin), - Ctx = #{iterator_id => IteratorID}, ?tp_span( - persistent_session_ds_close_iterators, - Ctx, - ok = ensure_iterator_closed_on_all_shards(IteratorID) + persistent_session_ds_subscription_delete, + #{session_id => DSSessionId}, + ok = session_del_subscription(DSSessionId, TopicFilter) ), ?tp_span( - persistent_session_ds_iterator_delete, - Ctx, - session_del_iterator(DSSessionID, TopicFilter) - ), - ok = emqx_persistent_session_ds_router:do_delete_route(TopicFilterBin, DSSessionID). - --spec ensure_iterator_closed_on_all_shards(emqx_ds:iterator_id()) -> ok. -ensure_iterator_closed_on_all_shards(IteratorID) -> - %% Note: currently, shards map 1:1 to nodes, but this will change in the future. - Nodes = emqx:running_nodes(), - Results = emqx_persistent_session_ds_proto_v1:close_iterator(Nodes, IteratorID), - %% TODO: handle errors - true = lists:all(fun(Res) -> Res =:= {ok, ok} end, Results), - ok. - -%% RPC target. --spec do_ensure_iterator_closed(emqx_ds:iterator_id()) -> ok. -do_ensure_iterator_closed(IteratorID) -> - ok = emqx_ds_storage_layer:discard_iterator(?DS_SHARD, IteratorID), - ok. - --spec ensure_all_iterators_closed(id()) -> ok. 
-ensure_all_iterators_closed(DSSessionID) -> - %% Note: currently, shards map 1:1 to nodes, but this will change in the future. - Nodes = emqx:running_nodes(), - Results = emqx_persistent_session_ds_proto_v1:close_all_iterators(Nodes, DSSessionID), - %% TODO: handle errors - true = lists:all(fun(Res) -> Res =:= {ok, ok} end, Results), - ok. - -%% RPC target. --spec do_ensure_all_iterators_closed(id()) -> ok. -do_ensure_all_iterators_closed(DSSessionID) -> - ok = emqx_ds_storage_layer:discard_iterator_prefix(?DS_SHARD, DSSessionID), - ok. + persistent_session_ds_subscription_route_delete, + #{session_id => DSSessionId}, + ok = emqx_persistent_session_ds_router:do_delete_route(TopicFilterBin, DSSessionId) + ). %%-------------------------------------------------------------------- %% Session tables operations %%-------------------------------------------------------------------- --define(SESSION_TAB, emqx_ds_session). --define(ITERATOR_REF_TAB, emqx_ds_iterator_ref). --define(DS_MRIA_SHARD, emqx_ds_shard). - --record(session, { - %% same as clientid - id :: id(), - %% creation time - created_at :: _Millisecond :: non_neg_integer(), - expires_at = never :: _Millisecond :: non_neg_integer() | never, - %% for future usage - props = #{} :: map() -}). - --record(iterator_ref, { - ref_id :: {id(), emqx_ds:topic_filter()}, - it_id :: emqx_ds:iterator_id(), - start_time :: emqx_ds:time(), - props = #{} :: map() -}). - create_tables() -> + ok = emqx_ds:open_db(?PERSISTENT_MESSAGE_DB, #{ + backend => builtin, + storage => {emqx_ds_storage_bitfield_lts, #{}} + }), ok = mria:create_table( ?SESSION_TAB, [ @@ -500,15 +444,38 @@ create_tables() -> ] ), ok = mria:create_table( - ?ITERATOR_REF_TAB, + ?SESSION_SUBSCRIPTIONS_TAB, [ {rlog_shard, ?DS_MRIA_SHARD}, {type, ordered_set}, {storage, storage()}, - {record_name, iterator_ref}, - {attributes, record_info(fields, iterator_ref)} + {record_name, ds_sub}, + {attributes, record_info(fields, ds_sub)} ] ), + ok = mria:create_table( + ?SESSION_STREAM_TAB, + [ + {rlog_shard, ?DS_MRIA_SHARD}, + {type, bag}, + {storage, storage()}, + {record_name, ds_stream}, + {attributes, record_info(fields, ds_stream)} + ] + ), + ok = mria:create_table( + ?SESSION_ITER_TAB, + [ + {rlog_shard, ?DS_MRIA_SHARD}, + {type, set}, + {storage, storage()}, + {record_name, ds_iter}, + {attributes, record_info(fields, ds_iter)} + ] + ), + ok = mria:wait_for_tables([ + ?SESSION_TAB, ?SESSION_SUBSCRIPTIONS_TAB, ?SESSION_STREAM_TAB, ?SESSION_ITER_TAB + ]), ok. -dialyzer({nowarn_function, storage/0}). @@ -529,26 +496,26 @@ storage() -> %% Note: session API doesn't handle session takeovers, it's the job of %% the broker. -spec session_open(id()) -> - {ok, session(), iterators()} | false. + {ok, session(), #{topic() => subscription()}} | false. session_open(SessionId) -> transaction(fun() -> case mnesia:read(?SESSION_TAB, SessionId, write) of [Record = #session{}] -> - Session = export_record(Record), - IteratorRefs = session_read_iterators(SessionId), - Iterators = export_iterators(IteratorRefs), - {ok, Session, Iterators}; + Session = export_session(Record), + DSSubs = session_read_subscriptions(SessionId), + Subscriptions = export_subscriptions(DSSubs), + {ok, Session, Subscriptions}; [] -> false end end). -spec session_ensure_new(id(), _Props :: map()) -> - {ok, session(), iterators()}. + {ok, session(), #{topic() => subscription()}}. 
session_ensure_new(SessionId, Props) -> transaction(fun() -> - ok = session_drop_iterators(SessionId), - Session = export_record(session_create(SessionId, Props)), + ok = session_drop_subscriptions(SessionId), + Session = export_session(session_create(SessionId, Props)), {ok, Session, #{}} end). @@ -557,7 +524,8 @@ session_create(SessionId, Props) -> id = SessionId, created_at = erlang:system_time(millisecond), expires_at = never, - props = Props + props = Props, + inflight = emqx_persistent_message_ds_replayer:new() }, ok = mnesia:write(?SESSION_TAB, Session, write), Session. @@ -568,80 +536,143 @@ session_create(SessionId, Props) -> session_drop(DSSessionId) -> transaction(fun() -> %% TODO: ensure all iterators from this clientid are closed? - ok = session_drop_iterators(DSSessionId), + ok = session_drop_subscriptions(DSSessionId), ok = mnesia:delete(?SESSION_TAB, DSSessionId, write) end). -session_drop_iterators(DSSessionId) -> - IteratorRefs = session_read_iterators(DSSessionId), - ok = lists:foreach(fun session_del_iterator/1, IteratorRefs). +session_drop_subscriptions(DSSessionId) -> + IteratorRefs = session_read_subscriptions(DSSessionId), + ok = lists:foreach(fun session_del_subscription/1, IteratorRefs). %% @doc Called when a client subscribes to a topic. Idempotent. --spec session_add_iterator(id(), topic_filter(), _Props :: map()) -> - {ok, iterator(), _IsNew :: boolean()}. -session_add_iterator(DSSessionId, TopicFilter, Props) -> - IteratorRefId = {DSSessionId, TopicFilter}, +-spec session_add_subscription(id(), topic_filter(), _Props :: map()) -> + {ok, subscription(), _IsNew :: boolean()}. +session_add_subscription(DSSessionId, TopicFilter, Props) -> + DSSubId = {DSSessionId, TopicFilter}, transaction(fun() -> - case mnesia:read(?ITERATOR_REF_TAB, IteratorRefId, write) of + case mnesia:read(?SESSION_SUBSCRIPTIONS_TAB, DSSubId, write) of [] -> - IteratorRef = session_insert_iterator(DSSessionId, TopicFilter, Props), - Iterator = export_record(IteratorRef), + DSSub = session_insert_subscription(DSSessionId, TopicFilter, Props), + DSSubExt = export_subscription(DSSub), ?tp( ds_session_subscription_added, - #{iterator => Iterator, session_id => DSSessionId} + #{sub => DSSubExt, session_id => DSSessionId} ), - {ok, Iterator, _IsNew = true}; - [#iterator_ref{} = IteratorRef] -> - NIteratorRef = session_update_iterator(IteratorRef, Props), - NIterator = export_record(NIteratorRef), + {ok, DSSubExt, _IsNew = true}; + [#ds_sub{} = DSSub] -> + NDSSub = session_update_subscription(DSSub, Props), + NDSSubExt = export_subscription(NDSSub), ?tp( ds_session_subscription_present, - #{iterator => NIterator, session_id => DSSessionId} + #{sub => NDSSubExt, session_id => DSSessionId} ), - {ok, NIterator, _IsNew = false} + {ok, NDSSubExt, _IsNew = false} end end). -session_insert_iterator(DSSessionId, TopicFilter, Props) -> - {IteratorId, StartMS} = new_iterator_id(DSSessionId), - IteratorRef = #iterator_ref{ - ref_id = {DSSessionId, TopicFilter}, - it_id = IteratorId, +-spec session_insert_subscription(id(), topic_filter(), map()) -> ds_sub(). +session_insert_subscription(DSSessionId, TopicFilter, Props) -> + {DSSubId, StartMS} = new_subscription_id(DSSessionId, TopicFilter), + DSSub = #ds_sub{ + id = DSSubId, start_time = StartMS, - props = Props + props = Props, + extra = #{} }, - ok = mnesia:write(?ITERATOR_REF_TAB, IteratorRef, write), - IteratorRef. + ok = mnesia:write(?SESSION_SUBSCRIPTIONS_TAB, DSSub, write), + DSSub. 
-session_update_iterator(IteratorRef, Props) ->
-    NIteratorRef = IteratorRef#iterator_ref{props = Props},
-    ok = mnesia:write(?ITERATOR_REF_TAB, NIteratorRef, write),
-    NIteratorRef.
+-spec session_update_subscription(ds_sub(), map()) -> ds_sub().
+session_update_subscription(DSSub, Props) ->
+    NDSSub = DSSub#ds_sub{props = Props},
+    ok = mnesia:write(?SESSION_SUBSCRIPTIONS_TAB, NDSSub, write),
+    NDSSub.
 
-%% @doc Called when a client unsubscribes from a topic.
--spec session_del_iterator(id(), topic_filter()) -> ok.
-session_del_iterator(DSSessionId, TopicFilter) ->
-    IteratorRefId = {DSSessionId, TopicFilter},
+session_del_subscription(DSSessionId, TopicFilter) ->
+    DSSubId = {DSSessionId, TopicFilter},
     transaction(fun() ->
-        mnesia:delete(?ITERATOR_REF_TAB, IteratorRefId, write)
+        mnesia:delete(?SESSION_SUBSCRIPTIONS_TAB, DSSubId, write)
    end).
 
-session_del_iterator(#iterator_ref{ref_id = IteratorRefId}) ->
-    mnesia:delete(?ITERATOR_REF_TAB, IteratorRefId, write).
+session_del_subscription(#ds_sub{id = DSSubId}) ->
+    mnesia:delete(?SESSION_SUBSCRIPTIONS_TAB, DSSubId, write).
 
-session_read_iterators(DSSessionId) ->
-    % NOTE: somewhat convoluted way to trick dialyzer
-    Pat = erlang:make_tuple(record_info(size, iterator_ref), '_', [
-        {1, iterator_ref},
-        {#iterator_ref.ref_id, {DSSessionId, '_'}}
-    ]),
-    mnesia:match_object(?ITERATOR_REF_TAB, Pat, read).
+session_read_subscriptions(DSSessionId) ->
+    MS = ets:fun2ms(
+        fun(Sub = #ds_sub{id = {Sess, _}}) when Sess =:= DSSessionId ->
+            Sub
+        end
+    ),
+    mnesia:select(?SESSION_SUBSCRIPTIONS_TAB, MS, read).
 
--spec new_iterator_id(id()) -> {iterator_id(), emqx_ds:time()}.
-new_iterator_id(DSSessionId) ->
-    NowMS = erlang:system_time(microsecond),
-    IteratorId = <<DSSessionId/binary, (emqx_guid:to_hexstr(emqx_guid:gen()))/binary>>,
-    {IteratorId, NowMS}.
+-spec new_subscription_id(id(), topic_filter()) -> {subscription_id(), integer()}.
+new_subscription_id(DSSessionId, TopicFilter) ->
+    %% Note: here we use _milliseconds_ to match with the timestamp
+    %% field of `#message' record.
+    NowMS = erlang:system_time(millisecond),
+    DSSubId = {DSSessionId, TopicFilter},
+    {DSSubId, NowMS}.
+
+%%--------------------------------------------------------------------
+%% RPC targets (v1)
+%%--------------------------------------------------------------------
+
+%% RPC target.
+-spec do_open_iterator(emqx_types:words(), emqx_ds:time(), emqx_ds:iterator_id()) ->
+    {ok, emqx_ds_storage_layer:iterator()} | {error, _Reason}.
+do_open_iterator(_TopicFilter, _StartMS, _IteratorID) ->
+    {error, not_implemented}.
+
+%% RPC target.
+-spec do_ensure_iterator_closed(emqx_ds:iterator_id()) -> ok.
+do_ensure_iterator_closed(_IteratorID) ->
+    ok.
+
+%% RPC target.
+-spec do_ensure_all_iterators_closed(id()) -> ok.
+do_ensure_all_iterators_closed(_DSSessionID) ->
+    ok.
+
+%%--------------------------------------------------------------------
+%% Reading batches
+%%--------------------------------------------------------------------
+
+renew_streams(Id) ->
+    Subscriptions = ro_transaction(fun() -> session_read_subscriptions(Id) end),
+    ExistingStreams = ro_transaction(fun() -> mnesia:read(?SESSION_STREAM_TAB, Id) end),
+    lists:foreach(
+        fun(#ds_sub{id = {_, TopicFilter}, start_time = StartTime}) ->
+            renew_streams(Id, ExistingStreams, TopicFilter, StartTime)
+        end,
+        Subscriptions
+    ).
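+
+%% For one subscription: list the streams that currently match its
+%% topic filter, record those not seen before, and seed each new
+%% stream with an iterator positioned at the subscription start time.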
+ +renew_streams(Id, ExistingStreams, TopicFilter, StartTime) -> + AllStreams = emqx_ds:get_streams(?PERSISTENT_MESSAGE_DB, TopicFilter, StartTime), + transaction( + fun() -> + lists:foreach( + fun({Rank, Stream}) -> + Rec = #ds_stream{ + session = Id, + topic_filter = TopicFilter, + stream = Stream, + rank = Rank + }, + case lists:member(Rec, ExistingStreams) of + true -> + ok; + false -> + mnesia:write(?SESSION_STREAM_TAB, Rec, write), + {ok, Iterator} = emqx_ds:make_iterator(Stream, TopicFilter, StartTime), + IterRec = #ds_iter{id = {Id, Stream}, iter = Iterator}, + mnesia:write(?SESSION_ITER_TAB, IterRec, write) + end + end, + AllStreams + ) + end + ). %%-------------------------------------------------------------------------------- @@ -649,23 +680,39 @@ transaction(Fun) -> {atomic, Res} = mria:transaction(?DS_MRIA_SHARD, Fun), Res. +ro_transaction(Fun) -> + {atomic, Res} = mria:ro_transaction(?DS_MRIA_SHARD, Fun), + Res. + %%-------------------------------------------------------------------------------- -export_iterators(IteratorRefs) -> +export_subscriptions(DSSubs) -> lists:foldl( - fun(IteratorRef = #iterator_ref{ref_id = {_DSSessionId, TopicFilter}}, Acc) -> - Acc#{TopicFilter => export_record(IteratorRef)} + fun(DSSub = #ds_sub{id = {_DSSessionId, TopicFilter}}, Acc) -> + Acc#{TopicFilter => export_subscription(DSSub)} end, #{}, - IteratorRefs + DSSubs ). -export_record(#session{} = Record) -> - export_record(Record, #session.id, [id, created_at, expires_at, props], #{}); -export_record(#iterator_ref{} = Record) -> - export_record(Record, #iterator_ref.it_id, [id, start_time, props], #{}). +export_session(#session{} = Record) -> + export_record(Record, #session.id, [id, created_at, expires_at, inflight, props], #{}). + +export_subscription(#ds_sub{} = Record) -> + export_record(Record, #ds_sub.start_time, [start_time, props, extra], #{}). export_record(Record, I, [Field | Rest], Acc) -> export_record(Record, I + 1, Rest, Acc#{Field => element(I, Record)}); export_record(_, _, [], Acc) -> Acc. + +%% TODO: find a more reliable way to perform actions that have side +%% effects. Add `CBM:init' callback to the session behavior? +ensure_timers() -> + ensure_timer(pull), + ensure_timer(get_streams). + +-spec ensure_timer(pull | get_streams) -> ok. +ensure_timer(Type) -> + _ = emqx_utils:start_timer(100, {emqx_session, Type}), + ok. diff --git a/apps/emqx/src/emqx_persistent_session_ds.hrl b/apps/emqx/src/emqx_persistent_session_ds.hrl new file mode 100644 index 000000000..54b077795 --- /dev/null +++ b/apps/emqx/src/emqx_persistent_session_ds.hrl @@ -0,0 +1,56 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- +-ifndef(EMQX_PERSISTENT_SESSION_DS_HRL_HRL). +-define(EMQX_PERSISTENT_SESSION_DS_HRL_HRL, true). + +-define(SESSION_TAB, emqx_ds_session). 
+-define(SESSION_SUBSCRIPTIONS_TAB, emqx_ds_session_subscriptions). +-define(SESSION_STREAM_TAB, emqx_ds_stream_tab). +-define(SESSION_ITER_TAB, emqx_ds_iter_tab). +-define(DS_MRIA_SHARD, emqx_ds_session_shard). + +-record(ds_sub, { + id :: emqx_persistent_session_ds:subscription_id(), + start_time :: emqx_ds:time(), + props = #{} :: map(), + extra = #{} :: map() +}). +-type ds_sub() :: #ds_sub{}. + +-record(ds_stream, { + session :: emqx_persistent_session_ds:id(), + topic_filter :: emqx_ds:topic_filter(), + stream :: emqx_ds:stream(), + rank :: emqx_ds:stream_rank() +}). + +-record(ds_iter, { + id :: {emqx_persistent_session_ds:id(), emqx_ds:stream()}, + iter :: emqx_ds:iterator() +}). + +-record(session, { + %% same as clientid + id :: emqx_persistent_session_ds:id(), + %% creation time + created_at :: _Millisecond :: non_neg_integer(), + expires_at = never :: _Millisecond :: non_neg_integer() | never, + inflight :: emqx_persistent_message_ds_replayer:inflight(), + %% for future usage + props = #{} :: map() +}). + +-endif. diff --git a/apps/emqx/src/proto/emqx_persistent_session_ds_proto_v1.erl b/apps/emqx/src/proto/emqx_persistent_session_ds_proto_v1.erl index d35ccd963..e879b495c 100644 --- a/apps/emqx/src/proto/emqx_persistent_session_ds_proto_v1.erl +++ b/apps/emqx/src/proto/emqx_persistent_session_ds_proto_v1.erl @@ -20,6 +20,7 @@ -export([ introduced_in/0, + deprecated_since/0, open_iterator/4, close_iterator/2, @@ -31,9 +32,11 @@ -define(TIMEOUT, 30_000). introduced_in() -> - %% FIXME "5.3.0". +deprecated_since() -> + "5.4.0". + -spec open_iterator( [node()], emqx_types:words(), diff --git a/apps/emqx/test/emqx_persistent_messages_SUITE.erl b/apps/emqx/test/emqx_persistent_messages_SUITE.erl index 751b7e4b8..52ba090b5 100644 --- a/apps/emqx/test/emqx_persistent_messages_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_messages_SUITE.erl @@ -26,9 +26,7 @@ -import(emqx_common_test_helpers, [on_exit/1]). --define(DEFAULT_KEYSPACE, default). --define(DS_SHARD_ID, <<"local">>). --define(DS_SHARD, {?DEFAULT_KEYSPACE, ?DS_SHARD_ID}). +-define(PERSISTENT_MESSAGE_DB, emqx_persistent_message). all() -> emqx_common_test_helpers:all(?MODULE). @@ -48,6 +46,7 @@ init_per_testcase(t_session_subscription_iterators = TestCase, Config) -> Nodes = emqx_cth_cluster:start(Cluster, #{work_dir => emqx_cth_suite:work_dir(TestCase, Config)}), [{nodes, Nodes} | Config]; init_per_testcase(TestCase, Config) -> + ok = emqx_ds:drop_db(?PERSISTENT_MESSAGE_DB), Apps = emqx_cth_suite:start( app_specs(), #{work_dir => emqx_cth_suite:work_dir(TestCase, Config)} @@ -58,10 +57,11 @@ end_per_testcase(t_session_subscription_iterators, Config) -> Nodes = ?config(nodes, Config), emqx_common_test_helpers:call_janitor(60_000), ok = emqx_cth_cluster:stop(Nodes), - ok; + end_per_testcase(common, Config); end_per_testcase(_TestCase, Config) -> - Apps = ?config(apps, Config), + Apps = proplists:get_value(apps, Config, []), emqx_common_test_helpers:call_janitor(60_000), + clear_db(), emqx_cth_suite:stop(Apps), ok. 
@@ -95,14 +95,15 @@ t_messages_persisted(_Config) -> Results = [emqtt:publish(CP, Topic, Payload, 1) || {Topic, Payload} <- Messages], ct:pal("Results = ~p", [Results]), + timer:sleep(2000), - Persisted = consume(?DS_SHARD, {['#'], 0}), + Persisted = consume(['#'], 0), ct:pal("Persisted = ~p", [Persisted]), ?assertEqual( - [M1, M2, M5, M7, M9, M10], - [{emqx_message:topic(M), emqx_message:payload(M)} || M <- Persisted] + lists:sort([M1, M2, M5, M7, M9, M10]), + lists:sort([{emqx_message:topic(M), emqx_message:payload(M)} || M <- Persisted]) ), ok. @@ -139,23 +140,25 @@ t_messages_persisted_2(_Config) -> {ok, #{reason_code := ?RC_NO_MATCHING_SUBSCRIBERS}} = emqtt:publish(CP, T(<<"client/2/topic">>), <<"8">>, 1), - Persisted = consume(?DS_SHARD, {['#'], 0}), + timer:sleep(2000), + + Persisted = consume(['#'], 0), ct:pal("Persisted = ~p", [Persisted]), ?assertEqual( - [ + lists:sort([ {T(<<"client/1/topic">>), <<"4">>}, {T(<<"client/2/topic">>), <<"5">>} - ], - [{emqx_message:topic(M), emqx_message:payload(M)} || M <- Persisted] + ]), + lists:sort([{emqx_message:topic(M), emqx_message:payload(M)} || M <- Persisted]) ), ok. %% TODO: test quic and ws too t_session_subscription_iterators(Config) -> - [Node1, Node2] = ?config(nodes, Config), + [Node1, _Node2] = ?config(nodes, Config), Port = get_mqtt_port(Node1, tcp), Topic = <<"t/topic">>, SubTopicFilter = <<"t/+">>, @@ -202,11 +205,8 @@ t_session_subscription_iterators(Config) -> messages => [Message1, Message2, Message3, Message4] } end, - fun(Results, Trace) -> + fun(Trace) -> ct:pal("trace:\n ~p", [Trace]), - #{ - messages := [_Message1, Message2, Message3 | _] - } = Results, case ?of_kind(ds_session_subscription_added, Trace) of [] -> %% Since `emqx_durable_storage' is a dependency of `emqx', it gets @@ -228,17 +228,6 @@ t_session_subscription_iterators(Config) -> ), ok end, - ?assertMatch({ok, [_]}, get_all_iterator_ids(Node1)), - {ok, [IteratorId]} = get_all_iterator_ids(Node1), - ?assertMatch({ok, [IteratorId]}, get_all_iterator_ids(Node2)), - ReplayMessages1 = erpc:call(Node1, fun() -> consume(?DS_SHARD, IteratorId) end), - ExpectedMessages = [Message2, Message3], - %% Note: it is expected that this will break after replayers are in place. - %% They might have consumed all the messages by this time. - ?assertEqual(ExpectedMessages, ReplayMessages1), - %% Different DS shard - ReplayMessages2 = erpc:call(Node2, fun() -> consume(?DS_SHARD, IteratorId) end), - ?assertEqual([], ReplayMessages2), ok end ), @@ -263,33 +252,26 @@ connect(Opts0 = #{}) -> {ok, _} = emqtt:connect(Client), Client. -consume(Shard, Replay = {_TopicFiler, _StartMS}) -> - {ok, It} = emqx_ds_storage_layer:make_iterator(Shard, Replay), - consume(It); -consume(Shard, IteratorId) when is_binary(IteratorId) -> - {ok, It} = emqx_ds_storage_layer:restore_iterator(Shard, IteratorId), - consume(It). +consume(TopicFilter, StartMS) -> + Streams = emqx_ds:get_streams(?PERSISTENT_MESSAGE_DB, TopicFilter, StartMS), + lists:flatmap( + fun({_Rank, Stream}) -> + {ok, It} = emqx_ds:make_iterator(Stream, TopicFilter, StartMS), + consume(It) + end, + Streams + ). consume(It) -> - case emqx_ds_storage_layer:next(It) of - {value, Msg, NIt} -> - [emqx_persistent_message:deserialize(Msg) | consume(NIt)]; - none -> + case emqx_ds:next(It, 100) of + {ok, _NIt, _Msgs = []} -> + []; + {ok, NIt, Msgs} -> + Msgs ++ consume(NIt); + {ok, end_of_stream} -> [] end. 
-delete_all_messages() -> - Persisted = consume(?DS_SHARD, {['#'], 0}), - lists:foreach( - fun(Msg) -> - GUID = emqx_message:id(Msg), - Topic = emqx_topic:words(emqx_message:topic(Msg)), - Timestamp = emqx_guid:timestamp(GUID), - ok = emqx_ds_storage_layer:delete(?DS_SHARD, GUID, Timestamp, Topic) - end, - Persisted - ). - receive_messages(Count) -> receive_messages(Count, []). @@ -306,13 +288,6 @@ receive_messages(Count, Msgs) -> publish(Node, Message) -> erpc:call(Node, emqx, publish, [Message]). -get_iterator_ids(Node, ClientId) -> - Channel = erpc:call(Node, fun() -> - [ConnPid] = emqx_cm:lookup_channels(ClientId), - sys:get_state(ConnPid) - end), - emqx_connection:info({channel, {session, iterators}}, Channel). - app_specs() -> [ emqx_durable_storage, @@ -330,5 +305,6 @@ get_mqtt_port(Node, Type) -> {_IP, Port} = erpc:call(Node, emqx_config, get, [[listeners, Type, default, bind]]), Port. -get_all_iterator_ids(Node) -> - erpc:call(Node, emqx_ds_storage_layer, list_iterator_prefix, [?DS_SHARD, <<>>]). +clear_db() -> + ok = emqx_ds:drop_db(?PERSISTENT_MESSAGE_DB), + ok. diff --git a/apps/emqx/test/emqx_persistent_session_SUITE.erl b/apps/emqx/test/emqx_persistent_session_SUITE.erl index be3bf6e6a..5a14e0bc9 100644 --- a/apps/emqx/test/emqx_persistent_session_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_session_SUITE.erl @@ -24,6 +24,8 @@ -compile(export_all). -compile(nowarn_export_all). +-define(PERSISTENT_MESSAGE_DB, emqx_persistent_message). + %%-------------------------------------------------------------------- %% SUITE boilerplate %%-------------------------------------------------------------------- @@ -131,6 +133,7 @@ get_listener_port(Type, Name) -> end_per_group(Group, Config) when Group == tcp; Group == ws; Group == quic -> ok = emqx_cth_suite:stop(?config(group_apps, Config)); end_per_group(_, _Config) -> + ok = emqx_ds:drop_db(?PERSISTENT_MESSAGE_DB), ok. init_per_testcase(TestCase, Config) -> @@ -188,7 +191,7 @@ receive_messages(Count, Msgs) -> receive_messages(Count - 1, [Msg | Msgs]); _Other -> receive_messages(Count, Msgs) - after 5000 -> + after 15000 -> Msgs end. @@ -227,11 +230,11 @@ wait_for_cm_unregister(ClientId, N) -> end. publish(Topic, Payloads) -> - publish(Topic, Payloads, false). + publish(Topic, Payloads, false, 2). -publish(Topic, Payloads, WaitForUnregister) -> +publish(Topic, Payloads, WaitForUnregister, QoS) -> Fun = fun(Client, Payload) -> - {ok, _} = emqtt:publish(Client, Topic, Payload, 2) + {ok, _} = emqtt:publish(Client, Topic, Payload, QoS) end, do_publish(Payloads, Fun, WaitForUnregister). @@ -510,6 +513,48 @@ t_process_dies_session_expires(Config) -> emqtt:disconnect(Client2). +t_publish_while_client_is_gone_qos1(Config) -> + %% A persistent session should receive messages in its + %% subscription even if the process owning the session dies. 
+ ConnFun = ?config(conn_fun, Config), + Topic = ?config(topic, Config), + STopic = ?config(stopic, Config), + Payload1 = <<"hello1">>, + Payload2 = <<"hello2">>, + ClientId = ?config(client_id, Config), + {ok, Client1} = emqtt:start_link([ + {proto_ver, v5}, + {clientid, ClientId}, + {properties, #{'Session-Expiry-Interval' => 30}}, + {clean_start, true} + | Config + ]), + {ok, _} = emqtt:ConnFun(Client1), + {ok, _, [1]} = emqtt:subscribe(Client1, STopic, qos1), + + ok = emqtt:disconnect(Client1), + maybe_kill_connection_process(ClientId, Config), + + ok = publish(Topic, [Payload1, Payload2], false, 1), + + {ok, Client2} = emqtt:start_link([ + {proto_ver, v5}, + {clientid, ClientId}, + {properties, #{'Session-Expiry-Interval' => 30}}, + {clean_start, false} + | Config + ]), + {ok, _} = emqtt:ConnFun(Client2), + Msgs = receive_messages(2), + ?assertMatch([_, _], Msgs), + [Msg2, Msg1] = Msgs, + ?assertEqual({ok, iolist_to_binary(Payload1)}, maps:find(payload, Msg1)), + ?assertEqual({ok, 1}, maps:find(qos, Msg1)), + ?assertEqual({ok, iolist_to_binary(Payload2)}, maps:find(payload, Msg2)), + ?assertEqual({ok, 1}, maps:find(qos, Msg2)), + + ok = emqtt:disconnect(Client2). + t_publish_while_client_is_gone(init, Config) -> skip_ds_tc(Config); t_publish_while_client_is_gone('end', _Config) -> ok. t_publish_while_client_is_gone(Config) -> diff --git a/apps/emqx_durable_storage/IMPLEMENTATION.md b/apps/emqx_durable_storage/IMPLEMENTATION.md index 9c0c5928c..33f02dfc4 100644 --- a/apps/emqx_durable_storage/IMPLEMENTATION.md +++ b/apps/emqx_durable_storage/IMPLEMENTATION.md @@ -31,48 +31,6 @@ Read pattern: pseudoserial Number of records: O(total write throughput * retention time) -## Session storage - -Data there is updated when: - -- A new client connects with clean session = false -- Client subscribes to a topic -- Client unsubscribes to a topic -- Garbage collection is performed - -Write throughput: low - -Data is read when a client connects and replay agents are started - -Read throughput: low - -Data format: - -`#session{clientId = "foobar", iterators = [ItKey1, ItKey2, ItKey3, ...]}` - -Number of records: O(N clients) - -Size of record: O(N subscriptions per clients) - -## Iterator storage - -Data is written every time a client acks a message. - -Data is read when a client reconnects and we restart replay agents. - -`#iterator{key = IterKey, data = Blob}` - -Number of records: O(N clients * N subscriptions per client) - -Size of record: O(1) - -Write throughput: high, lots of small updates - -Write pattern: mostly key overwrite - -Read throughput: low - -Read pattern: random # Push vs. Pull model diff --git a/apps/emqx_durable_storage/README.md b/apps/emqx_durable_storage/README.md index 7de43bee0..f01af0c37 100644 --- a/apps/emqx_durable_storage/README.md +++ b/apps/emqx_durable_storage/README.md @@ -1,9 +1,10 @@ # EMQX Replay -`emqx_ds` is a durable storage for MQTT messages within EMQX. -It implements the following scenarios: -- Persisting messages published by clients -- +`emqx_ds` is a generic durable storage for MQTT messages within EMQX. + +Concepts: + + > 0. App overview introduction > 1. let people know what your project can do specifically. 
Is it a base
diff --git a/apps/emqx_durable_storage/include/emqx_ds.hrl b/apps/emqx_durable_storage/include/emqx_ds.hrl
new file mode 100644
index 000000000..c9ee4b7f7
--- /dev/null
+++ b/apps/emqx_durable_storage/include/emqx_ds.hrl
@@ -0,0 +1,19 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+-ifndef(EMQX_DS_HRL).
+-define(EMQX_DS_HRL, true).
+
+-endif.
diff --git a/apps/emqx_durable_storage/rebar.config b/apps/emqx_durable_storage/rebar.config
new file mode 100644
index 000000000..f04819025
--- /dev/null
+++ b/apps/emqx_durable_storage/rebar.config
@@ -0,0 +1,3 @@
+%% -*- mode:erlang -*-
+{deps,
+ [{emqx_utils, {path, "../emqx_utils"}}]}.
diff --git a/apps/emqx_durable_storage/src/emqx_ds.erl b/apps/emqx_durable_storage/src/emqx_ds.erl
index feaa37bc0..27a0745bc 100644
--- a/apps/emqx_durable_storage/src/emqx_ds.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds.erl
@@ -13,50 +13,48 @@
 %% See the License for the specific language governing permissions and
 %% limitations under the License.
 %%--------------------------------------------------------------------
+
+%% @doc Main interface module for the `emqx_durable_storage' application.
+%%
+%% It takes care of forwarding calls to the underlying DBMS. Currently
+%% only the embedded `emqx_ds_replication_layer' storage is supported,
+%% so all the calls are simply passed through.
 -module(emqx_ds).

--include_lib("stdlib/include/ms_transform.hrl").
--include_lib("snabbkaffe/include/snabbkaffe.hrl").
+%% Management API:
+-export([open_db/2, drop_db/1]).

-%% API:
--export([ensure_shard/2]).
-%% Messages:
--export([message_store/2, message_store/1, message_stats/0]).
-%% Iterator:
--export([iterator_update/2, iterator_next/1, iterator_stats/0]).
+%% Message storage API:
+-export([store_batch/2, store_batch/3]).

-%% internal exports:
+%% Message replay API:
+-export([get_streams/3, make_iterator/3, next/2]).
+
+%% Misc. API:
 -export([]).

 -export_type([
-    keyspace/0,
-    message_id/0,
-    message_stats/0,
-    message_store_opts/0,
-    replay/0,
-    replay_id/0,
-    iterator_id/0,
-    iterator/0,
-    shard/0,
-    shard_id/0,
-    topic/0,
+    create_db_opts/0,
+    builtin_db_opts/0,
+    db/0,
+    time/0,
     topic_filter/0,
-    time/0
+    topic/0,
+    stream/0,
+    stream_rank/0,
+    iterator/0,
+    message_id/0,
+    next_result/1, next_result/0,
+    store_batch_result/0,
+    make_iterator_result/1, make_iterator_result/0,
+    get_iterator_result/1
 ]).

 %%================================================================================
 %% Type declarations
 %%================================================================================

--type iterator() :: term().
-
--type iterator_id() :: binary().
-
--type message_store_opts() :: #{}.
-
--type message_stats() :: #{}.
-
--type message_id() :: binary().
+-type db() :: atom().

 %% Parsed topic.
 -type topic() :: list(binary()).
@@ -64,9 +62,22 @@ %% Parsed topic filter. -type topic_filter() :: list(binary() | '+' | '#' | ''). --type keyspace() :: atom(). --type shard_id() :: binary(). --type shard() :: {keyspace(), shard_id()}. +-type stream_rank() :: {term(), integer()}. + +-opaque stream() :: emqx_ds_replication_layer:stream(). + +-opaque iterator() :: emqx_ds_replication_layer:iterator(). + +-type store_batch_result() :: ok | {error, _}. + +-type make_iterator_result(Iterator) :: {ok, Iterator} | {error, _}. + +-type make_iterator_result() :: make_iterator_result(iterator()). + +-type next_result(Iterator) :: + {ok, Iterator, [emqx_types:message()]} | {ok, end_of_stream} | {error, _}. + +-type next_result() :: next_result(iterator()). %% Timestamp %% Earliest possible timestamp is 0. @@ -74,70 +85,102 @@ %% use in emqx_guid. Otherwise, the iterators won't match the message timestamps. -type time() :: non_neg_integer(). --type replay_id() :: binary(). +-type message_store_opts() :: #{}. --type replay() :: { - _TopicFilter :: topic_filter(), - _StartTime :: time() -}. +-type builtin_db_opts() :: + #{ + backend := builtin, + storage := emqx_ds_storage_layer:prototype() + }. + +-type create_db_opts() :: + builtin_db_opts(). + +-type message_id() :: emqx_ds_replication_layer:message_id(). + +-type get_iterator_result(Iterator) :: {ok, Iterator} | undefined. %%================================================================================ %% API funcions %%================================================================================ --spec ensure_shard(shard(), emqx_ds_storage_layer:options()) -> - ok | {error, _Reason}. -ensure_shard(Shard, Options) -> - case emqx_ds_storage_layer_sup:start_shard(Shard, Options) of - {ok, _Pid} -> - ok; - {error, {already_started, _Pid}} -> - ok; - {error, Reason} -> - {error, Reason} - end. +%% @doc Different DBs are completely independent from each other. They +%% could represent something like different tenants. +-spec open_db(db(), create_db_opts()) -> ok. +open_db(DB, Opts = #{backend := builtin}) -> + emqx_ds_replication_layer:open_db(DB, Opts). -%%-------------------------------------------------------------------------------- -%% Message -%%-------------------------------------------------------------------------------- --spec message_store([emqx_types:message()], message_store_opts()) -> - {ok, [message_id()]} | {error, _}. -message_store(_Msg, _Opts) -> - %% TODO - {error, not_implemented}. +%% @doc TODO: currently if one or a few shards are down, they won't be --spec message_store([emqx_types:message()]) -> {ok, [message_id()]} | {error, _}. -message_store(Msg) -> - %% TODO - message_store(Msg, #{}). +%% deleted. +-spec drop_db(db()) -> ok. +drop_db(DB) -> + emqx_ds_replication_layer:drop_db(DB). --spec message_stats() -> message_stats(). -message_stats() -> - #{}. +-spec store_batch(db(), [emqx_types:message()], message_store_opts()) -> store_batch_result(). +store_batch(DB, Msgs, Opts) -> + emqx_ds_replication_layer:store_batch(DB, Msgs, Opts). -%%-------------------------------------------------------------------------------- -%% Session -%%-------------------------------------------------------------------------------- +-spec store_batch(db(), [emqx_types:message()]) -> store_batch_result(). +store_batch(DB, Msgs) -> + store_batch(DB, Msgs, #{}). 
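+
+%% Example (illustrative sketch, not prescribed by this module): opening
+%% a DB with the builtin backend and persisting a batch. `Storage' is a
+%% placeholder for an `emqx_ds_storage_layer:prototype()' value, and the
+%% messages are assumed to be built with `emqx_message:make/3':
+%%
+%%   ok = emqx_ds:open_db(my_db, #{backend => builtin, storage => Storage}),
+%%   Msgs = [emqx_message:make(<<"client1">>, <<"t/1">>, <<"hello">>)],
+%%   ok = emqx_ds:store_batch(my_db, Msgs).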
-%%--------------------------------------------------------------------------------
-%% Iterator (pull API)
-%%--------------------------------------------------------------------------------
+%% @doc Get a list of streams needed for replaying a topic filter.
+%%
+%% Motivation: under the hood, EMQX may store different topics at
+%% different locations or even in different databases. A wildcard
+%% topic filter may require pulling data from any number of locations.
+%%
+%% A stream is an abstraction exposed by `emqx_ds' that, on the one
+%% hand, reflects the notion that different topics can be stored
+%% differently, but hides the implementation details.
+%%
+%% While having to work with multiple iterators to replay a topic
+%% filter may be cumbersome, it opens up some possibilities:
+%%
+%% 1. It's possible to parallelize replays
+%%
+%% 2. Streams can be shared between different clients to implement
+%% shared subscriptions
+%%
+%% IMPORTANT RULES:
+%%
+%% 0. There is no 1-to-1 mapping between MQTT topics and streams. One
+%% stream can contain any number of MQTT topics.
+%%
+%% 1. New streams matching the topic filter and start time can appear
+%% without notice, so the replayer must periodically call this
+%% function to get the updated list of streams.
+%%
+%% 2. Streams may depend on one another. Therefore, care should be
+%% taken while replaying them in parallel to avoid out-of-order
+%% replay. This function returns each stream together with its
+%% "coordinate": `stream_rank()'.
+%%
+%% A stream rank is a tuple of two integers; let's call them X and Y. If
+%% the X coordinates of two streams differ, the streams are independent
+%% and can be replayed in parallel. If X is the same, then the stream
+%% with the smaller Y coordinate should be replayed first. If the Y
+%% coordinates are equal, then the streams are independent.
+%%
+%% A stream is fully consumed when the `next/2' function returns
+%% `end_of_stream'. Then, and only then, can the client proceed to
+%% replaying streams that depend on the given one.
+-spec get_streams(db(), topic_filter(), time()) -> [{stream_rank(), stream()}].
+get_streams(DB, TopicFilter, StartTime) ->
+    emqx_ds_replication_layer:get_streams(DB, TopicFilter, StartTime).

-%% @doc Called when a client acks a message
--spec iterator_update(iterator_id(), iterator()) -> ok.
-iterator_update(_IterId, _Iter) ->
-    %% TODO
-    ok.
+-spec make_iterator(stream(), topic_filter(), time()) -> make_iterator_result().
+make_iterator(Stream, TopicFilter, StartTime) ->
+    emqx_ds_replication_layer:make_iterator(Stream, TopicFilter, StartTime).

-%% @doc Called when a client acks a message
--spec iterator_next(iterator()) -> {value, emqx_types:message(), iterator()} | none | {error, _}.
-iterator_next(_Iter) ->
-    %% TODO
-    none.
+-spec next(iterator(), pos_integer()) -> next_result().
+next(Iter, BatchSize) ->
+    emqx_ds_replication_layer:next(Iter, BatchSize).

--spec iterator_stats() -> #{}.
-iterator_stats() ->
-    #{}.
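+
+%% Replay example (illustrative sketch): a single-process consumer that
+%% replays all streams matching a topic filter sequentially, which
+%% trivially satisfies the rank ordering rules above. `handle_messages'
+%% is a placeholder for application logic:
+%%
+%%   replay(DB, TopicFilter, StartTime) ->
+%%       Streams = lists:keysort(1, emqx_ds:get_streams(DB, TopicFilter, StartTime)),
+%%       lists:foreach(
+%%           fun({_Rank, Stream}) ->
+%%               {ok, It} = emqx_ds:make_iterator(Stream, TopicFilter, StartTime),
+%%               consume(It)
+%%           end,
+%%           Streams
+%%       ).
+%%
+%%   consume(It0) ->
+%%       case emqx_ds:next(It0, _BatchSize = 100) of
+%%           {ok, It, Msgs} ->
+%%               handle_messages(Msgs),
+%%               consume(It);
+%%           {ok, end_of_stream} ->
+%%               ok;
+%%           {error, Reason} ->
+%%               {error, Reason}
+%%       end.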
+%%================================================================================
+%% Internal exports
+%%================================================================================

 %%================================================================================
 %% Internal functions
diff --git a/apps/emqx_durable_storage/src/emqx_ds_bitmask.hrl b/apps/emqx_durable_storage/src/emqx_ds_bitmask.hrl
new file mode 100644
index 000000000..31af0e034
--- /dev/null
+++ b/apps/emqx_durable_storage/src/emqx_ds_bitmask.hrl
@@ -0,0 +1,36 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+-ifndef(EMQX_DS_BITMASK_HRL).
+-define(EMQX_DS_BITMASK_HRL, true).
+
+-record(filter_scan_action, {
+    offset :: emqx_ds_bitmask_keymapper:offset(),
+    size :: emqx_ds_bitmask_keymapper:bitsize(),
+    min :: non_neg_integer(),
+    max :: non_neg_integer()
+}).
+
+-record(filter, {
+    size :: non_neg_integer(),
+    bitfilter :: non_neg_integer(),
+    bitmask :: non_neg_integer(),
+    %% Ranges (in _bitsource_ basis):
+    bitsource_ranges :: array:array(#filter_scan_action{}),
+    range_min :: non_neg_integer(),
+    range_max :: non_neg_integer()
+}).
+
+-endif.
diff --git a/apps/emqx_durable_storage/src/emqx_ds_bitmask_keymapper.erl b/apps/emqx_durable_storage/src/emqx_ds_bitmask_keymapper.erl
new file mode 100644
index 000000000..a3b65c7e6
--- /dev/null
+++ b/apps/emqx_durable_storage/src/emqx_ds_bitmask_keymapper.erl
@@ -0,0 +1,824 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+-module(emqx_ds_bitmask_keymapper).
+
+%%================================================================================
+%% @doc This module is used to map N-dimensional coordinates to a
+%% 1-dimensional space.
+%%
+%% Example:
+%%
+%% Let us assume that `T' is a topic and `t' is time. These are the two
+%% dimensions used to index messages. They can be viewed as
+%% "coordinates" of an MQTT message in a 2D space.
+%%
+%% Oftentimes, when a wildcard subscription is used, keys must be
+%% scanned in both dimensions simultaneously.
+%%
+%% RocksDB allows iterating over sorted keys very quickly. This means we
+%% need to map our two-dimensional keys to a single index that is
+%% sorted in a way that helps to iterate over both time and topic
+%% without having to do a lot of random seeks.
+%%
+%% == Mapping of 2D keys to rocksdb keys ==
+%%
+%% We use a "zigzag" pattern to store messages, where the rocksdb key is
+%% composed like this:
+%%
+%%            |ttttt|TTTTTTTTT|tttt|
+%%               ^       ^       ^
+%%               |       |       |
+%%       +-------+       |       +---------+
+%%       |               |                 |
+%% most significant   topic hash    least significant
+%% bits of timestamp                bits of timestamp
+%% (a.k.a epoch)                    (a.k.a time offset)
+%%
+%% Topic hash is level-aware: each topic level is hashed separately
+%% and the resulting hashes are bitwise-concatenated. This allows us
+%% to map topics to fixed-length bitstrings while keeping some degree
+%% of information about the hierarchy.
+%%
+%% The next important concept is what we call the "epoch". The duration
+%% of an epoch is determined by the maximum time offset. The epoch is
+%% calculated by shifting the bits of the timestamp right.
+%%
+%% The resulting index is a space-filling curve that looks like
+%% this in the topic-time 2D space:
+%%
+%% T ^ ---->------   |---->------   |---->------
+%%   |       --/    /       --/    /       --/
+%%   |   -<-/      |    -<-/      |    -<-/
+%%   | -/          | -/           | -/
+%%   | ---->------ | ---->------  | ---->------
+%%   |       --/  /        --/   /        --/
+%%   |   ---/    |     ---/     |     ---/
+%%   | -/        ^   -/         ^   -/
+%%   | ---->------   |---->------  |---->------
+%%   |       --/    /       --/   /       --/
+%%   |   -<-/      |    -<-/     |    -<-/
+%%   | -/          | -/          | -/
+%%   | ---->------|  ---->------|  ---------->
+%%   |
+%% -+------------+-----------------------------> t
+%%        epoch
+%%
+%% This structure allows us to quickly seek to the first message that
+%% was recorded in a certain epoch in a certain topic or a
+%% group of topics matching a filter like `foo/bar/#`.
+%%
+%% Due to its structure, for each pair of rocksdb keys K1 and K2, such
+%% that K1 > K2 and topic(K1) = topic(K2), timestamp(K1) >
+%% timestamp(K2).
+%% That is, replay doesn't reorder messages published in each
+%% individual topic.
+%%
+%% This property doesn't hold between different topics, but it's not deemed
+%% a problem right now.
+%%
+%%================================================================================
+
+%% API:
+-export([
+    make_keymapper/1,
+    vector_to_key/2,
+    bin_vector_to_key/2,
+    key_to_vector/2,
+    bin_key_to_vector/2,
+    key_to_bitstring/2,
+    bitstring_to_key/2,
+    make_filter/2,
+    ratchet/2,
+    bin_increment/2,
+    bin_checkmask/2,
+    bitsize/1
+]).
+
+-export_type([vector/0, key/0, dimension/0, offset/0, bitsize/0, bitsource/0, keymapper/0]).
+
+-compile(
+    {inline, [
+        ones/1,
+        extract/2,
+        extract_inv/2
+    ]}
+).
+
+-elvis([{elvis_style, no_if_expression, disable}]).
+
+-ifdef(TEST).
+-include_lib("proper/include/proper.hrl").
+-include_lib("eunit/include/eunit.hrl").
+-endif.
+
+%%================================================================================
+%% Type declarations
+%%================================================================================
+
+-type scalar() :: integer().
+
+-type vector() :: [scalar()].
+
+%% N-th coordinate of a vector:
+-type dimension() :: pos_integer().
+
+-type offset() :: non_neg_integer().
+
+-type bitsize() :: pos_integer().
+
+%% The resulting 1D key:
+-type key() :: non_neg_integer().
+
+-type bitsource() ::
+    %% Consume `_Size' bits starting at the `_Offset'th bit of the
+    %% `dimension()'-th element of the input vector:
+    {dimension(), offset(), bitsize()}.
+
+-record(scan_action, {
+    src_bitmask :: integer(),
+    src_offset :: offset(),
+    dst_offset :: offset()
+}).
+
+-type scan_action() :: #scan_action{}.
+
+-type scanner() :: [[scan_action()]].
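+
+%% Example (illustrative): the bitsource {1, 8, 8} consumes bits 8..15
+%% of the first vector element, so a keymapper built from it alone maps
+%% [16#1aff] to 16#1a (cf. vector_to_key2_test below).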
+
+-record(keymapper, {
+    schema :: [bitsource()],
+    scanner :: scanner(),
+    size :: non_neg_integer(),
+    dim_sizeof :: [non_neg_integer()]
+}).
+
+-opaque keymapper() :: #keymapper{}.
+
+-type scalar_range() ::
+    any | {'=', scalar() | infinity} | {'>=', scalar()} | {scalar(), '..', scalar()}.
+
+-include("emqx_ds_bitmask.hrl").
+
+-type filter() :: #filter{}.
+
+%%================================================================================
+%% API functions
+%%================================================================================
+
+%% @doc Create a keymapper object that stores the "schema" of the
+%% transformation from a list of bitsources.
+%%
+%% Note: Dimension is 1-based.
+%%
+%% Note: the order of bitsources is important. The first element of the
+%% list is mapped to the _least_ significant bits of the key, and the
+%% last element becomes the most significant bits.
+-spec make_keymapper([bitsource()]) -> keymapper().
+make_keymapper(Bitsources) ->
+    Arr0 = array:new([{fixed, false}, {default, {0, []}}]),
+    {Size, Arr} = fold_bitsources(
+        fun(DestOffset, {Dim0, Offset, Size}, Acc) ->
+            Dim = Dim0 - 1,
+            Action = #scan_action{
+                src_bitmask = ones(Size), src_offset = Offset, dst_offset = DestOffset
+            },
+            {DimSizeof, Actions} = array:get(Dim, Acc),
+            array:set(Dim, {DimSizeof + Size, [Action | Actions]}, Acc)
+        end,
+        Arr0,
+        Bitsources
+    ),
+    {DimSizeof, Scanner} = lists:unzip(array:to_list(Arr)),
+    #keymapper{
+        schema = Bitsources,
+        scanner = Scanner,
+        size = Size,
+        dim_sizeof = DimSizeof
+    }.
+
+-spec bitsize(keymapper()) -> pos_integer().
+bitsize(#keymapper{size = Size}) ->
+    Size.
+
+%% @doc Map an N-dimensional vector to a scalar key.
+%%
+%% Note: this function is not injective.
+-spec vector_to_key(keymapper(), vector()) -> key().
+vector_to_key(#keymapper{scanner = []}, []) ->
+    0;
+vector_to_key(#keymapper{scanner = [Actions | Scanner]}, [Coord | Vector]) ->
+    do_vector_to_key(Actions, Scanner, Coord, Vector, 0).
+
+%% @doc Same as `vector_to_key', but it works with binaries, and outputs a binary.
+-spec bin_vector_to_key(keymapper(), [binary()]) -> binary().
+bin_vector_to_key(Keymapper = #keymapper{dim_sizeof = DimSizeof, size = Size}, Binaries) ->
+    Vec = lists:zipwith(
+        fun(Bin, SizeOf) ->
+            <<Int:SizeOf>> = Bin,
+            Int
+        end,
+        Binaries,
+        DimSizeof
+    ),
+    Key = vector_to_key(Keymapper, Vec),
+    <<Key:Size>>.
+
+%% @doc Map a key to a vector.
+%%
+%% Note: `vector_to_key(key_to_vector(K)) = K' but
+%% `key_to_vector(vector_to_key(V)) = V' is not guaranteed.
+-spec key_to_vector(keymapper(), key()) -> vector().
+key_to_vector(#keymapper{scanner = Scanner}, Key) ->
+    lists:map(
+        fun(Actions) ->
+            lists:foldl(
+                fun(Action, Acc) ->
+                    Acc bor extract_inv(Key, Action)
+                end,
+                0,
+                Actions
+            )
+        end,
+        Scanner
+    ).
+
+%% @doc Same as `key_to_vector', but it works with binaries.
+-spec bin_key_to_vector(keymapper(), binary()) -> [binary()].
+bin_key_to_vector(Keymapper = #keymapper{dim_sizeof = DimSizeof, size = Size}, BinKey) ->
+    <<Key:Size>> = BinKey,
+    Vector = key_to_vector(Keymapper, Key),
+    lists:zipwith(
+        fun(Elem, SizeOf) ->
+            <<Elem:SizeOf>>
+        end,
+        Vector,
+        DimSizeof
+    ).
+
+%% @doc Transform a bitstring to a key
+-spec bitstring_to_key(keymapper(), bitstring()) -> key().
+bitstring_to_key(#keymapper{size = Size}, Bin) ->
+    case Bin of
+        <<Key:Size>> ->
+            Key;
+        _ ->
+            error({invalid_key, Bin, Size})
+    end.
+
+%% @doc Transform a key to a fixed-size bitstring
+-spec key_to_bitstring(keymapper(), key()) -> bitstring().
+key_to_bitstring(#keymapper{size = Size}, Key) ->
+    <<Key:Size>>.
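+
+%% Worked example (illustrative): with bitsources [{1, 0, 8}, {2, 0, 8}]
+%% the first dimension occupies the least significant byte of the key
+%% and the second dimension the next byte:
+%%
+%%   KM = make_keymapper([{1, 0, 8}, {2, 0, 8}]),
+%%   16#aaff = vector_to_key(KM, [16#ff, 16#aa]),
+%%   [16#ff, 16#aa] = key_to_vector(KM, 16#aaff).
+%%
+%% The mapping is lossy: bits of a coordinate that no bitsource
+%% consumes are dropped, e.g. vector_to_key(KM, [16#aaff, 16#aa]) also
+%% returns 16#aaff (cf. vector_to_key1_test below).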
+
+%% @doc Create a filter object that facilitates range scans.
+-spec make_filter(keymapper(), [scalar_range()]) -> filter().
+make_filter(
+    KeyMapper = #keymapper{schema = Schema, dim_sizeof = DimSizeof, size = TotalSize}, Filter0
+) ->
+    NDim = length(DimSizeof),
+    %% Transform "symbolic" constraints to ranges:
+    Filter1 = constraints_to_ranges(KeyMapper, Filter0),
+    {Bitmask, Bitfilter} = make_bitfilter(KeyMapper, Filter1),
+    %% Calculate the maximum source offset as per the bitsource specification:
+    MaxOffset = lists:foldl(
+        fun({Dim, Offset, _Size}, Acc) ->
+            maps:update_with(
+                Dim, fun(OldVal) -> max(OldVal, Offset) end, maps:merge(#{Dim => 0}, Acc)
+            )
+        end,
+        #{},
+        Schema
+    ),
+    %% Adjust minimum and maximum values for each interval like this:
+    %%
+    %%   Min: 110100|101011 -> 110100|00000
+    %%   Max: 110101|001011 -> 110101|11111
+    %%               ^
+    %%               |
+    %%          max offset
+    %%
+    %% This is needed so that when we increment the vector, we always
+    %% scan the full range of the least significant bits.
+    Filter2 = lists:zipwith(
+        fun
+            ({Val, Val}, _Dim) ->
+                {Val, Val};
+            ({Min0, Max0}, Dim) ->
+                Offset = maps:get(Dim, MaxOffset, 0),
+                %% Set the least significant bits of Min to 0:
+                Min = (Min0 bsr Offset) bsl Offset,
+                %% Set the least significant bits of Max to 1:
+                Max = Max0 bor ones(Offset),
+                {Min, Max}
+        end,
+        Filter1,
+        lists:seq(1, NDim)
+    ),
+    %% Project the vector into the "bitsource coordinate system":
+    {_, Filter} = fold_bitsources(
+        fun(DstOffset, {Dim, SrcOffset, Size}, Acc) ->
+            {Min0, Max0} = lists:nth(Dim, Filter2),
+            Min = (Min0 bsr SrcOffset) band ones(Size),
+            Max = (Max0 bsr SrcOffset) band ones(Size),
+            Action = #filter_scan_action{
+                offset = DstOffset,
+                size = Size,
+                min = Min,
+                max = Max
+            },
+            [Action | Acc]
+        end,
+        [],
+        Schema
+    ),
+    Ranges = array:from_list(lists:reverse(Filter)),
+    %% Compute estimated upper and lower bounds of a _continuous_
+    %% interval where all keys lie:
+    case Filter of
+        [] ->
+            RangeMin = 0,
+            RangeMax = 0;
+        [#filter_scan_action{offset = MSBOffset, min = MSBMin, max = MSBMax} | _] ->
+            RangeMin = MSBMin bsl MSBOffset,
+            RangeMax = MSBMax bsl MSBOffset bor ones(MSBOffset)
+    end,
+    %% Final value
+    #filter{
+        size = TotalSize,
+        bitmask = Bitmask,
+        bitfilter = Bitfilter,
+        bitsource_ranges = Ranges,
+        range_min = RangeMin,
+        range_max = RangeMax
+    }.
+
+%% @doc Given a filter `F' and key `K0', return the smallest key `K'
+%% that satisfies the following conditions:
+%%
+%% 1. `K >= K0'
+%%
+%% 2. `K' satisfies filter `F'.
+%%
+%% If these conditions cannot be satisfied, return `overflow'.
+%%
+%% Corollary: `K' may be equal to `K0'.
+-spec ratchet(filter(), key()) -> key() | overflow.
+ratchet(#filter{bitsource_ranges = Ranges, range_max = Max}, Key) when Key =< Max ->
+    %% This function works in two steps: first, it finds the position
+    %% of the bitsource ("pivot point") corresponding to the part of the
+    %% key that should be incremented (or set to the _minimum_ value
+    %% of the range, in case the respective part of the original key
+    %% is less than the minimum). It also returns an "increment": the
+    %% value that should be added to the part of the key at the pivot
+    %% point. The increment can be 0 or 1.
+    %%
+    %% Then it transforms the key using the following operation:
+    %%
+    %% 1. Parts of the key that are less than the pivot point are
+    %% reset to their minimum values.
+    %%
+    %% 2. `Increment' is added to the part of the key at the pivot
+    %% point.
+    %%
+    %% 3. The rest of the key stays the same.
+    NDim = array:size(Ranges),
+    case ratchet_scan(Ranges, NDim, Key, 0, {_Pivot0 = -1, _Increment0 = 0}, _Carry = 0) of
+        overflow ->
+            overflow;
+        {Pivot, Increment} ->
+            ratchet_do(Ranges, Key, NDim - 1, Pivot, Increment)
+    end;
+ratchet(_, _) ->
+    overflow.
+
+%% @doc Given a binary representing a key and a filter, return the
+%% next key matching the filter, or `overflow' if no such key
+%% exists.
+-spec bin_increment(filter(), binary()) -> binary() | overflow.
+bin_increment(Filter = #filter{size = Size}, <<>>) ->
+    Key = ratchet(Filter, 0),
+    <<Key:Size>>;
+bin_increment(
+    Filter = #filter{size = Size, bitmask = Bitmask, bitfilter = Bitfilter, range_max = RangeMax},
+    KeyBin
+) ->
+    %% The key may contain a random suffix, skip it:
+    <<Key0:Size, _/binary>> = KeyBin,
+    Key1 = Key0 + 1,
+    if
+        Key1 band Bitmask =:= Bitfilter, Key1 =< RangeMax ->
+            <<Key1:Size>>;
+        true ->
+            case ratchet(Filter, Key1) of
+                overflow ->
+                    overflow;
+                Key ->
+                    <<Key:Size>>
+            end
+    end.
+
+%% @doc Given a filter and a binary representation of a key, return
+%% `false' if the key _doesn't_ match the filter. This function
+%% returning `true' is a necessary, but not sufficient, condition for
+%% the key to satisfy the filter.
+-spec bin_checkmask(filter(), binary()) -> boolean().
+bin_checkmask(#filter{size = Size, bitmask = Bitmask, bitfilter = Bitfilter}, Key) ->
+    case Key of
+        <<Int:Size>> ->
+            Int band Bitmask =:= Bitfilter;
+        _ ->
+            false
+    end.
+
+%%================================================================================
+%% Internal functions
+%%================================================================================
+
+%% Note: this function operates in the bitsource basis, scanning it from 0
+%% to NDim (i.e. from the least significant bits to the most
+%% significant bits)
+ratchet_scan(_Ranges, NDim, _Key, NDim, Pivot, 0) ->
+    %% We've reached the end:
+    Pivot;
+ratchet_scan(_Ranges, NDim, _Key, NDim, _Pivot, 1) ->
+    %% We've reached the end, but the key is still not large enough:
+    overflow;
+ratchet_scan(Ranges, NDim, Key, I, Pivot0, Carry) ->
+    #filter_scan_action{offset = Offset, size = Size, min = Min, max = Max} = array:get(I, Ranges),
+    %% Extract the I-th element of the vector from the original key:
+    Elem = ((Key bsr Offset) band ones(Size)) + Carry,
+    if
+        Elem < Min ->
+            %% The I-th coordinate is less than the specified minimum.
+            %%
+            %% We reset this coordinate to the minimum value. Since this
+            %% amounts to incrementing this bit position, the less
+            %% significant bits have to be reset to their respective
+            %% minimum values:
+            Pivot = {I + 1, 0},
+            ratchet_scan(Ranges, NDim, Key, I + 1, Pivot, 0);
+        Elem > Max ->
+            %% The I-th coordinate is larger than the specified
+            %% maximum. We can only fix this problem by incrementing
+            %% the next coordinate (i.e. more significant bits).
+            %%
+            %% We reset this coordinate to the minimum value, and
+            %% increment the next coordinate (by setting `Carry' to
+            %% 1).
+            Pivot = {I + 1, 1},
+            ratchet_scan(Ranges, NDim, Key, I + 1, Pivot, 1);
+        true ->
+            %% Coordinate is within range:
+            ratchet_scan(Ranges, NDim, Key, I + 1, Pivot0, 0)
+    end.
+
+%% Note: this function operates in the bitsource basis, scanning it from
+%% NDim to 0. It applies the transformation specified by
+%% `ratchet_scan'.
+ratchet_do(_Ranges, _Key, I, _Pivot, _Increment) when I < 0 -> + 0; +ratchet_do(Ranges, Key, I, Pivot, Increment) -> + #filter_scan_action{offset = Offset, size = Size, min = Min} = array:get(I, Ranges), + Mask = ones(Offset + Size) bxor ones(Offset), + Elem = + if + I > Pivot -> + Mask band Key; + I =:= Pivot -> + (Mask band Key) + (Increment bsl Offset); + true -> + Min bsl Offset + end, + %% erlang:display( + %% {ratchet_do, I, integer_to_list(Key, 16), integer_to_list(Mask, 2), + %% integer_to_list(Elem, 16)} + %% ), + Elem bor ratchet_do(Ranges, Key, I - 1, Pivot, Increment). + +-spec make_bitfilter(keymapper(), [{non_neg_integer(), non_neg_integer()}]) -> + {non_neg_integer(), non_neg_integer()}. +make_bitfilter(Keymapper = #keymapper{dim_sizeof = DimSizeof}, Ranges) -> + L = lists:zipwith( + fun + ({N, N}, Bits) -> + %% For strict equality we can employ bitmask: + {ones(Bits), N}; + (_, _) -> + {0, 0} + end, + Ranges, + DimSizeof + ), + {Bitmask, Bitfilter} = lists:unzip(L), + {vector_to_key(Keymapper, Bitmask), vector_to_key(Keymapper, Bitfilter)}. + +%% Transform constraints into a list of closed intervals that the +%% vector elements should lie in. +constraints_to_ranges(#keymapper{dim_sizeof = DimSizeof}, Filter) -> + lists:zipwith( + fun(Constraint, Bitsize) -> + Max = ones(Bitsize), + case Constraint of + any -> + {0, Max}; + {'=', infinity} -> + {Max, Max}; + {'=', Val} when Val =< Max -> + {Val, Val}; + {'>=', Val} when Val =< Max -> + {Val, Max}; + {A, '..', B} when A =< Max, B =< Max -> + {A, B} + end + end, + Filter, + DimSizeof + ). + +-spec fold_bitsources(fun((_DstOffset :: non_neg_integer(), bitsource(), Acc) -> Acc), Acc, [ + bitsource() +]) -> {bitsize(), Acc}. +fold_bitsources(Fun, InitAcc, Bitsources) -> + lists:foldl( + fun(Bitsource = {_Dim, _Offset, Size}, {DstOffset, Acc0}) -> + Acc = Fun(DstOffset, Bitsource, Acc0), + {DstOffset + Size, Acc} + end, + {0, InitAcc}, + Bitsources + ). + +do_vector_to_key([], [], _Coord, [], Acc) -> + Acc; +do_vector_to_key([], [NewActions | Scanner], _Coord, [NewCoord | Vector], Acc) -> + do_vector_to_key(NewActions, Scanner, NewCoord, Vector, Acc); +do_vector_to_key([Action | Actions], Scanner, Coord, Vector, Acc0) -> + Acc = Acc0 bor extract(Coord, Action), + do_vector_to_key(Actions, Scanner, Coord, Vector, Acc). + +-spec extract(_Source :: scalar(), scan_action()) -> integer(). +extract(Src, #scan_action{src_bitmask = SrcBitmask, src_offset = SrcOffset, dst_offset = DstOffset}) -> + ((Src bsr SrcOffset) band SrcBitmask) bsl DstOffset. + +%% extract^-1 +-spec extract_inv(_Dest :: scalar(), scan_action()) -> integer(). +extract_inv(Dest, #scan_action{ + src_bitmask = SrcBitmask, src_offset = SrcOffset, dst_offset = DestOffset +}) -> + ((Dest bsr DestOffset) band SrcBitmask) bsl SrcOffset. + +ones(Bits) -> + 1 bsl Bits - 1. + +%%================================================================================ +%% Unit tests +%%================================================================================ + +-ifdef(TEST). + +make_keymapper0_test() -> + Schema = [], + ?assertEqual( + #keymapper{ + schema = Schema, + scanner = [], + size = 0, + dim_sizeof = [] + }, + make_keymapper(Schema) + ). 
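+
+%% Worked example (illustrative; values taken from ratchet2_test below):
+%% with a |dim1|epoch|dim3|offset| key layout and a filter fixing
+%% dim1 = 16#aa and dim3 = 16#cc, ratchet/2 rounds a key up to the next
+%% key that satisfies the filter:
+%%
+%%   F1 = make_filter(M, [{'=', 16#aa}, any, {'=', 16#cc}]),
+%%   16#aa00cc00 = ratchet(F1, 0),
+%%   16#aa01cc00 = ratchet(F1, 16#aa00cd00),
+%%   overflow    = ratchet(F1, 16#ab000000).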
+ +make_keymapper1_test() -> + Schema = [{1, 0, 3}, {2, 0, 5}], + ?assertEqual( + #keymapper{ + schema = Schema, + scanner = [ + [#scan_action{src_bitmask = 2#111, src_offset = 0, dst_offset = 0}], + [#scan_action{src_bitmask = 2#11111, src_offset = 0, dst_offset = 3}] + ], + size = 8, + dim_sizeof = [3, 5] + }, + make_keymapper(Schema) + ). + +make_keymapper2_test() -> + Schema = [{1, 0, 3}, {2, 0, 5}, {1, 3, 5}], + ?assertEqual( + #keymapper{ + schema = Schema, + scanner = [ + [ + #scan_action{src_bitmask = 2#11111, src_offset = 3, dst_offset = 8}, + #scan_action{src_bitmask = 2#111, src_offset = 0, dst_offset = 0} + ], + [#scan_action{src_bitmask = 2#11111, src_offset = 0, dst_offset = 3}] + ], + size = 13, + dim_sizeof = [8, 5] + }, + make_keymapper(Schema) + ). + +vector_to_key0_test() -> + Schema = [], + Vector = [], + ?assertEqual(0, vec2key(Schema, Vector)). + +vector_to_key1_test() -> + Schema = [{1, 0, 8}], + ?assertEqual(16#ff, vec2key(Schema, [16#ff])), + ?assertEqual(16#1a, vec2key(Schema, [16#1a])), + ?assertEqual(16#ff, vec2key(Schema, [16#aaff])). + +%% Test handling of source offset: +vector_to_key2_test() -> + Schema = [{1, 8, 8}], + ?assertEqual(0, vec2key(Schema, [16#ff])), + ?assertEqual(16#1a, vec2key(Schema, [16#1aff])), + ?assertEqual(16#aa, vec2key(Schema, [16#11aaff])). + +%% Basic test of 2D vector: +vector_to_key3_test() -> + Schema = [{1, 0, 8}, {2, 0, 8}], + ?assertEqual(16#aaff, vec2key(Schema, [16#ff, 16#aa])), + ?assertEqual(16#2211, vec2key(Schema, [16#aa11, 16#bb22])). + +%% Advanced test with 2D vector: +vector_to_key4_test() -> + Schema = [{1, 0, 8}, {2, 0, 8}, {1, 8, 8}, {2, 16, 8}], + ?assertEqual(16#bb112211, vec2key(Schema, [16#aa1111, 16#bb2222])). + +%% Test with binaries: +vector_to_key_bin_test() -> + Schema = [{1, 0, 8 * 4}, {2, 0, 8 * 5}, {3, 0, 8 * 5}], + Keymapper = make_keymapper(lists:reverse(Schema)), + ?assertMatch( + <<"wellhelloworld">>, bin_vector_to_key(Keymapper, [<<"well">>, <<"hello">>, <<"world">>]) + ). + +key_to_vector0_test() -> + Schema = [], + key2vec(Schema, []). + +key_to_vector1_test() -> + Schema = [{1, 0, 8}, {2, 0, 8}], + key2vec(Schema, [1, 1]), + key2vec(Schema, [255, 255]), + key2vec(Schema, [255, 1]), + key2vec(Schema, [0, 1]), + key2vec(Schema, [255, 0]). + +key_to_vector2_test() -> + Schema = [{1, 0, 3}, {2, 0, 8}, {1, 3, 5}], + key2vec(Schema, [1, 1]), + key2vec(Schema, [255, 255]), + key2vec(Schema, [255, 1]), + key2vec(Schema, [0, 1]), + key2vec(Schema, [255, 0]). + +make_bitmask0_test() -> + Keymapper = make_keymapper([]), + ?assertMatch({0, 0}, mkbmask(Keymapper, [])). + +make_bitmask1_test() -> + Keymapper = make_keymapper([{1, 0, 8}]), + ?assertEqual({0, 0}, mkbmask(Keymapper, [any])), + ?assertEqual({16#ff, 1}, mkbmask(Keymapper, [{'=', 1}])), + ?assertEqual({16#ff, 255}, mkbmask(Keymapper, [{'=', 255}])), + ?assertEqual({0, 0}, mkbmask(Keymapper, [{'>=', 0}])), + ?assertEqual({0, 0}, mkbmask(Keymapper, [{'>=', 1}])), + ?assertEqual({0, 0}, mkbmask(Keymapper, [{'>=', 16#f}])). + +make_bitmask2_test() -> + Keymapper = make_keymapper([{1, 0, 3}, {2, 0, 4}, {3, 0, 2}]), + ?assertEqual({2#00_0000_000, 2#00_0000_000}, mkbmask(Keymapper, [any, any, any])), + ?assertEqual({2#11_0000_000, 2#00_0000_000}, mkbmask(Keymapper, [any, any, {'=', 0}])), + ?assertEqual({2#00_1111_000, 2#00_0000_000}, mkbmask(Keymapper, [any, {'=', 0}, any])), + ?assertEqual({2#00_0000_111, 2#00_0000_000}, mkbmask(Keymapper, [{'=', 0}, any, any])). 
+ +make_bitmask3_test() -> + %% Key format of type |TimeOffset|Topic|Epoch|: + Keymapper = make_keymapper([{1, 0, 8}, {2, 0, 8}, {1, 8, 8}]), + ?assertEqual({2#00000000_00000000_00000000, 16#00_00_00}, mkbmask(Keymapper, [any, any])), + ?assertEqual( + {2#11111111_11111111_11111111, 16#aa_cc_bb}, + mkbmask(Keymapper, [{'=', 16#aabb}, {'=', 16#cc}]) + ), + ?assertEqual( + {2#00000000_11111111_00000000, 16#00_bb_00}, mkbmask(Keymapper, [{'>=', 255}, {'=', 16#bb}]) + ). + +make_filter_test() -> + KeyMapper = make_keymapper([]), + Filter = [], + ?assertMatch(#filter{size = 0, bitmask = 0, bitfilter = 0}, make_filter(KeyMapper, Filter)). + +ratchet1_test() -> + Bitsources = [{1, 0, 8}], + M = make_keymapper(Bitsources), + F = make_filter(M, [any]), + #filter{bitsource_ranges = Rarr} = F, + ?assertMatch( + [ + #filter_scan_action{ + offset = 0, + size = 8, + min = 0, + max = 16#ff + } + ], + array:to_list(Rarr) + ), + ?assertEqual(0, ratchet(F, 0)), + ?assertEqual(16#fa, ratchet(F, 16#fa)), + ?assertEqual(16#ff, ratchet(F, 16#ff)), + ?assertEqual(overflow, ratchet(F, 16#100)). + +%% erlfmt-ignore +ratchet2_test() -> + Bitsources = [{1, 0, 8}, %% Static topic index + {2, 8, 8}, %% Epoch + {3, 0, 8}, %% Varying topic hash + {2, 0, 8}], %% Timestamp offset + M = make_keymapper(lists:reverse(Bitsources)), + F1 = make_filter(M, [{'=', 16#aa}, any, {'=', 16#cc}]), + ?assertEqual(16#aa00cc00, ratchet(F1, 0)), + ?assertEqual(16#aa01cc00, ratchet(F1, 16#aa00cd00)), + ?assertEqual(16#aa01cc11, ratchet(F1, 16#aa01cc11)), + ?assertEqual(16#aa11cc00, ratchet(F1, 16#aa10cd00)), + ?assertEqual(16#aa11cc00, ratchet(F1, 16#aa10dc11)), + ?assertEqual(overflow, ratchet(F1, 16#ab000000)), + F2 = make_filter(M, [{'=', 16#aa}, {'>=', 16#dddd}, {'=', 16#cc}]), + %% TODO: note that it's `16#aaddcc00` instead of + %% `16#aaddccdd'. That is because currently ratchet function + %% doesn't take LSBs of an '>=' interval if it has a hole in the + %% middle (see `make_filter/2'). This only adds extra keys to the + %% very first interval, so it's not deemed a huge problem. + ?assertEqual(16#aaddcc00, ratchet(F2, 0)), + ?assertEqual(16#aa_de_cc_00, ratchet(F2, 16#aa_dd_cd_11)). + +%% erlfmt-ignore +ratchet3_test_() -> + EpochBits = 4, + Bitsources = [{1, 0, 2}, %% Static topic index + {2, EpochBits, 4}, %% Epoch + {3, 0, 2}, %% Varying topic hash + {2, 0, EpochBits}], %% Timestamp offset + Keymapper = make_keymapper(lists:reverse(Bitsources)), + Filter1 = make_filter(Keymapper, [{'=', 2#10}, any, {'=', 2#01}]), + Filter2 = make_filter(Keymapper, [{'=', 2#01}, any, any]), + Filter3 = make_filter(Keymapper, [{'=', 2#01}, {'>=', 16#aa}, any]), + {timeout, 15, + [?_assert(test_iterate(Filter1, 0)), + ?_assert(test_iterate(Filter2, 0)), + %% Not starting from 0 here for simplicity, since the beginning + %% of a >= interval can't be properly checked with a bitmask: + ?_assert(test_iterate(Filter3, ratchet(Filter3, 1))) + ]}. + +%% Note: this function iterates through the full range of keys, so its +%% complexity grows _exponentially_ with the total size of the +%% keymapper. +test_iterate(_Filter, overflow) -> + true; +test_iterate(Filter, Key0) -> + Key = ratchet(Filter, Key0 + 1), + ?assert(ratchet_prop(Filter, Key0, Key)), + test_iterate(Filter, Key). + +ratchet_prop(#filter{bitfilter = Bitfilter, bitmask = Bitmask, size = Size}, Key0, Key) -> + %% Validate basic properties of the generated key. 
It must be + %% greater than the old key, and match the bitmask: + ?assert(Key =:= overflow orelse (Key band Bitmask =:= Bitfilter)), + ?assert(Key > Key0, {Key, '>=', Key0}), + IMax = ones(Size), + %% Iterate through all keys between `Key0 + 1' and `Key' and + %% validate that none of them match the bitmask. Ultimately, it + %% means that `ratchet' function doesn't skip over any valid keys: + CheckGaps = fun + F(I) when I >= Key; I > IMax -> + true; + F(I) -> + ?assertNot( + I band Bitmask =:= Bitfilter, + {found_gap, Key0, I, Key} + ), + F(I + 1) + end, + CheckGaps(Key0 + 1). + +mkbmask(Keymapper, Filter0) -> + Filter = constraints_to_ranges(Keymapper, Filter0), + make_bitfilter(Keymapper, Filter). + +key2vec(Schema, Vector) -> + Keymapper = make_keymapper(Schema), + Key = vector_to_key(Keymapper, Vector), + ?assertEqual(Vector, key_to_vector(Keymapper, Key)). + +vec2key(Schema, Vector) -> + vector_to_key(make_keymapper(Schema), Vector). + +-endif. diff --git a/apps/emqx_durable_storage/src/emqx_ds_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_lts.erl new file mode 100644 index 000000000..d06854fd0 --- /dev/null +++ b/apps/emqx_durable_storage/src/emqx_ds_lts.erl @@ -0,0 +1,619 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_ds_lts). + +%% API: +-export([ + trie_create/1, trie_create/0, trie_restore/2, topic_key/3, match_topics/2, lookup_topic_key/2 +]). + +%% Debug: +-export([trie_next/3, trie_insert/3, dump_to_dot/2]). + +-export_type([options/0, static_key/0, trie/0]). + +-include_lib("stdlib/include/ms_transform.hrl"). + +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). +-endif. + +-elvis([{elvis_style, variable_naming_convention, disable}]). + +%%================================================================================ +%% Type declarations +%%================================================================================ + +%% End Of Topic +-define(EOT, []). +-define(PLUS, '+'). + +-type edge() :: binary() | ?EOT | ?PLUS. + +%% Fixed size binary +-type static_key() :: non_neg_integer(). + +-define(PREFIX, prefix). +-type state() :: static_key() | ?PREFIX. + +-type varying() :: [binary() | ?PLUS]. + +-type msg_storage_key() :: {static_key(), varying()}. + +-type threshold_fun() :: fun((non_neg_integer()) -> non_neg_integer()). + +-type persist_callback() :: fun((_Key, _Val) -> ok). + +-type options() :: + #{ + persist_callback => persist_callback(), + static_key_size => pos_integer() + }. + +-record(trie, { + persist :: persist_callback(), + static_key_size :: pos_integer(), + trie :: ets:tid(), + stats :: ets:tid() +}). + +-opaque trie() :: #trie{}. + +-record(trans, { + key :: {state(), edge()}, + next :: state() +}). 
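+
+%% Usage sketch (illustrative; see the unit tests at the bottom of this
+%% module): the trie learns wildcard positions from the topics it sees.
+%% `TFun' below keeps the root level exact and turns any deeper level
+%% into `+' once it accumulates 4 distinct children:
+%%
+%%   Trie = emqx_ds_lts:trie_create(),
+%%   TFun = fun(0) -> 1000;
+%%             (_) -> 4
+%%          end,
+%%   {Static, Varying} = emqx_ds_lts:topic_key(Trie, TFun, [<<"dev">>, <<"1">>, <<"temp">>]),
+%%   {ok, {Static, Varying}} = emqx_ds_lts:lookup_topic_key(Trie, [<<"dev">>, <<"1">>, <<"temp">>]).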
+
+%%================================================================================
+%% API functions
+%%================================================================================
+
+%% @doc Create an empty trie
+-spec trie_create(options()) -> trie().
+trie_create(UserOpts) ->
+    Defaults = #{
+        persist_callback => fun(_, _) -> ok end,
+        static_key_size => 8
+    },
+    #{
+        persist_callback := Persist,
+        static_key_size := StaticKeySize
+    } = maps:merge(Defaults, UserOpts),
+    Trie = ets:new(trie, [{keypos, #trans.key}, set, public]),
+    Stats = ets:new(stats, [{keypos, 1}, set, public]),
+    #trie{
+        persist = Persist,
+        static_key_size = StaticKeySize,
+        trie = Trie,
+        stats = Stats
+    }.
+
+-spec trie_create() -> trie().
+trie_create() ->
+    trie_create(#{}).
+
+%% @doc Restore a trie from a dump
+-spec trie_restore(options(), [{_Key, _Val}]) -> trie().
+trie_restore(Options, Dump) ->
+    Trie = trie_create(Options),
+    lists:foreach(
+        fun({{StateFrom, Token}, StateTo}) ->
+            trie_insert(Trie, StateFrom, Token, StateTo)
+        end,
+        Dump
+    ),
+    Trie.
+
+%% @doc Look up the topic key. Create a new one, if not found.
+-spec topic_key(trie(), threshold_fun(), [binary()]) -> msg_storage_key().
+topic_key(Trie, ThresholdFun, Tokens) ->
+    do_topic_key(Trie, ThresholdFun, 0, ?PREFIX, Tokens, []).
+
+%% @doc Return an existing topic key, if one exists.
+-spec lookup_topic_key(trie(), [binary()]) -> {ok, msg_storage_key()} | undefined.
+lookup_topic_key(Trie, Tokens) ->
+    do_lookup_topic_key(Trie, ?PREFIX, Tokens, []).
+
+%% @doc Return a list of keys of topics that match a given topic filter
+-spec match_topics(trie(), [binary() | '+' | '#']) ->
+    [msg_storage_key()].
+match_topics(Trie, TopicFilter) ->
+    do_match_topics(Trie, ?PREFIX, [], TopicFilter).
+
+%% @doc Dump the trie to graphviz format for debugging
+-spec dump_to_dot(trie(), file:filename()) -> ok.
+dump_to_dot(#trie{trie = Trie, stats = Stats}, Filename) ->
+    L = ets:tab2list(Trie),
+    {Nodes0, Edges} =
+        lists:foldl(
+            fun(#trans{key = {From, Label}, next = To}, {AccN, AccEdge}) ->
+                Edge = {From, To, Label},
+                {[From, To] ++ AccN, [Edge | AccEdge]}
+            end,
+            {[], []},
+            L
+        ),
+    Nodes =
+        lists:map(
+            fun(Node) ->
+                case ets:lookup(Stats, Node) of
+                    [{_, NChildren}] -> ok;
+                    [] -> NChildren = 0
+                end,
+                {Node, NChildren}
+            end,
+            lists:usort(Nodes0)
+        ),
+    {ok, FD} = file:open(Filename, [write]),
+    Print = fun
+        (?PREFIX) -> "prefix";
+        (NodeId) -> integer_to_binary(NodeId, 16)
+    end,
+    io:format(FD, "digraph {~n", []),
+    lists:foreach(
+        fun({Node, NChildren}) ->
+            Id = Print(Node),
+            io:format(FD, " \"~s\" [label=\"~s : ~p\"];~n", [Id, Id, NChildren])
+        end,
+        Nodes
+    ),
+    lists:foreach(
+        fun({From, To, Label}) ->
+            io:format(FD, " \"~s\" -> \"~s\" [label=\"~s\"];~n", [Print(From), Print(To), Label])
+        end,
+        Edges
+    ),
+    io:format(FD, "}~n", []),
+    file:close(FD).
+
+%%================================================================================
+%% Internal exports
+%%================================================================================
+
+-spec trie_next(trie(), state(), binary() | ?EOT) -> {Wildcard, state()} | undefined when
+    Wildcard :: boolean().
+trie_next(#trie{trie = Trie}, State, ?EOT) ->
+    case ets:lookup(Trie, {State, ?EOT}) of
+        [#trans{next = Next}] -> {false, Next};
+        [] -> undefined
+    end;
+trie_next(#trie{trie = Trie}, State, Token) ->
+    case ets:lookup(Trie, {State, Token}) of
+        [#trans{next = Next}] ->
+            {false, Next};
+        [] ->
+            case ets:lookup(Trie, {State, ?PLUS}) of
+                [#trans{next = Next}] -> {true, Next};
+                [] -> undefined
+            end
+    end.
+
+-spec trie_insert(trie(), state(), edge()) -> {Updated, state()} when
+    NChildren :: non_neg_integer(),
+    Updated :: false | NChildren.
+trie_insert(Trie, State, Token) ->
+    trie_insert(Trie, State, Token, get_id_for_key(Trie, State, Token)).
+
+%%================================================================================
+%% Internal functions
+%%================================================================================
+
+-spec trie_insert(trie(), state(), edge(), state()) -> {Updated, state()} when
+    NChildren :: non_neg_integer(),
+    Updated :: false | NChildren.
+trie_insert(#trie{trie = Trie, stats = Stats, persist = Persist}, State, Token, NewState) ->
+    Key = {State, Token},
+    Rec = #trans{
+        key = Key,
+        next = NewState
+    },
+    case ets:insert_new(Trie, Rec) of
+        true ->
+            ok = Persist(Key, NewState),
+            Inc =
+                case Token of
+                    ?EOT -> 0;
+                    ?PLUS -> 0;
+                    _ -> 1
+                end,
+            NChildren = ets:update_counter(Stats, State, {2, Inc}, {State, 0}),
+            {NChildren, NewState};
+        false ->
+            [#trans{next = NextState}] = ets:lookup(Trie, Key),
+            {false, NextState}
+    end.
+
+-spec get_id_for_key(trie(), state(), edge()) -> static_key().
+get_id_for_key(#trie{static_key_size = Size}, _State, _Token) ->
+    %% Requirements for the return value:
+    %%
+    %% It should be globally unique for the `{State, Token}` pair. Other
+    %% than that, there are no requirements. The return value doesn't
+    %% even have to be deterministic, since the states are saved in the
+    %% trie.
+    %%
+    %% The generated value becomes the ID of the topic in the durable
+    %% storage. Its size should be relatively small to reduce the
+    %% overhead of storing messages.
+    %%
+    %% If we want to impress the computer science crowd, sorry, I mean
+    %% to minimize storage requirements, we can even employ Huffman
+    %% coding based on the frequency of messages.
+    <<Int:(Size * 8)>> = crypto:strong_rand_bytes(Size),
+    Int.
+
+%% erlfmt-ignore
+-spec do_match_topics(trie(), state(), [binary() | '+'], [binary() | '+' | '#']) ->
+    list().
+do_match_topics(Trie, State, Varying, []) ->
+    case trie_next(Trie, State, ?EOT) of
+        {false, Static} -> [{Static, lists:reverse(Varying)}];
+        undefined -> []
+    end;
+do_match_topics(Trie, State, Varying, ['#']) ->
+    Emanating = emanating(Trie, State, ?PLUS),
+    lists:flatmap(
+        fun
+            ({?EOT, Static}) ->
+                [{Static, lists:reverse(Varying)}];
+            ({?PLUS, NextState}) ->
+                do_match_topics(Trie, NextState, [?PLUS | Varying], ['#']);
+            ({_, NextState}) ->
+                do_match_topics(Trie, NextState, Varying, ['#'])
+        end,
+        Emanating
+    );
+do_match_topics(Trie, State, Varying, [Level | Rest]) ->
+    Emanating = emanating(Trie, State, Level),
+    lists:flatmap(
+        fun
+            ({?EOT, _NextState}) ->
+                [];
+            ({?PLUS, NextState}) ->
+                do_match_topics(Trie, NextState, [Level | Varying], Rest);
+            ({_, NextState}) ->
+                do_match_topics(Trie, NextState, Varying, Rest)
+        end,
+        Emanating
+    ).
+
+-spec do_lookup_topic_key(trie(), state(), [binary()], [binary()]) ->
+    {ok, msg_storage_key()} | undefined.
+do_lookup_topic_key(Trie, State, [], Varying) -> + case trie_next(Trie, State, ?EOT) of + {false, Static} -> + {ok, {Static, lists:reverse(Varying)}}; + undefined -> + undefined + end; +do_lookup_topic_key(Trie, State, [Tok | Rest], Varying) -> + case trie_next(Trie, State, Tok) of + {true, NextState} -> + do_lookup_topic_key(Trie, NextState, Rest, [Tok | Varying]); + {false, NextState} -> + do_lookup_topic_key(Trie, NextState, Rest, Varying); + undefined -> + undefined + end. + +do_topic_key(Trie, _, _, State, [], Varying) -> + %% We reached the end of topic. Assert: Trie node that corresponds + %% to EOT cannot be a wildcard. + {_, false, Static} = trie_next_(Trie, State, ?EOT), + {Static, lists:reverse(Varying)}; +do_topic_key(Trie, ThresholdFun, Depth, State, [Tok | Rest], Varying0) -> + % TODO: it's not necessary to call it every time. + Threshold = ThresholdFun(Depth), + Varying = + case trie_next_(Trie, State, Tok) of + {NChildren, _, NextState} when is_integer(NChildren), NChildren >= Threshold -> + %% Number of children for the trie node reached the + %% threshold, we need to insert wildcard here. + {_, _WildcardState} = trie_insert(Trie, State, ?PLUS), + Varying0; + {_, false, NextState} -> + Varying0; + {_, true, NextState} -> + %% This topic level is marked as wildcard in the trie, + %% we need to add it to the varying part of the key: + [Tok | Varying0] + end, + do_topic_key(Trie, ThresholdFun, Depth + 1, NextState, Rest, Varying). + +%% @doc Has side effects! Inserts missing elements +-spec trie_next_(trie(), state(), binary() | ?EOT) -> {New, Wildcard, state()} when + New :: false | non_neg_integer(), + Wildcard :: boolean(). +trie_next_(Trie, State, Token) -> + case trie_next(Trie, State, Token) of + {Wildcard, NextState} -> + {false, Wildcard, NextState}; + undefined -> + {Updated, NextState} = trie_insert(Trie, State, Token), + {Updated, false, NextState} + end. + +%% @doc Return all edges emanating from a node: +%% erlfmt-ignore +-spec emanating(trie(), state(), edge()) -> [{edge(), state()}]. +emanating(#trie{trie = Tab}, State, ?PLUS) -> + ets:select( + Tab, + ets:fun2ms( + fun(#trans{key = {S, Edge}, next = Next}) when S == State -> + {Edge, Next} + end + ) + ); +emanating(#trie{trie = Tab}, State, ?EOT) -> + case ets:lookup(Tab, {State, ?EOT}) of + [#trans{next = Next}] -> [{?EOT, Next}]; + [] -> [] + end; +emanating(#trie{trie = Tab}, State, Bin) when is_binary(Bin) -> + [ + {Edge, Next} + || #trans{key = {_, Edge}, next = Next} <- + ets:lookup(Tab, {State, ?PLUS}) ++ + ets:lookup(Tab, {State, Bin}) + ]. + +%%================================================================================ +%% Tests +%%================================================================================ + +-ifdef(TEST). 
+ +trie_basic_test() -> + T = trie_create(), + ?assertMatch(undefined, trie_next(T, ?PREFIX, <<"foo">>)), + {1, S1} = trie_insert(T, ?PREFIX, <<"foo">>), + ?assertMatch({false, S1}, trie_insert(T, ?PREFIX, <<"foo">>)), + ?assertMatch({false, S1}, trie_next(T, ?PREFIX, <<"foo">>)), + + ?assertMatch(undefined, trie_next(T, ?PREFIX, <<"bar">>)), + {2, S2} = trie_insert(T, ?PREFIX, <<"bar">>), + ?assertMatch({false, S2}, trie_insert(T, ?PREFIX, <<"bar">>)), + + ?assertMatch(undefined, trie_next(T, S1, <<"foo">>)), + ?assertMatch(undefined, trie_next(T, S1, <<"bar">>)), + {1, S11} = trie_insert(T, S1, <<"foo">>), + {2, S12} = trie_insert(T, S1, <<"bar">>), + ?assertMatch({false, S11}, trie_next(T, S1, <<"foo">>)), + ?assertMatch({false, S12}, trie_next(T, S1, <<"bar">>)), + + ?assertMatch(undefined, trie_next(T, S11, <<"bar">>)), + {1, S111} = trie_insert(T, S11, <<"bar">>), + ?assertMatch({false, S111}, trie_next(T, S11, <<"bar">>)). + +lookup_key_test() -> + T = trie_create(), + {_, S1} = trie_insert(T, ?PREFIX, <<"foo">>), + {_, S11} = trie_insert(T, S1, <<"foo">>), + %% Topics don't match until we insert ?EOT: + ?assertMatch( + undefined, + lookup_topic_key(T, [<<"foo">>]) + ), + ?assertMatch( + undefined, + lookup_topic_key(T, [<<"foo">>, <<"foo">>]) + ), + {_, S10} = trie_insert(T, S1, ?EOT), + {_, S110} = trie_insert(T, S11, ?EOT), + ?assertMatch( + {ok, {S10, []}}, + lookup_topic_key(T, [<<"foo">>]) + ), + ?assertMatch( + {ok, {S110, []}}, + lookup_topic_key(T, [<<"foo">>, <<"foo">>]) + ), + %% The rest of keys still don't match: + ?assertMatch( + undefined, + lookup_topic_key(T, [<<"bar">>]) + ), + ?assertMatch( + undefined, + lookup_topic_key(T, [<<"bar">>, <<"foo">>]) + ). + +wildcard_lookup_test() -> + T = trie_create(), + {1, S1} = trie_insert(T, ?PREFIX, <<"foo">>), + %% Plus doesn't increase the number of children + {0, S11} = trie_insert(T, S1, ?PLUS), + {1, S111} = trie_insert(T, S11, <<"foo">>), + %% ?EOT doesn't increase the number of children + {0, S1110} = trie_insert(T, S111, ?EOT), + ?assertMatch( + {ok, {S1110, [<<"bar">>]}}, + lookup_topic_key(T, [<<"foo">>, <<"bar">>, <<"foo">>]) + ), + ?assertMatch( + {ok, {S1110, [<<"quux">>]}}, + lookup_topic_key(T, [<<"foo">>, <<"quux">>, <<"foo">>]) + ), + ?assertMatch( + undefined, + lookup_topic_key(T, [<<"foo">>]) + ), + ?assertMatch( + undefined, + lookup_topic_key(T, [<<"foo">>, <<"bar">>]) + ), + ?assertMatch( + undefined, + lookup_topic_key(T, [<<"foo">>, <<"bar">>, <<"bar">>]) + ), + ?assertMatch( + undefined, + lookup_topic_key(T, [<<"bar">>, <<"foo">>, <<"foo">>]) + ), + {_, S10} = trie_insert(T, S1, ?EOT), + ?assertMatch( + {ok, {S10, []}}, + lookup_topic_key(T, [<<"foo">>]) + ). 
+ +%% erlfmt-ignore +topic_key_test() -> + T = trie_create(), + try + Threshold = 4, + ThresholdFun = fun(0) -> 1000; + (_) -> Threshold + end, + %% Test that bottom layer threshold is high: + lists:foreach( + fun(I) -> + {_, []} = test_key(T, ThresholdFun, [I, 99999, 999999, 99999]) + end, + lists:seq(1, 10)), + %% Test adding children on the 2nd level: + lists:foreach( + fun(I) -> + case test_key(T, ThresholdFun, [1, I, 1]) of + {_, []} -> + ?assert(I < Threshold, {I, '<', Threshold}), + ok; + {_, [Var]} -> + ?assert(I >= Threshold, {I, '>=', Threshold}), + ?assertEqual(Var, integer_to_binary(I)) + end + end, + lists:seq(1, 100)), + %% This doesn't affect 2nd level with a different prefix: + ?assertMatch({_, []}, test_key(T, ThresholdFun, [2, 1, 1])), + ?assertMatch({_, []}, test_key(T, ThresholdFun, [2, 10, 1])), + %% This didn't retroactively change the indexes that were + %% created prior to reaching the threshold: + ?assertMatch({_, []}, test_key(T, ThresholdFun, [1, 1, 1])), + ?assertMatch({_, []}, test_key(T, ThresholdFun, [1, 2, 1])), + %% Now create another level of +: + lists:foreach( + fun(I) -> + case test_key(T, ThresholdFun, [1, 42, 1, I, 42]) of + {_, [<<"42">>]} when I =< Threshold -> %% TODO: off by 1 error + ok; + {_, [<<"42">>, Var]} -> + ?assertEqual(Var, integer_to_binary(I)); + Ret -> + error({Ret, I}) + end + end, + lists:seq(1, 100)) + after + dump_to_dot(T, filename:join("_build", atom_to_list(?FUNCTION_NAME) ++ ".dot")) + end. + +%% erlfmt-ignore +topic_match_test() -> + T = trie_create(), + try + Threshold = 2, + ThresholdFun = fun(0) -> 1000; + (_) -> Threshold + end, + {S1, []} = test_key(T, ThresholdFun, [1]), + {S11, []} = test_key(T, ThresholdFun, [1, 1]), + {S12, []} = test_key(T, ThresholdFun, [1, 2]), + {S111, []} = test_key(T, ThresholdFun, [1, 1, 1]), + %% Match concrete topics: + assert_match_topics(T, [1], [{S1, []}]), + assert_match_topics(T, [1, 1], [{S11, []}]), + assert_match_topics(T, [1, 1, 1], [{S111, []}]), + %% Match topics with +: + assert_match_topics(T, [1, '+'], [{S11, []}, {S12, []}]), + assert_match_topics(T, [1, '+', 1], [{S111, []}]), + %% Match topics with #: + assert_match_topics(T, [1, '#'], + [{S1, []}, + {S11, []}, {S12, []}, + {S111, []}]), + assert_match_topics(T, [1, 1, '#'], + [{S11, []}, + {S111, []}]), + %% Now add learned wildcards: + {S21, []} = test_key(T, ThresholdFun, [2, 1]), + {S22, []} = test_key(T, ThresholdFun, [2, 2]), + {S2_, [<<"3">>]} = test_key(T, ThresholdFun, [2, 3]), + {S2_11, [<<"3">>]} = test_key(T, ThresholdFun, [2, 3, 1, 1]), + {S2_12, [<<"4">>]} = test_key(T, ThresholdFun, [2, 4, 1, 2]), + {S2_1_, [<<"3">>, <<"3">>]} = test_key(T, ThresholdFun, [2, 3, 1, 3]), + %% %% Check matching: + assert_match_topics(T, [2, 2], + [{S22, []}, {S2_, [<<"2">>]}]), + assert_match_topics(T, [2, '+'], + [{S22, []}, {S21, []}, {S2_, ['+']}]), + assert_match_topics(T, [2, '#'], + [{S21, []}, {S22, []}, + {S2_, ['+']}, + {S2_11, ['+']}, {S2_12, ['+']}, {S2_1_, ['+', '+']}]), + ok + after + dump_to_dot(T, filename:join("_build", atom_to_list(?FUNCTION_NAME) ++ ".dot")) + end. + +-define(keys_history, topic_key_history). + +%% erlfmt-ignore +assert_match_topics(Trie, Filter0, Expected) -> + Filter = lists:map(fun(I) when is_integer(I) -> integer_to_binary(I); + (I) -> I + end, + Filter0), + Matched = match_topics(Trie, Filter), + ?assertMatch( #{missing := [], unexpected := []} + , #{ missing => Expected -- Matched + , unexpected => Matched -- Expected + } + , Filter + ). 
+ +%% erlfmt-ignore +test_key(Trie, Threshold, Topic0) -> + Topic = [integer_to_binary(I) || I <- Topic0], + Ret = topic_key(Trie, Threshold, Topic), + %% Test idempotency: + Ret1 = topic_key(Trie, Threshold, Topic), + ?assertEqual(Ret, Ret1, Topic), + %% Add new key to the history: + case get(?keys_history) of + undefined -> OldHistory = #{}; + OldHistory -> ok + end, + %% Test that the generated keys are always unique for the topic: + History = maps:update_with( + Ret, + fun(Old) -> + case Old =:= Topic of + true -> Old; + false -> error(#{ '$msg' => "Duplicate key!" + , key => Ret + , old_topic => Old + , new_topic => Topic + }) + end + end, + Topic, + OldHistory), + put(?keys_history, History), + {ok, Ret} = lookup_topic_key(Trie, Topic), + Ret. + +-endif. diff --git a/apps/emqx_durable_storage/src/emqx_ds_message_storage_bitmask.erl b/apps/emqx_durable_storage/src/emqx_ds_message_storage_bitmask.erl deleted file mode 100644 index 7b141b202..000000000 --- a/apps/emqx_durable_storage/src/emqx_ds_message_storage_bitmask.erl +++ /dev/null @@ -1,742 +0,0 @@ -%%-------------------------------------------------------------------- -%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. -%%-------------------------------------------------------------------- - --module(emqx_ds_message_storage_bitmask). - -%%================================================================================ -%% @doc Description of the schema -%% -%% Let us assume that `T' is a topic and `t' is time. These are the two -%% dimensions used to index messages. They can be viewed as -%% "coordinates" of an MQTT message in a 2D space. -%% -%% Oftentimes, when wildcard subscription is used, keys must be -%% scanned in both dimensions simultaneously. -%% -%% Rocksdb allows to iterate over sorted keys very fast. This means we -%% need to map our two-dimentional keys to a single index that is -%% sorted in a way that helps to iterate over both time and topic -%% without having to do a lot of random seeks. -%% -%% == Mapping of 2D keys to rocksdb keys == -%% -%% We use "zigzag" pattern to store messages, where rocksdb key is -%% composed like like this: -%% -%% |ttttt|TTTTTTTTT|tttt| -%% ^ ^ ^ -%% | | | -%% +-------+ | +---------+ -%% | | | -%% most significant topic hash least significant -%% bits of timestamp bits of timestamp -%% (a.k.a epoch) (a.k.a time offset) -%% -%% Topic hash is level-aware: each topic level is hashed separately -%% and the resulting hashes are bitwise-concatentated. This allows us -%% to map topics to fixed-length bitstrings while keeping some degree -%% of information about the hierarchy. -%% -%% Next important concept is what we call "epoch". Duration of the -%% epoch is determined by maximum time offset. Epoch is calculated by -%% shifting bits of the timestamp right. 
-%% -%% The resulting index is a space-filling curve that looks like -%% this in the topic-time 2D space: -%% -%% T ^ ---->------ |---->------ |---->------ -%% | --/ / --/ / --/ -%% | -<-/ | -<-/ | -<-/ -%% | -/ | -/ | -/ -%% | ---->------ | ---->------ | ---->------ -%% | --/ / --/ / --/ -%% | ---/ | ---/ | ---/ -%% | -/ ^ -/ ^ -/ -%% | ---->------ | ---->------ | ---->------ -%% | --/ / --/ / --/ -%% | -<-/ | -<-/ | -<-/ -%% | -/ | -/ | -/ -%% | ---->------| ---->------| ----------> -%% | -%% -+------------+-----------------------------> t -%% epoch -%% -%% This structure allows to quickly seek to a the first message that -%% was recorded in a certain epoch in a certain topic or a -%% group of topics matching filter like `foo/bar/#`. -%% -%% Due to its structure, for each pair of rocksdb keys K1 and K2, such -%% that K1 > K2 and topic(K1) = topic(K2), timestamp(K1) > -%% timestamp(K2). -%% That is, replay doesn't reorder messages published in each -%% individual topic. -%% -%% This property doesn't hold between different topics, but it's not deemed -%% a problem right now. -%% -%%================================================================================ - --behaviour(emqx_ds_storage_layer). - -%% API: --export([create_new/3, open/5]). --export([make_keymapper/1]). - --export([store/5]). --export([delete/4]). --export([make_iterator/2]). --export([make_iterator/3]). --export([next/1]). - --export([preserve_iterator/1]). --export([restore_iterator/3]). --export([refresh_iterator/1]). - -%% Debug/troubleshooting: -%% Keymappers --export([ - keymapper_info/1, - compute_bitstring/3, - compute_topic_bitmask/2, - compute_time_bitmask/1, - hash/2 -]). - -%% Keyspace filters --export([ - make_keyspace_filter/2, - compute_initial_seek/1, - compute_next_seek/2, - compute_time_seek/3, - compute_topic_seek/4 -]). - --export_type([db/0, iterator/0, schema/0]). - --export_type([options/0]). --export_type([iteration_options/0]). - --compile( - {inline, [ - bitwise_concat/3, - ones/1, - successor/1, - topic_hash_matches/3, - time_matches/3 - ]} -). - -%%================================================================================ -%% Type declarations -%%================================================================================ - --type topic() :: emqx_ds:topic(). --type topic_filter() :: emqx_ds:topic_filter(). --type time() :: emqx_ds:time(). - -%% Number of bits --type bits() :: non_neg_integer(). - -%% Key of a RocksDB record. --type key() :: binary(). - -%% Distribution of entropy among topic levels. -%% Example: [4, 8, 16] means that level 1 gets 4 bits, level 2 gets 8 bits, -%% and _rest of levels_ (if any) get 16 bits. --type bits_per_level() :: [bits(), ...]. - --type options() :: #{ - %% Number of bits in a message timestamp. - timestamp_bits := bits(), - %% Number of bits in a key allocated to each level in a message topic. - topic_bits_per_level := bits_per_level(), - %% Maximum granularity of iteration over time. - epoch := time(), - - iteration => iteration_options(), - - cf_options => emqx_ds_storage_layer:db_cf_options() -}. - --type iteration_options() :: #{ - %% Request periodic iterator refresh. - %% This might be helpful during replays taking a lot of time (e.g. tens of seconds). - %% Note that `{every, 1000}` means 1000 _operations_ with the iterator which is not - %% the same as 1000 replayed messages. - iterator_refresh => {every, _NumOperations :: pos_integer()} -}. 
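
To make the epoch notion from the schema description above concrete: the number of timestamp LSBs allocated to the time offset is derived from the configured maximum epoch duration, and the epoch is simply the remaining high bits. A worked example with illustrative numbers:

    %% epoch = 1024 time units -> floor(math:log2(1024)) = 10 offset bits.
    Epoch = fun(Timestamp) -> Timestamp bsr 10 end.
    %% Epoch(5000) =:= 4 and Epoch(5119) =:= 4: same epoch;
    %% Epoch(5120) =:= 5: the next epoch begins.
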
- -%% Persistent configuration of the generation, it is used to create db -%% record when the database is reopened --record(schema, {keymapper :: keymapper()}). - --opaque schema() :: #schema{}. - --record(db, { - shard :: emqx_ds:shard(), - handle :: rocksdb:db_handle(), - cf :: rocksdb:cf_handle(), - keymapper :: keymapper(), - write_options = [{sync, true}] :: emqx_ds_storage_layer:db_write_options(), - read_options = [] :: emqx_ds_storage_layer:db_read_options() -}). - --record(it, { - handle :: rocksdb:itr_handle(), - filter :: keyspace_filter(), - cursor :: binary() | undefined, - next_action :: {seek, binary()} | next, - refresh_counter :: {non_neg_integer(), pos_integer()} | undefined -}). - --record(filter, { - keymapper :: keymapper(), - topic_filter :: topic_filter(), - start_time :: integer(), - hash_bitfilter :: integer(), - hash_bitmask :: integer(), - time_bitfilter :: integer(), - time_bitmask :: integer() -}). - -% NOTE -% Keymapper decides how to map messages into RocksDB column family keyspace. --record(keymapper, { - source :: [bitsource(), ...], - bitsize :: bits(), - epoch :: non_neg_integer() -}). - --type bitsource() :: - %% Consume `_Size` bits from timestamp starting at `_Offset`th bit. - %% TODO consistency - {timestamp, _Offset :: bits(), _Size :: bits()} - %% Consume next topic level (either one or all of them) and compute `_Size` bits-wide hash. - | {hash, level | levels, _Size :: bits()}. - --opaque db() :: #db{}. --opaque iterator() :: #it{}. --type keymapper() :: #keymapper{}. --type keyspace_filter() :: #filter{}. - -%%================================================================================ -%% API funcions -%%================================================================================ - -%% Create a new column family for the generation and a serializable representation of the schema --spec create_new(rocksdb:db_handle(), emqx_ds_storage_layer:gen_id(), options()) -> - {schema(), emqx_ds_storage_layer:cf_refs()}. -create_new(DBHandle, GenId, Options) -> - CFName = data_cf(GenId), - CFOptions = maps:get(cf_options, Options, []), - {ok, CFHandle} = rocksdb:create_column_family(DBHandle, CFName, CFOptions), - Schema = #schema{keymapper = make_keymapper(Options)}, - {Schema, [{CFName, CFHandle}]}. - -%% Reopen the database --spec open( - emqx_ds:shard(), - rocksdb:db_handle(), - emqx_ds_storage_layer:gen_id(), - emqx_ds_storage_layer:cf_refs(), - schema() -) -> - db(). -open(Shard, DBHandle, GenId, CFs, #schema{keymapper = Keymapper}) -> - {value, {_, CFHandle}} = lists:keysearch(data_cf(GenId), 1, CFs), - #db{ - shard = Shard, - handle = DBHandle, - cf = CFHandle, - keymapper = Keymapper - }. - --spec make_keymapper(options()) -> keymapper(). -make_keymapper(#{ - timestamp_bits := TimestampBits, - topic_bits_per_level := BitsPerLevel, - epoch := MaxEpoch -}) -> - TimestampLSBs = min(TimestampBits, floor(math:log2(MaxEpoch))), - TimestampMSBs = TimestampBits - TimestampLSBs, - NLevels = length(BitsPerLevel), - {LevelBits, [TailLevelsBits]} = lists:split(NLevels - 1, BitsPerLevel), - Source = lists:flatten([ - [{timestamp, TimestampLSBs, TimestampMSBs} || TimestampMSBs > 0], - [{hash, level, Bits} || Bits <- LevelBits], - {hash, levels, TailLevelsBits}, - [{timestamp, 0, TimestampLSBs} || TimestampLSBs > 0] - ]), - #keymapper{ - source = Source, - bitsize = lists:sum([S || {_, _, S} <- Source]), - epoch = 1 bsl TimestampLSBs - }. - --spec store(db(), emqx_guid:guid(), emqx_ds:time(), topic(), binary()) -> - ok | {error, _TODO}. 
-store(DB = #db{handle = DBHandle, cf = CFHandle}, MessageID, PublishedAt, Topic, MessagePayload) -> - Key = make_message_key(Topic, PublishedAt, MessageID, DB#db.keymapper), - Value = make_message_value(Topic, MessagePayload), - rocksdb:put(DBHandle, CFHandle, Key, Value, DB#db.write_options). - --spec delete(db(), emqx_guid:guid(), emqx_ds:time(), topic()) -> - ok | {error, _TODO}. -delete(DB = #db{handle = DBHandle, cf = CFHandle}, MessageID, PublishedAt, Topic) -> - Key = make_message_key(Topic, PublishedAt, MessageID, DB#db.keymapper), - rocksdb:delete(DBHandle, CFHandle, Key, DB#db.write_options). - --spec make_iterator(db(), emqx_ds:replay()) -> - {ok, iterator()} | {error, _TODO}. -make_iterator(DB, Replay) -> - {Keyspace, _ShardId} = DB#db.shard, - Options = emqx_ds_conf:iteration_options(Keyspace), - make_iterator(DB, Replay, Options). - --spec make_iterator(db(), emqx_ds:replay(), iteration_options()) -> - % {error, invalid_start_time}? might just start from the beginning of time - % and call it a day: client violated the contract anyway. - {ok, iterator()} | {error, _TODO}. -make_iterator(DB = #db{handle = DBHandle, cf = CFHandle}, Replay, Options) -> - case rocksdb:iterator(DBHandle, CFHandle, DB#db.read_options) of - {ok, ITHandle} -> - Filter = make_keyspace_filter(Replay, DB#db.keymapper), - InitialSeek = combine(compute_initial_seek(Filter), <<>>, DB#db.keymapper), - RefreshCounter = make_refresh_counter(maps:get(iterator_refresh, Options, undefined)), - {ok, #it{ - handle = ITHandle, - filter = Filter, - next_action = {seek, InitialSeek}, - refresh_counter = RefreshCounter - }}; - Err -> - Err - end. - --spec next(iterator()) -> {value, binary(), iterator()} | none | {error, closed}. -next(It0 = #it{filter = #filter{keymapper = Keymapper}}) -> - It = maybe_refresh_iterator(It0), - case rocksdb:iterator_move(It#it.handle, It#it.next_action) of - % spec says `{ok, Key}` is also possible but the implementation says it's not - {ok, Key, Value} -> - % Preserve last seen key in the iterator so it could be restored / refreshed later. - ItNext = It#it{cursor = Key}, - Bitstring = extract(Key, Keymapper), - case match_next(Bitstring, Value, It#it.filter) of - {_Topic, Payload} -> - {value, Payload, ItNext#it{next_action = next}}; - next -> - next(ItNext#it{next_action = next}); - NextBitstring when is_integer(NextBitstring) -> - NextSeek = combine(NextBitstring, <<>>, Keymapper), - next(ItNext#it{next_action = {seek, NextSeek}}); - none -> - stop_iteration(ItNext) - end; - {error, invalid_iterator} -> - stop_iteration(It); - {error, iterator_closed} -> - {error, closed} - end. - --spec preserve_iterator(iterator()) -> binary(). -preserve_iterator(#it{cursor = Cursor}) -> - State = #{ - v => 1, - cursor => Cursor - }, - term_to_binary(State). - --spec restore_iterator(db(), emqx_ds:replay(), binary()) -> - {ok, iterator()} | {error, _TODO}. -restore_iterator(DB, Replay, Serial) when is_binary(Serial) -> - State = binary_to_term(Serial), - restore_iterator(DB, Replay, State); -restore_iterator(DB, Replay, #{ - v := 1, - cursor := Cursor -}) -> - case make_iterator(DB, Replay) of - {ok, It} when Cursor == undefined -> - % Iterator was preserved right after it has been made. - {ok, It}; - {ok, It} -> - % Iterator was preserved mid-replay, seek right past the last seen key. - {ok, It#it{cursor = Cursor, next_action = {seek, successor(Cursor)}}}; - Err -> - Err - end. - --spec refresh_iterator(iterator()) -> iterator(). 
-refresh_iterator(It = #it{handle = Handle, cursor = Cursor, next_action = Action}) ->
-    case rocksdb:iterator_refresh(Handle) of
-        ok when Action =:= next ->
-            % Now the underlying iterator is invalid, need to seek instead.
-            It#it{next_action = {seek, successor(Cursor)}};
-        ok ->
-            % Now the underlying iterator is invalid, but will seek soon anyway.
-            It;
-        {error, _} ->
-            % Implementation could in theory return an {error, ...} tuple.
-            % Supposedly our best bet is to ignore it.
-            % TODO logging?
-            It
-    end.
-
-%%================================================================================
-%% Internal exports
-%%================================================================================
-
--spec keymapper_info(keymapper()) ->
-    #{source := [bitsource()], bitsize := bits(), epoch := time()}.
-keymapper_info(#keymapper{source = Source, bitsize = Bitsize, epoch = Epoch}) ->
-    #{source => Source, bitsize => Bitsize, epoch => Epoch}.
-
-make_message_key(Topic, PublishedAt, MessageID, Keymapper) ->
-    combine(compute_bitstring(Topic, PublishedAt, Keymapper), MessageID, Keymapper).
-
-make_message_value(Topic, MessagePayload) ->
-    term_to_binary({Topic, MessagePayload}).
-
-unwrap_message_value(Binary) ->
-    binary_to_term(Binary).
-
--spec combine(_Bitstring :: integer(), emqx_guid:guid() | <<>>, keymapper()) ->
-    key().
-combine(Bitstring, MessageID, #keymapper{bitsize = Size}) ->
-    <<Bitstring:Size/integer, MessageID/binary>>.
-
--spec extract(key(), keymapper()) ->
-    _Bitstring :: integer().
-extract(Key, #keymapper{bitsize = Size}) ->
-    <<Bitstring:Size/integer, _MessageID/binary>> = Key,
-    Bitstring.
-
--spec compute_bitstring(topic_filter(), time(), keymapper()) -> integer().
-compute_bitstring(TopicFilter, Timestamp, #keymapper{source = Source}) ->
-    compute_bitstring(TopicFilter, Timestamp, Source, 0).
-
--spec compute_topic_bitmask(topic_filter(), keymapper()) -> integer().
-compute_topic_bitmask(TopicFilter, #keymapper{source = Source}) ->
-    compute_topic_bitmask(TopicFilter, Source, 0).
-
--spec compute_time_bitmask(keymapper()) -> integer().
-compute_time_bitmask(#keymapper{source = Source}) ->
-    compute_time_bitmask(Source, 0).
-
--spec hash(term(), bits()) -> integer().
-hash(Input, Bits) ->
-    % at most 32 bits
-    erlang:phash2(Input, 1 bsl Bits).
-
--spec make_keyspace_filter(emqx_ds:replay(), keymapper()) -> keyspace_filter().
-make_keyspace_filter({TopicFilter, StartTime}, Keymapper) ->
-    Bitstring = compute_bitstring(TopicFilter, StartTime, Keymapper),
-    HashBitmask = compute_topic_bitmask(TopicFilter, Keymapper),
-    TimeBitmask = compute_time_bitmask(Keymapper),
-    HashBitfilter = Bitstring band HashBitmask,
-    TimeBitfilter = Bitstring band TimeBitmask,
-    #filter{
-        keymapper = Keymapper,
-        topic_filter = TopicFilter,
-        start_time = StartTime,
-        hash_bitfilter = HashBitfilter,
-        hash_bitmask = HashBitmask,
-        time_bitfilter = TimeBitfilter,
-        time_bitmask = TimeBitmask
-    }.
-
--spec compute_initial_seek(keyspace_filter()) -> integer().
-compute_initial_seek(#filter{hash_bitfilter = HashBitfilter, time_bitfilter = TimeBitfilter}) ->
-    % Should be the same as `compute_initial_seek(0, Filter)`.
-    HashBitfilter bor TimeBitfilter.
-
--spec compute_next_seek(integer(), keyspace_filter()) -> integer().
-compute_next_seek(
-    Bitstring,
-    Filter = #filter{
-        hash_bitfilter = HashBitfilter,
-        hash_bitmask = HashBitmask,
-        time_bitfilter = TimeBitfilter,
-        time_bitmask = TimeBitmask
-    }
-) ->
-    HashMatches = topic_hash_matches(Bitstring, HashBitfilter, HashBitmask),
-    TimeMatches = time_matches(Bitstring, TimeBitfilter, TimeBitmask),
-    compute_next_seek(HashMatches, TimeMatches, Bitstring, Filter).
-
-%%================================================================================
-%% Internal functions
-%%================================================================================
-
-compute_bitstring(Topic, Timestamp, [{timestamp, Offset, Size} | Rest], Acc) ->
-    I = (Timestamp bsr Offset) band ones(Size),
-    compute_bitstring(Topic, Timestamp, Rest, bitwise_concat(Acc, I, Size));
-compute_bitstring([], Timestamp, [{hash, level, Size} | Rest], Acc) ->
-    I = hash(<<"/">>, Size),
-    compute_bitstring([], Timestamp, Rest, bitwise_concat(Acc, I, Size));
-compute_bitstring([Level | Tail], Timestamp, [{hash, level, Size} | Rest], Acc) ->
-    I = hash(Level, Size),
-    compute_bitstring(Tail, Timestamp, Rest, bitwise_concat(Acc, I, Size));
-compute_bitstring(Tail, Timestamp, [{hash, levels, Size} | Rest], Acc) ->
-    I = hash(Tail, Size),
-    compute_bitstring(Tail, Timestamp, Rest, bitwise_concat(Acc, I, Size));
-compute_bitstring(_, _, [], Acc) ->
-    Acc.
-
-compute_topic_bitmask(Filter, [{timestamp, _, Size} | Rest], Acc) ->
-    compute_topic_bitmask(Filter, Rest, bitwise_concat(Acc, 0, Size));
-compute_topic_bitmask(['#'], [{hash, _, Size} | Rest], Acc) ->
-    compute_topic_bitmask(['#'], Rest, bitwise_concat(Acc, 0, Size));
-compute_topic_bitmask(['+' | Tail], [{hash, _, Size} | Rest], Acc) ->
-    compute_topic_bitmask(Tail, Rest, bitwise_concat(Acc, 0, Size));
-compute_topic_bitmask([], [{hash, level, Size} | Rest], Acc) ->
-    compute_topic_bitmask([], Rest, bitwise_concat(Acc, ones(Size), Size));
-compute_topic_bitmask([_ | Tail], [{hash, level, Size} | Rest], Acc) ->
-    compute_topic_bitmask(Tail, Rest, bitwise_concat(Acc, ones(Size), Size));
-compute_topic_bitmask(Tail, [{hash, levels, Size} | Rest], Acc) ->
-    Mask =
-        case lists:member('+', Tail) orelse lists:member('#', Tail) of
-            true -> 0;
-            false -> ones(Size)
-        end,
-    compute_topic_bitmask([], Rest, bitwise_concat(Acc, Mask, Size));
-compute_topic_bitmask(_, [], Acc) ->
-    Acc.
-
-compute_time_bitmask([{timestamp, _, Size} | Rest], Acc) ->
-    compute_time_bitmask(Rest, bitwise_concat(Acc, ones(Size), Size));
-compute_time_bitmask([{hash, _, Size} | Rest], Acc) ->
-    compute_time_bitmask(Rest, bitwise_concat(Acc, 0, Size));
-compute_time_bitmask([], Acc) ->
-    Acc.
-
-bitwise_concat(Acc, Item, ItemSize) ->
-    (Acc bsl ItemSize) bor Item.
-
-ones(Bits) ->
-    1 bsl Bits - 1.
-
--spec successor(key()) -> key().
-successor(Key) ->
-    <<Key/binary, 0:8>>.
-
-%% |123|345|678|
-%%  foo bar baz
-
-%% |123|000|678| - |123|fff|678|
-
-%% foo + baz
-
-%% |fff|000|fff|
-
-%% |123|000|678|
-
-%% |123|056|678| & |fff|000|fff| = |123|000|678|.
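
The hex sketch above can be checked directly in a shell; each hex digit stands for one 4-bit group, `fff` marks topic levels that must match exactly, and `000` the level blanked out by a `+` wildcard:

    %% foo/+/baz: the middle level is masked away, so a key whose middle
    %% digits differ still satisfies the filter.
    Bitstring = 16#123056678,
    HashBitmask = 16#FFF000FFF,
    HashBitfilter = 16#123000678,
    true = (Bitstring band HashBitmask) =:= HashBitfilter.
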
- -match_next( - Bitstring, - Value, - Filter = #filter{ - topic_filter = TopicFilter, - hash_bitfilter = HashBitfilter, - hash_bitmask = HashBitmask, - time_bitfilter = TimeBitfilter, - time_bitmask = TimeBitmask - } -) -> - HashMatches = topic_hash_matches(Bitstring, HashBitfilter, HashBitmask), - TimeMatches = time_matches(Bitstring, TimeBitfilter, TimeBitmask), - case HashMatches and TimeMatches of - true -> - Message = {Topic, _Payload} = unwrap_message_value(Value), - case emqx_topic:match(Topic, TopicFilter) of - true -> - Message; - false -> - next - end; - false -> - compute_next_seek(HashMatches, TimeMatches, Bitstring, Filter) - end. - -%% `Bitstring` is out of the hash space defined by `HashBitfilter`. -compute_next_seek( - _HashMatches = false, - _TimeMatches, - Bitstring, - Filter = #filter{ - keymapper = Keymapper, - hash_bitfilter = HashBitfilter, - hash_bitmask = HashBitmask, - time_bitfilter = TimeBitfilter, - time_bitmask = TimeBitmask - } -) -> - NextBitstring = compute_topic_seek(Bitstring, HashBitfilter, HashBitmask, Keymapper), - case NextBitstring of - none -> - none; - _ -> - TimeMatches = time_matches(NextBitstring, TimeBitfilter, TimeBitmask), - compute_next_seek(true, TimeMatches, NextBitstring, Filter) - end; -%% `Bitstring` is out of the time range defined by `TimeBitfilter`. -compute_next_seek( - _HashMatches = true, - _TimeMatches = false, - Bitstring, - #filter{ - time_bitfilter = TimeBitfilter, - time_bitmask = TimeBitmask - } -) -> - compute_time_seek(Bitstring, TimeBitfilter, TimeBitmask); -compute_next_seek(true, true, Bitstring, _It) -> - Bitstring. - -topic_hash_matches(Bitstring, HashBitfilter, HashBitmask) -> - (Bitstring band HashBitmask) == HashBitfilter. - -time_matches(Bitstring, TimeBitfilter, TimeBitmask) -> - (Bitstring band TimeBitmask) >= TimeBitfilter. - -compute_time_seek(Bitstring, TimeBitfilter, TimeBitmask) -> - % Replace the bits of the timestamp in `Bistring` with bits from `Timebitfilter`. - (Bitstring band (bnot TimeBitmask)) bor TimeBitfilter. - -%% Find the closest bitstring which is: -%% * greater than `Bitstring`, -%% * and falls into the hash space defined by `HashBitfilter`. -%% Note that the result can end up "back" in time and out of the time range. -compute_topic_seek(Bitstring, HashBitfilter, HashBitmask, Keymapper) -> - Sources = Keymapper#keymapper.source, - Size = Keymapper#keymapper.bitsize, - compute_topic_seek(Bitstring, HashBitfilter, HashBitmask, Sources, Size). - -compute_topic_seek(Bitstring, HashBitfilter, HashBitmask, Sources, Size) -> - % NOTE - % We're iterating through `Substring` here, in lockstep with `HashBitfilter` - % and `HashBitmask`, starting from least signigicant bits. Each bitsource in - % `Sources` has a bitsize `S` and, accordingly, gives us a sub-bitstring `S` - % bits long which we interpret as a "digit". There are 2 flavors of those - % "digits": - % * regular digit with 2^S possible values, - % * degenerate digit with exactly 1 possible value U (represented with 0). - % Our goal here is to find a successor of `Bistring` and perform a kind of - % digit-by-digit addition operation with carry propagation. - NextSeek = zipfoldr3( - fun(Source, Substring, Filter, LBitmask, Offset, Acc) -> - case Source of - {hash, _, S} when LBitmask =:= 0 -> - % Regular case - bitwise_add_digit(Substring, Acc, S, Offset); - {hash, _, _} when LBitmask =/= 0, Substring < Filter -> - % Degenerate case, I_digit < U, no overflow. - % Successor is `U bsl Offset` which is equivalent to 0. 
- 0; - {hash, _, S} when LBitmask =/= 0, Substring > Filter -> - % Degenerate case, I_digit > U, overflow. - % Successor is `(1 bsl Size + U) bsl Offset`. - overflow_digit(S, Offset); - {hash, _, S} when LBitmask =/= 0 -> - % Degenerate case, I_digit = U - % Perform digit addition with I_digit = 0, assuming "digit" has - % 0 bits of information (but is `S` bits long at the same time). - % This will overflow only if the result of previous iteration - % was an overflow. - bitwise_add_digit(0, Acc, 0, S, Offset); - {timestamp, _, S} -> - % Regular case - bitwise_add_digit(Substring, Acc, S, Offset) - end - end, - 0, - Bitstring, - HashBitfilter, - HashBitmask, - Size, - Sources - ), - case NextSeek bsr Size of - _Carry = 0 -> - % Found the successor. - % We need to recover values of those degenerate digits which we - % represented with 0 during digit-by-digit iteration. - NextSeek bor (HashBitfilter band HashBitmask); - _Carry = 1 -> - % We got "carried away" past the range, time to stop iteration. - none - end. - -bitwise_add_digit(Digit, Number, Width, Offset) -> - bitwise_add_digit(Digit, Number, Width, Width, Offset). - -%% Add "digit" (represented with integer `Digit`) to the `Number` assuming -%% this digit starts at `Offset` bits in `Number` and is `Width` bits long. -%% Perform an overflow if the result of addition would not fit into `Bits` -%% bits. -bitwise_add_digit(Digit, Number, Bits, Width, Offset) -> - Sum = (Digit bsl Offset) + Number, - case (Sum bsr Offset) < (1 bsl Bits) of - true -> Sum; - false -> overflow_digit(Width, Offset) - end. - -%% Constuct a number which denotes an overflow of digit that starts at -%% `Offset` bits and is `Width` bits long. -overflow_digit(Width, Offset) -> - (1 bsl Width) bsl Offset. - -%% Iterate through sub-bitstrings of 3 integers in lockstep, starting from least -%% significant bits first. -%% -%% Each integer is assumed to be `Size` bits long. Lengths of sub-bitstring are -%% specified in `Sources` list, in order from most significant bits to least -%% significant. Each iteration calls `FoldFun` with: -%% * bitsource that was used to extract sub-bitstrings, -%% * 3 sub-bitstrings in integer representation, -%% * bit offset into integers, -%% * current accumulator. --spec zipfoldr3(FoldFun, Acc, integer(), integer(), integer(), _Size :: bits(), [bitsource()]) -> - Acc -when - FoldFun :: fun((bitsource(), integer(), integer(), integer(), _Offset :: bits(), Acc) -> Acc). -zipfoldr3(_FoldFun, Acc, _, _, _, 0, []) -> - Acc; -zipfoldr3(FoldFun, Acc, I1, I2, I3, Offset, [Source = {_, _, S} | Rest]) -> - OffsetNext = Offset - S, - AccNext = zipfoldr3(FoldFun, Acc, I1, I2, I3, OffsetNext, Rest), - FoldFun( - Source, - substring(I1, OffsetNext, S), - substring(I2, OffsetNext, S), - substring(I3, OffsetNext, S), - OffsetNext, - AccNext - ). - -substring(I, Offset, Size) -> - (I bsr Offset) band ones(Size). - -%% @doc Generate a column family ID for the MQTT messages --spec data_cf(emqx_ds_storage_layer:gen_id()) -> [char()]. -data_cf(GenId) -> - ?MODULE_STRING ++ integer_to_list(GenId). - -make_refresh_counter({every, N}) when is_integer(N), N > 0 -> - {0, N}; -make_refresh_counter(undefined) -> - undefined. - -maybe_refresh_iterator(It = #it{refresh_counter = {N, N}}) -> - refresh_iterator(It#it{refresh_counter = {0, N}}); -maybe_refresh_iterator(It = #it{refresh_counter = {M, N}}) -> - It#it{refresh_counter = {M + 1, N}}; -maybe_refresh_iterator(It = #it{refresh_counter = undefined}) -> - It. 
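
The refresh counter threaded through `maybe_refresh_iterator/1` above is a plain `{Current, Period}` pair; an equivalent standalone sketch of the tick logic (the function name is hypothetical):

    %% Fires a refresh on every Period-th operation; `undefined`
    %% disables refreshing altogether.
    tick({N, N}) -> {refresh, {0, N}};
    tick({M, N}) when M < N -> {keep, {M + 1, N}};
    tick(undefined) -> {keep, undefined}.
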
-
-stop_iteration(It) ->
-    ok = rocksdb:iterator_close(It#it.handle),
-    none.
diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl
new file mode 100644
index 000000000..d61dfa906
--- /dev/null
+++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl
@@ -0,0 +1,217 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+%% @doc Replication layer for DS backends that don't support
+%% replication on their own.
+-module(emqx_ds_replication_layer).
+
+-export([
+    list_shards/1,
+    open_db/2,
+    drop_db/1,
+    store_batch/3,
+    get_streams/3,
+    make_iterator/3,
+    next/2
+]).
+
+%% internal exports:
+-export([
+    do_open_shard_v1/2,
+    do_drop_shard_v1/1,
+    do_get_streams_v1/3,
+    do_make_iterator_v1/4,
+    do_next_v1/3
+]).
+
+-export_type([shard_id/0, stream/0, iterator/0, message_id/0]).
+
+%%================================================================================
+%% Type declarations
+%%================================================================================
+
+-type db() :: emqx_ds:db().
+
+-type shard_id() :: {db(), atom()}.
+
+%% This record encapsulates the stream entity from the replication
+%% level.
+%%
+%% TODO: currently the stream is hardwired to only support the
+%% internal rocksdb storage. In the future we want to add other
+%% implementations for emqx_ds, so this type has to take this into
+%% account.
+-record(stream, {
+    shard :: emqx_ds_replication_layer:shard_id(),
+    enc :: emqx_ds_storage_layer:stream()
+}).
+
+-opaque stream() :: #stream{}.
+
+-record(iterator, {
+    shard :: emqx_ds_replication_layer:shard_id(),
+    enc :: emqx_ds_storage_layer:iterator()
+}).
+
+-opaque iterator() :: #iterator{}.
+
+-type message_id() :: emqx_ds_storage_layer:message_id().
+
+%%================================================================================
+%% API functions
+%%================================================================================
+
+-spec list_shards(db()) -> [shard_id()].
+list_shards(DB) ->
+    %% TODO: milestone 5
+    lists:map(
+        fun(Node) ->
+            shard_id(DB, Node)
+        end,
+        list_nodes()
+    ).
+
+-spec open_db(db(), emqx_ds:create_db_opts()) -> ok | {error, _}.
+open_db(DB, Opts) ->
+    %% TODO: improve error reporting, don't just crash
+    lists:foreach(
+        fun(Node) ->
+            Shard = shard_id(DB, Node),
+            ok = emqx_ds_proto_v1:open_shard(Node, Shard, Opts)
+        end,
+        list_nodes()
+    ).
+
+-spec drop_db(db()) -> ok | {error, _}.
+drop_db(DB) ->
+    lists:foreach(
+        fun(Node) ->
+            Shard = shard_id(DB, Node),
+            ok = emqx_ds_proto_v1:drop_shard(Node, Shard)
+        end,
+        list_nodes()
+    ).
+
+-spec store_batch(db(), [emqx_types:message()], emqx_ds:message_store_opts()) ->
+    emqx_ds:store_batch_result().
+store_batch(DB, Msg, Opts) -> + %% TODO: Currently we store messages locally. + Shard = shard_id(DB, node()), + emqx_ds_storage_layer:store_batch(Shard, Msg, Opts). + +-spec get_streams(db(), emqx_ds:topic_filter(), emqx_ds:time()) -> + [{emqx_ds:stream_rank(), stream()}]. +get_streams(DB, TopicFilter, StartTime) -> + Shards = list_shards(DB), + lists:flatmap( + fun(Shard) -> + Node = node_of_shard(Shard), + Streams = emqx_ds_proto_v1:get_streams(Node, Shard, TopicFilter, StartTime), + lists:map( + fun({RankY, Stream}) -> + RankX = Shard, + Rank = {RankX, RankY}, + {Rank, #stream{ + shard = Shard, + enc = Stream + }} + end, + Streams + ) + end, + Shards + ). + +-spec make_iterator(stream(), emqx_ds:topic_filter(), emqx_ds:time()) -> + emqx_ds:make_iterator_result(iterator()). +make_iterator(Stream, TopicFilter, StartTime) -> + #stream{shard = Shard, enc = StorageStream} = Stream, + Node = node_of_shard(Shard), + case emqx_ds_proto_v1:make_iterator(Node, Shard, StorageStream, TopicFilter, StartTime) of + {ok, Iter} -> + {ok, #iterator{shard = Shard, enc = Iter}}; + Err = {error, _} -> + Err + end. + +-spec next(iterator(), pos_integer()) -> emqx_ds:next_result(iterator()). +next(Iter0, BatchSize) -> + #iterator{shard = Shard, enc = StorageIter0} = Iter0, + Node = node_of_shard(Shard), + %% TODO: iterator can contain information that is useful for + %% reconstructing messages sent over the network. For example, + %% when we send messages with the learned topic index, we could + %% send the static part of topic once, and append it to the + %% messages on the receiving node, hence saving some network. + %% + %% This kind of trickery should be probably done here in the + %% replication layer. Or, perhaps, in the logic layer. + case emqx_ds_proto_v1:next(Node, Shard, StorageIter0, BatchSize) of + {ok, StorageIter, Batch} -> + Iter = #iterator{shard = Shard, enc = StorageIter}, + {ok, Iter, Batch}; + Other -> + Other + end. + +%%================================================================================ +%% behavior callbacks +%%================================================================================ + +%%================================================================================ +%% Internal exports (RPC targets) +%%================================================================================ + +-spec do_open_shard_v1(shard_id(), emqx_ds:create_db_opts()) -> ok. +do_open_shard_v1(Shard, Opts) -> + emqx_ds_storage_layer:open_shard(Shard, Opts). + +-spec do_drop_shard_v1(shard_id()) -> ok. +do_drop_shard_v1(Shard) -> + emqx_ds_storage_layer:drop_shard(Shard). + +-spec do_get_streams_v1(shard_id(), emqx_ds:topic_filter(), emqx_ds:time()) -> + [{integer(), _Stream}]. +do_get_streams_v1(Shard, TopicFilter, StartTime) -> + emqx_ds_storage_layer:get_streams(Shard, TopicFilter, StartTime). + +-spec do_make_iterator_v1( + shard_id(), emqx_ds_storage_layer:stream(), emqx_ds:topic_filter(), emqx_ds:time() +) -> + {ok, iterator()} | {error, _}. +do_make_iterator_v1(Shard, Stream, TopicFilter, StartTime) -> + emqx_ds_storage_layer:make_iterator(Shard, Stream, TopicFilter, StartTime). + +-spec do_next_v1(shard_id(), emqx_ds_storage_layer:iterator(), pos_integer()) -> + emqx_ds:next_result(emqx_ds_storage_layer:iterator()). +do_next_v1(Shard, Iter, BatchSize) -> + emqx_ds_storage_layer:next(Shard, Iter, BatchSize). 
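
Taken together, the RPC targets above give the read path its shape: list the streams, wrap each in an iterator, then page through batches. A hedged usage sketch (the DB handle and topic filter are illustrative; `end_of_stream` and error handling are elided):

    read_batches(DB) ->
        TopicFilter = [<<"t">>, '#'],
        StartTime = 0,
        Streams = emqx_ds_replication_layer:get_streams(DB, TopicFilter, StartTime),
        lists:flatmap(
            fun({_Rank, Stream}) ->
                {ok, It} = emqx_ds_replication_layer:make_iterator(Stream, TopicFilter, StartTime),
                {ok, _It1, Batch} = emqx_ds_replication_layer:next(It, 100),
                Batch
            end,
            Streams
        ).
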
+ +%%================================================================================ +%% Internal functions +%%================================================================================ + +shard_id(DB, Node) -> + %% TODO: don't bake node name into the schema, don't repeat the + %% Mnesia's 1M$ mistake. + {DB, Node}. + +-spec node_of_shard(shard_id()) -> node(). +node_of_shard({_DB, Node}) -> + Node. + +list_nodes() -> + mria:running_nodes(). diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl new file mode 100644 index 000000000..d57d8013c --- /dev/null +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl @@ -0,0 +1,418 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +%% @doc A storage layout based on learned topic structure and using +%% bitfield mapping for the varying topic layers. +-module(emqx_ds_storage_bitfield_lts). + +-behaviour(emqx_ds_storage_layer). + +%% API: +-export([]). + +%% behavior callbacks: +-export([create/4, open/5, store_batch/4, get_streams/4, make_iterator/5, next/4]). + +%% internal exports: +-export([format_key/2]). + +-export_type([options/0]). + +-include_lib("emqx_utils/include/emqx_message.hrl"). +-include_lib("snabbkaffe/include/trace.hrl"). + +%%================================================================================ +%% Type declarations +%%================================================================================ + +-type options() :: + #{ + bits_per_wildcard_level => pos_integer(), + topic_index_bytes => pos_integer(), + epoch_bits => non_neg_integer() + }. + +%% Permanent state: +-type schema() :: + #{ + bits_per_wildcard_level := pos_integer(), + topic_index_bytes := pos_integer(), + ts_bits := non_neg_integer(), + ts_offset_bits := non_neg_integer() + }. + +%% Runtime state: +-record(s, { + db :: rocksdb:db_handle(), + data :: rocksdb:cf_handle(), + trie :: emqx_ds_lts:trie(), + keymappers :: array:array(emqx_ds_bitmask_keymapper:keymapper()), + ts_offset :: non_neg_integer() +}). + +-type s() :: #s{}. + +-record(stream, { + storage_key :: emqx_ds_lts:msg_storage_key() +}). + +-record(it, { + topic_filter :: emqx_ds:topic_filter(), + start_time :: emqx_ds:time(), + storage_key :: emqx_ds_lts:msg_storage_key(), + last_seen_key = <<>> :: binary() +}). + +-type iterator() :: #it{}. + +-define(COUNTER, emqx_ds_storage_bitfield_lts_counter). + +%% Limit on the number of wildcard levels in the learned topic trie: +-define(WILDCARD_LIMIT, 10). + +-include("emqx_ds_bitmask.hrl"). 
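
Before the layout implementation below, a hedged example of an `options()` map for this module; the values shown are the defaults that `create/4` falls back to:

    Options = #{
        %% Each varying (wildcard) topic level is hashed into a 64-bit field:
        bits_per_wildcard_level => 64,
        %% The learned-trie static index occupies 4 bytes of the key:
        topic_index_bytes => 4,
        %% 10 offset bits ~ 1024 ms epochs with millisecond timestamps:
        epoch_bits => 10
    }.
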
+ +%%================================================================================ +%% API funcions +%%================================================================================ + +%%================================================================================ +%% behavior callbacks +%%================================================================================ + +-spec create( + emqx_ds_replication_layer:shard_id(), + rocksdb:db_handle(), + emqx_ds_storage_layer:gen_id(), + options() +) -> + {schema(), emqx_ds_storage_layer:cf_refs()}. +create(_ShardId, DBHandle, GenId, Options) -> + %% Get options: + BitsPerTopicLevel = maps:get(bits_per_wildcard_level, Options, 64), + TopicIndexBytes = maps:get(topic_index_bytes, Options, 4), + %% 10 bits -> 1024 ms -> ~1 sec + TSOffsetBits = maps:get(epoch_bits, Options, 10), + %% Create column families: + DataCFName = data_cf(GenId), + TrieCFName = trie_cf(GenId), + {ok, DataCFHandle} = rocksdb:create_column_family(DBHandle, DataCFName, []), + {ok, TrieCFHandle} = rocksdb:create_column_family(DBHandle, TrieCFName, []), + %% Create schema: + Schema = #{ + bits_per_wildcard_level => BitsPerTopicLevel, + topic_index_bytes => TopicIndexBytes, + ts_bits => 64, + ts_offset_bits => TSOffsetBits + }, + {Schema, [{DataCFName, DataCFHandle}, {TrieCFName, TrieCFHandle}]}. + +-spec open( + emqx_ds_replication_layer:shard_id(), + rocksdb:db_handle(), + emqx_ds_storage_layer:gen_id(), + emqx_ds_storage_layer:cf_refs(), + schema() +) -> + s(). +open(_Shard, DBHandle, GenId, CFRefs, Schema) -> + #{ + bits_per_wildcard_level := BitsPerTopicLevel, + topic_index_bytes := TopicIndexBytes, + ts_bits := TSBits, + ts_offset_bits := TSOffsetBits + } = Schema, + {_, DataCF} = lists:keyfind(data_cf(GenId), 1, CFRefs), + {_, TrieCF} = lists:keyfind(trie_cf(GenId), 1, CFRefs), + Trie = restore_trie(TopicIndexBytes, DBHandle, TrieCF), + %% If user's topics have more than learned 10 wildcard levels + %% (more than 2, really), then it's total carnage; learned topic + %% structure won't help. + MaxWildcardLevels = ?WILDCARD_LIMIT, + KeymapperCache = array:from_list( + [ + make_keymapper(TopicIndexBytes, BitsPerTopicLevel, TSBits, TSOffsetBits, N) + || N <- lists:seq(0, MaxWildcardLevels) + ] + ), + #s{ + db = DBHandle, + data = DataCF, + trie = Trie, + keymappers = KeymapperCache, + ts_offset = TSOffsetBits + }. + +-spec store_batch( + emqx_ds_replication_layer:shard_id(), s(), [emqx_types:message()], emqx_ds:message_store_opts() +) -> + emqx_ds:store_batch_result(). +store_batch(_ShardId, S = #s{db = DB, data = Data}, Messages, _Options) -> + lists:foreach( + fun(Msg) -> + {Key, _} = make_key(S, Msg), + Val = serialize(Msg), + rocksdb:put(DB, Data, Key, Val, []) + end, + Messages + ). + +get_streams(_Shard, #s{trie = Trie}, TopicFilter, _StartTime) -> + Indexes = emqx_ds_lts:match_topics(Trie, TopicFilter), + [#stream{storage_key = I} || I <- Indexes]. + +make_iterator(_Shard, _Data, #stream{storage_key = StorageKey}, TopicFilter, StartTime) -> + %% Note: it's a good idea to keep the iterator structure lean, + %% since it can be stored on a remote node that could update its + %% code independently from us. + {ok, #it{ + topic_filter = TopicFilter, + start_time = StartTime, + storage_key = StorageKey + }}. + +next(_Shard, Schema = #s{ts_offset = TSOffset}, It, BatchSize) -> + %% Compute safe cutoff time. + %% It's the point in time where the last complete epoch ends, so we need to know + %% the current time to compute it. 
+ Now = emqx_message:timestamp_now(), + SafeCutoffTime = (Now bsr TSOffset) bsl TSOffset, + next_until(Schema, It, SafeCutoffTime, BatchSize). + +next_until(_Schema, It, SafeCutoffTime, _BatchSize) when It#it.start_time >= SafeCutoffTime -> + %% We're in the middle of the current epoch, so we can't yet iterate over it. + %% It would be unsafe otherwise: messages can be stored in the current epoch + %% concurrently with iterating over it. They can end up earlier (in the iteration + %% order) due to the nature of keymapping, potentially causing us to miss them. + {ok, It, []}; +next_until(#s{db = DB, data = CF, keymappers = Keymappers}, It, SafeCutoffTime, BatchSize) -> + #it{ + start_time = StartTime, + storage_key = {TopicIndex, Varying} + } = It, + %% Make filter: + Inequations = [ + {'=', TopicIndex}, + {StartTime, '..', SafeCutoffTime - 1}, + %% Unique integer: + any + %% Varying topic levels: + | lists:map( + fun + ('+') -> + any; + (TopicLevel) when is_binary(TopicLevel) -> + {'=', hash_topic_level(TopicLevel)} + end, + Varying + ) + ], + %% Obtain a keymapper for the current number of varying levels. + NVarying = length(Varying), + %% Assert: + NVarying =< ?WILDCARD_LIMIT orelse + error({too_many_varying_topic_levels, NVarying}), + Keymapper = array:get(NVarying, Keymappers), + Filter = + #filter{range_min = LowerBound, range_max = UpperBound} = emqx_ds_bitmask_keymapper:make_filter( + Keymapper, Inequations + ), + {ok, ITHandle} = rocksdb:iterator(DB, CF, [ + {iterate_lower_bound, emqx_ds_bitmask_keymapper:key_to_bitstring(Keymapper, LowerBound)}, + {iterate_upper_bound, emqx_ds_bitmask_keymapper:key_to_bitstring(Keymapper, UpperBound + 1)} + ]), + try + put(?COUNTER, 0), + next_loop(ITHandle, Keymapper, Filter, SafeCutoffTime, It, [], BatchSize) + after + rocksdb:iterator_close(ITHandle), + erase(?COUNTER) + end. + +%%================================================================================ +%% Internal functions +%%================================================================================ + +next_loop(_ITHandle, _KeyMapper, _Filter, _Cutoff, It, Acc, 0) -> + {ok, It, lists:reverse(Acc)}; +next_loop(ITHandle, KeyMapper, Filter, Cutoff, It0, Acc0, N0) -> + inc_counter(), + #it{last_seen_key = Key0} = It0, + case emqx_ds_bitmask_keymapper:bin_increment(Filter, Key0) of + overflow -> + {ok, It0, lists:reverse(Acc0)}; + Key1 -> + %% assert + true = Key1 > Key0, + case rocksdb:iterator_move(ITHandle, {seek, Key1}) of + {ok, Key, Val} -> + {N, It, Acc} = + traverse_interval(ITHandle, Filter, Cutoff, Key, Val, It0, Acc0, N0), + next_loop(ITHandle, KeyMapper, Filter, Cutoff, It, Acc, N); + {error, invalid_iterator} -> + {ok, It0, lists:reverse(Acc0)} + end + end. + +traverse_interval(ITHandle, Filter, Cutoff, Key, Val, It0, Acc0, N) -> + It = It0#it{last_seen_key = Key}, + case emqx_ds_bitmask_keymapper:bin_checkmask(Filter, Key) of + true -> + Msg = deserialize(Val), + case check_message(Cutoff, It, Msg) of + true -> + Acc = [Msg | Acc0], + traverse_interval(ITHandle, Filter, Cutoff, It, Acc, N - 1); + false -> + traverse_interval(ITHandle, Filter, Cutoff, It, Acc0, N); + overflow -> + {0, It0, Acc0} + end; + false -> + {N, It, Acc0} + end. 
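
A worked instance of the safe-cutoff arithmetic from `next/4` above (illustrative timestamp; 10 offset bits as per the default `epoch_bits`):

    Now = 5000000,
    TSOffset = 10,
    SafeCutoffTime = (Now bsr TSOffset) bsl TSOffset,
    %% SafeCutoffTime =:= 4999168: replay stops at the boundary of the
    %% still-incomplete epoch [4999168, 4999168 + 1024), which may be
    %% receiving concurrent writes.
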
+
+traverse_interval(_ITHandle, _Filter, _Cutoff, It, Acc, 0) ->
+    {0, It, Acc};
+traverse_interval(ITHandle, Filter, Cutoff, It, Acc, N) ->
+    inc_counter(),
+    case rocksdb:iterator_move(ITHandle, next) of
+        {ok, Key, Val} ->
+            traverse_interval(ITHandle, Filter, Cutoff, Key, Val, It, Acc, N);
+        {error, invalid_iterator} ->
+            {0, It, Acc}
+    end.
+
+-spec check_message(emqx_ds:time(), iterator(), emqx_types:message()) ->
+    true | false | overflow.
+check_message(
+    Cutoff,
+    _It,
+    #message{timestamp = Timestamp}
+) when Timestamp >= Cutoff ->
+    %% We hit the current epoch, we can't continue iterating over it yet.
+    %% It would be unsafe otherwise: messages can be stored in the current epoch
+    %% concurrently with iterating over it. They can end up earlier (in the iteration
+    %% order) due to the nature of keymapping, potentially causing us to miss them.
+    overflow;
+check_message(
+    _Cutoff,
+    #it{start_time = StartTime, topic_filter = TopicFilter},
+    #message{timestamp = Timestamp, topic = Topic}
+) when Timestamp >= StartTime ->
+    emqx_topic:match(emqx_topic:words(Topic), TopicFilter);
+check_message(_Cutoff, _It, _Msg) ->
+    false.
+
+format_key(KeyMapper, Key) ->
+    Vec = [integer_to_list(I, 16) || I <- emqx_ds_bitmask_keymapper:key_to_vector(KeyMapper, Key)],
+    lists:flatten(io_lib:format("~.16B (~s)", [Key, string:join(Vec, ",")])).
+
+-spec make_key(s(), emqx_types:message()) -> {binary(), [binary()]}.
+make_key(#s{keymappers = KeyMappers, trie = Trie}, #message{timestamp = Timestamp, topic = TopicBin}) ->
+    Tokens = emqx_topic:tokens(TopicBin),
+    {TopicIndex, Varying} = emqx_ds_lts:topic_key(Trie, fun threshold_fun/1, Tokens),
+    VaryingHashes = [hash_topic_level(I) || I <- Varying],
+    KeyMapper = array:get(length(Varying), KeyMappers),
+    KeyBin = make_key(KeyMapper, TopicIndex, Timestamp, VaryingHashes),
+    {KeyBin, Varying}.
+
+-spec make_key(emqx_ds_bitmask_keymapper:keymapper(), emqx_ds_lts:static_key(), emqx_ds:time(), [
+    non_neg_integer()
+]) ->
+    binary().
+make_key(KeyMapper, TopicIndex, Timestamp, Varying) ->
+    UniqueInteger = erlang:unique_integer([monotonic, positive]),
+    emqx_ds_bitmask_keymapper:key_to_bitstring(
+        KeyMapper,
+        emqx_ds_bitmask_keymapper:vector_to_key(KeyMapper, [
+            TopicIndex, Timestamp, UniqueInteger | Varying
+        ])
+    ).
+
+%% TODO: don't hardcode the thresholds
+threshold_fun(0) ->
+    100;
+threshold_fun(_) ->
+    20.
+
+hash_topic_level(TopicLevel) ->
+    <<Int:64, _/binary>> = erlang:md5(TopicLevel),
+    Int.
+
+serialize(Msg) ->
+    term_to_binary(Msg).
+
+deserialize(Blob) ->
+    binary_to_term(Blob).
+
+-define(BYTE_SIZE, 8).
+
+%% erlfmt-ignore
+make_keymapper(TopicIndexBytes, BitsPerTopicLevel, TSBits, TSOffsetBits, N) ->
+    Bitsources =
+    %%  Dimension   Offset          Bitsize
+        [{1,        0,              TopicIndexBytes * ?BYTE_SIZE},  %% Topic index
+         {2,        TSOffsetBits,   TSBits - TSOffsetBits       }] ++ %% Timestamp epoch
+        [{3 + I,    0,              BitsPerTopicLevel           }  %% Varying topic levels
+         || I <- lists:seq(1, N)] ++
+        [{2,        0,              TSOffsetBits                },  %% Timestamp offset
+         {3,        0,              64                          }], %% Unique integer
+    Keymapper = emqx_ds_bitmask_keymapper:make_keymapper(lists:reverse(Bitsources)),
+    %% Assert:
+    case emqx_ds_bitmask_keymapper:bitsize(Keymapper) rem 8 of
+        0 ->
+            ok;
+        _ ->
+            error(#{'$msg' => "Non-even key size", bitsources => Bitsources})
+    end,
+    Keymapper.
+
+-spec restore_trie(pos_integer(), rocksdb:db_handle(), rocksdb:cf_handle()) -> emqx_ds_lts:trie().
+restore_trie(TopicIndexBytes, DB, CF) -> + PersistCallback = fun(Key, Val) -> + rocksdb:put(DB, CF, term_to_binary(Key), term_to_binary(Val), []) + end, + {ok, IT} = rocksdb:iterator(DB, CF, []), + try + Dump = read_persisted_trie(IT, rocksdb:iterator_move(IT, first)), + TrieOpts = #{persist_callback => PersistCallback, static_key_size => TopicIndexBytes}, + emqx_ds_lts:trie_restore(TrieOpts, Dump) + after + rocksdb:iterator_close(IT) + end. + +read_persisted_trie(IT, {ok, KeyB, ValB}) -> + [ + {binary_to_term(KeyB), binary_to_term(ValB)} + | read_persisted_trie(IT, rocksdb:iterator_move(IT, next)) + ]; +read_persisted_trie(_IT, {error, invalid_iterator}) -> + []. + +inc_counter() -> + N = get(?COUNTER), + put(?COUNTER, N + 1). + +%% @doc Generate a column family ID for the MQTT messages +-spec data_cf(emqx_ds_storage_layer:gen_id()) -> [char()]. +data_cf(GenId) -> + "emqx_ds_storage_bitfield_lts_data" ++ integer_to_list(GenId). + +%% @doc Generate a column family ID for the trie +-spec trie_cf(emqx_ds_storage_layer:gen_id()) -> [char()]. +trie_cf(GenId) -> + "emqx_ds_storage_bitfield_lts_trie" ++ integer_to_list(GenId). diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index 6137a1ed7..57af33d61 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -1,277 +1,255 @@ %%-------------------------------------------------------------------- -%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. %%-------------------------------------------------------------------- -module(emqx_ds_storage_layer). -behaviour(gen_server). -%% API: --export([start_link/2]). --export([create_generation/3]). +%% Replication layer API: +-export([open_shard/2, drop_shard/1, store_batch/3, get_streams/3, make_iterator/4, next/3]). --export([store/5]). --export([delete/4]). +%% gen_server +-export([start_link/2, init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]). --export([make_iterator/2, next/1]). +%% internal exports: +-export([db_dir/1]). --export([ - preserve_iterator/2, - restore_iterator/2, - discard_iterator/2, - ensure_iterator/3, - discard_iterator_prefix/2, - list_iterator_prefix/2, - foldl_iterator_prefix/4 -]). +-export_type([gen_id/0, generation/0, cf_refs/0, stream/0, iterator/0]). -%% behaviour callbacks: --export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]). - --export_type([cf_refs/0, gen_id/0, options/0, state/0, iterator/0]). --export_type([db_options/0, db_write_options/0, db_read_options/0]). - --compile({inline, [meta_lookup/2]}). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). 
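
The export list above is the storage-layer mirror of the replication-layer API. A hedged sketch of exercising a shard directly, assuming a shard id of the `{DB, Node}` shape used by `emqx_ds_replication_layer` and a list `Messages` of `#message{}` records:

    Shard = {my_db, node()},  %% illustrative shard id
    ok = emqx_ds_storage_layer:open_shard(Shard, #{storage => {emqx_ds_storage_bitfield_lts, #{}}}),
    ok = emqx_ds_storage_layer:store_batch(Shard, Messages, #{}),
    [{_Rank, Stream} | _] = emqx_ds_storage_layer:get_streams(Shard, ['#'], 0),
    {ok, It} = emqx_ds_storage_layer:make_iterator(Shard, Stream, ['#'], 0),
    {ok, _It1, Batch} = emqx_ds_storage_layer:next(Shard, It, 100).
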
 %%================================================================================
 %% Type declarations
 %%================================================================================
 
--type options() :: #{
-    dir => file:filename()
-}.
+-type prototype() ::
+    {emqx_ds_storage_reference, emqx_ds_storage_reference:options()}
+    | {emqx_ds_storage_bitfield_lts, emqx_ds_storage_bitfield_lts:options()}.
 
-%% see rocksdb:db_options()
--type db_options() :: proplists:proplist().
-%% see rocksdb:write_options()
--type db_write_options() :: proplists:proplist().
-%% see rocksdb:read_options()
--type db_read_options() :: proplists:proplist().
+-type shard_id() :: emqx_ds_replication_layer:shard_id().
 
 -type cf_refs() :: [{string(), rocksdb:cf_handle()}].
 
-%% Message storage generation
-%% Keep in mind that instances of this type are persisted in long-term storage.
--type generation() :: #{
-    %% Module that handles data for the generation
+-type gen_id() :: 0..16#ffff.
+
+%% Note: this record might be stored permanently on a remote node.
+-record(stream, {
+    generation :: gen_id(),
+    enc :: _EncapsulatedData,
+    misc = #{} :: map()
+}).
+
+-opaque stream() :: #stream{}.
+
+%% Note: this record might be stored permanently on a remote node.
+-record(it, {
+    generation :: gen_id(),
+    enc :: _EncapsulatedData,
+    misc = #{} :: map()
+}).
+
+-opaque iterator() :: #it{}.
+
+%%%% Generation:
+
+-type generation(Data) :: #{
+    %% Module that handles data for the generation:
     module := module(),
-    %% Module-specific data defined at generation creation time
-    data := term(),
+    %% Module-specific data defined at generation creation time:
+    data := Data,
     %% When should this generation become active?
     %% This generation should only contain messages timestamped no earlier than that.
     %% The very first generation will have `since` equal 0.
-    since := emqx_ds:time()
+    since := emqx_ds:time(),
+    until := emqx_ds:time() | undefined
 }.
 
+%% Schema for a generation. Persistent term.
+-type generation_schema() :: generation(term()).
+
+%% Runtime view of generation:
+-type generation() :: generation(term()).
+
+%%%% Shard:
+
+-type shard(GenData) :: #{
+    %% ID of the current generation (where the new data is written):
+    current_generation := gen_id(),
+    %% This data is used to create new generation:
+    prototype := prototype(),
+    %% Generations:
+    {generation, gen_id()} => GenData
+}.
+
+%% Shard schema (persistent):
+-type shard_schema() :: shard(generation_schema()).
+
+%% Shard (runtime):
+-type shard() :: shard(generation()).
+
+%%================================================================================
+%% Generation callbacks
+%%================================================================================
+
+%% Create the new schema given generation id and the options.
+%% Create rocksdb column families.
+-callback create(shard_id(), rocksdb:db_handle(), gen_id(), _Options) ->
+    {_Schema, cf_refs()}.
+
+%% Open the existing schema
+-callback open(shard_id(), rocksdb:db_handle(), gen_id(), cf_refs(), _Schema) ->
+    _Data.
+
+-callback store_batch(shard_id(), _Data, [emqx_types:message()], emqx_ds:message_store_opts()) ->
+    emqx_ds:store_batch_result().
+
+-callback get_streams(shard_id(), _Data, emqx_ds:topic_filter(), emqx_ds:time()) ->
+    [_Stream].
+
+-callback make_iterator(shard_id(), _Data, _Stream, emqx_ds:topic_filter(), emqx_ds:time()) ->
+    emqx_ds:make_iterator_result(_Iterator).
+
+-callback next(shard_id(), _Data, Iter, pos_integer()) ->
+    {ok, Iter, [emqx_types:message()]} | {error, _}.
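
A minimal sketch of a module implementing the callbacks above (the module name and the no-op storage scheme are hypothetical; a real layout would persist data in the column families it creates):

    -module(emqx_ds_storage_noop_example).
    -behaviour(emqx_ds_storage_layer).
    -export([create/4, open/5, store_batch/4, get_streams/4, make_iterator/5, next/4]).

    create(_Shard, _DB, _GenId, _Options) -> {#{}, []}.
    open(_Shard, _DB, _GenId, _CFRefs, Schema) -> Schema.
    store_batch(_Shard, _Data, _Messages, _Options) -> ok.
    get_streams(_Shard, _Data, _TopicFilter, _StartTime) -> [].
    make_iterator(_Shard, _Data, Stream, _TopicFilter, _StartTime) -> {ok, Stream}.
    next(_Shard, _Data, It, _BatchSize) -> {ok, It, []}.
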
+ +%%================================================================================ +%% API for the replication layer +%%================================================================================ + +-spec open_shard(shard_id(), emqx_ds:builtin_db_opts()) -> ok. +open_shard(Shard, Options) -> + emqx_ds_storage_layer_sup:ensure_shard(Shard, Options). + +-spec drop_shard(shard_id()) -> ok. +drop_shard(Shard) -> + catch emqx_ds_storage_layer_sup:stop_shard(Shard), + ok = rocksdb:destroy(db_dir(Shard), []). + +-spec store_batch(shard_id(), [emqx_types:message()], emqx_ds:message_store_opts()) -> + emqx_ds:store_batch_result(). +store_batch(Shard, Messages, Options) -> + %% We always store messages in the current generation: + GenId = generation_current(Shard), + #{module := Mod, data := GenData} = generation_get(Shard, GenId), + Mod:store_batch(Shard, GenData, Messages, Options). + +-spec get_streams(shard_id(), emqx_ds:topic_filter(), emqx_ds:time()) -> + [{integer(), stream()}]. +get_streams(Shard, TopicFilter, StartTime) -> + Gens = generations_since(Shard, StartTime), + lists:flatmap( + fun(GenId) -> + #{module := Mod, data := GenData} = generation_get(Shard, GenId), + Streams = Mod:get_streams(Shard, GenData, TopicFilter, StartTime), + [ + {GenId, #stream{ + generation = GenId, + enc = Stream + }} + || Stream <- Streams + ] + end, + Gens + ). + +-spec make_iterator(shard_id(), stream(), emqx_ds:topic_filter(), emqx_ds:time()) -> + emqx_ds:make_iterator_result(iterator()). +make_iterator(Shard, #stream{generation = GenId, enc = Stream}, TopicFilter, StartTime) -> + #{module := Mod, data := GenData} = generation_get(Shard, GenId), + case Mod:make_iterator(Shard, GenData, Stream, TopicFilter, StartTime) of + {ok, Iter} -> + {ok, #it{ + generation = GenId, + enc = Iter + }}; + {error, _} = Err -> + Err + end. + +-spec next(shard_id(), iterator(), pos_integer()) -> + emqx_ds:next_result(iterator()). +next(Shard, Iter = #it{generation = GenId, enc = GenIter0}, BatchSize) -> + #{module := Mod, data := GenData} = generation_get(Shard, GenId), + Current = generation_current(Shard), + case Mod:next(Shard, GenData, GenIter0, BatchSize) of + {ok, _GenIter, []} when GenId < Current -> + %% This is a past generation. Storage layer won't write + %% any more messages here. The iterator reached the end: + %% the stream has been fully replayed. + {ok, end_of_stream}; + {ok, GenIter, Batch} -> + {ok, Iter#it{enc = GenIter}, Batch}; + Error = {error, _} -> + Error + end. + +%%================================================================================ +%% gen_server for the shard +%%================================================================================ + +-define(REF(ShardId), {via, gproc, {n, l, {?MODULE, ShardId}}}). + +-spec start_link(shard_id(), emqx_ds:builtin_db_opts()) -> + {ok, pid()}. +start_link(Shard, Options) -> + gen_server:start_link(?REF(Shard), ?MODULE, {Shard, Options}, []). + -record(s, { - shard :: emqx_ds:shard(), + shard_id :: emqx_ds:shard_id(), db :: rocksdb:db_handle(), - cf_iterator :: rocksdb:cf_handle(), - cf_generations :: cf_refs() + cf_refs :: cf_refs(), + schema :: shard_schema(), + shard :: shard() }). --record(it, { - shard :: emqx_ds:shard(), - gen :: gen_id(), - replay :: emqx_ds:replay(), - module :: module(), - data :: term() -}). +%% Note: we specify gen_server requests as records to make use of Dialyzer: +-record(call_create_generation, {since :: emqx_ds:time()}). --type gen_id() :: 0..16#ffff. - --opaque state() :: #s{}. 
--opaque iterator() :: #it{}. - -%% Contents of the default column family: -%% -%% [{<<"genNN">>, #generation{}}, ..., -%% {<<"current">>, GenID}] +-type server_state() :: #s{}. -define(DEFAULT_CF, "default"). -define(DEFAULT_CF_OPTS, []). --define(ITERATOR_CF, "$iterators"). - -%% TODO -%% 1. CuckooTable might be of use here / `OptimizeForPointLookup(...)`. -%% 2. Supposedly might be compressed _very_ effectively. -%% 3. `inplace_update_support`? --define(ITERATOR_CF_OPTS, []). - --define(REF(Keyspace, ShardId), {via, gproc, {n, l, {?MODULE, Keyspace, ShardId}}}). - -%%================================================================================ -%% Callbacks -%%================================================================================ - --callback create_new(rocksdb:db_handle(), gen_id(), _Options :: term()) -> - {_Schema, cf_refs()}. - --callback open( - emqx_ds:shard(), - rocksdb:db_handle(), - gen_id(), - cf_refs(), - _Schema -) -> - term(). - --callback store( - _Schema, - _MessageID :: binary(), - emqx_ds:time(), - emqx_ds:topic(), - _Payload :: binary() -) -> - ok | {error, _}. - --callback delete(_Schema, _MessageID :: binary(), emqx_ds:time(), emqx_ds:topic()) -> - ok | {error, _}. - --callback make_iterator(_Schema, emqx_ds:replay()) -> - {ok, _It} | {error, _}. - --callback restore_iterator(_Schema, emqx_ds:replay(), binary()) -> {ok, _It} | {error, _}. - --callback preserve_iterator(_It) -> term(). - --callback next(It) -> {value, binary(), It} | none | {error, closed}. - -%%================================================================================ -%% API funcions -%%================================================================================ - --spec start_link(emqx_ds:shard(), emqx_ds_storage_layer:options()) -> - {ok, pid()}. -start_link(Shard = {Keyspace, ShardId}, Options) -> - gen_server:start_link(?REF(Keyspace, ShardId), ?MODULE, {Shard, Options}, []). - --spec create_generation( - emqx_ds:shard(), emqx_ds:time(), emqx_ds_conf:backend_config() -) -> - {ok, gen_id()} | {error, nonmonotonic}. -create_generation({Keyspace, ShardId}, Since, Config = {_Module, _Options}) -> - gen_server:call(?REF(Keyspace, ShardId), {create_generation, Since, Config}). - --spec store(emqx_ds:shard(), emqx_guid:guid(), emqx_ds:time(), emqx_ds:topic(), binary()) -> - ok | {error, _}. -store(Shard, GUID, Time, Topic, Msg) -> - {_GenId, #{module := Mod, data := Data}} = meta_lookup_gen(Shard, Time), - Mod:store(Data, GUID, Time, Topic, Msg). - --spec delete(emqx_ds:shard(), emqx_guid:guid(), emqx_ds:time(), emqx_ds:topic()) -> - ok | {error, _}. -delete(Shard, GUID, Time, Topic) -> - {_GenId, #{module := Mod, data := Data}} = meta_lookup_gen(Shard, Time), - Mod:delete(Data, GUID, Time, Topic). - --spec make_iterator(emqx_ds:shard(), emqx_ds:replay()) -> - {ok, iterator()} | {error, _TODO}. -make_iterator(Shard, Replay = {_, StartTime}) -> - {GenId, Gen} = meta_lookup_gen(Shard, StartTime), - open_iterator(Gen, #it{ - shard = Shard, - gen = GenId, - replay = Replay - }). - --spec next(iterator()) -> {value, binary(), iterator()} | none | {error, closed}. -next(It = #it{module = Mod, data = ItData}) -> - case Mod:next(ItData) of - {value, Val, ItDataNext} -> - {value, Val, It#it{data = ItDataNext}}; - {error, _} = Error -> - Error; - none -> - case open_next_iterator(It) of - {ok, ItNext} -> - next(ItNext); - {error, _} = Error -> - Error; - none -> - none - end - end. - --spec preserve_iterator(iterator(), emqx_ds:iterator_id()) -> - ok | {error, _TODO}. 
-preserve_iterator(It = #it{}, IteratorID) -> - iterator_put_state(IteratorID, It). - --spec restore_iterator(emqx_ds:shard(), emqx_ds:replay_id()) -> - {ok, iterator()} | {error, _TODO}. -restore_iterator(Shard, ReplayID) -> - case iterator_get_state(Shard, ReplayID) of - {ok, Serial} -> - restore_iterator_state(Shard, Serial); - not_found -> - {error, not_found}; - {error, _Reason} = Error -> - Error - end. - --spec ensure_iterator(emqx_ds:shard(), emqx_ds:iterator_id(), emqx_ds:replay()) -> - {ok, iterator()} | {error, _TODO}. -ensure_iterator(Shard, IteratorID, Replay = {_TopicFilter, _StartMS}) -> - case restore_iterator(Shard, IteratorID) of - {ok, It} -> - {ok, It}; - {error, not_found} -> - {ok, It} = make_iterator(Shard, Replay), - ok = emqx_ds_storage_layer:preserve_iterator(It, IteratorID), - {ok, It}; - Error -> - Error - end. - --spec discard_iterator(emqx_ds:shard(), emqx_ds:replay_id()) -> - ok | {error, _TODO}. -discard_iterator(Shard, ReplayID) -> - iterator_delete(Shard, ReplayID). - --spec discard_iterator_prefix(emqx_ds:shard(), binary()) -> - ok | {error, _TODO}. -discard_iterator_prefix(Shard, KeyPrefix) -> - case do_discard_iterator_prefix(Shard, KeyPrefix) of - {ok, _} -> ok; - Error -> Error - end. - --spec list_iterator_prefix( - emqx_ds:shard(), - binary() -) -> {ok, [emqx_ds:iterator_id()]} | {error, _TODO}. -list_iterator_prefix(Shard, KeyPrefix) -> - do_list_iterator_prefix(Shard, KeyPrefix). - --spec foldl_iterator_prefix( - emqx_ds:shard(), - binary(), - fun((_Key :: binary(), _Value :: binary(), Acc) -> Acc), - Acc -) -> {ok, Acc} | {error, _TODO} when - Acc :: term(). -foldl_iterator_prefix(Shard, KeyPrefix, Fn, Acc) -> - do_foldl_iterator_prefix(Shard, KeyPrefix, Fn, Acc). - -%%================================================================================ -%% behaviour callbacks -%%================================================================================ - -init({Shard, Options}) -> +init({ShardId, Options}) -> process_flag(trap_exit, true), - {ok, S0} = open_db(Shard, Options), - S = ensure_current_generation(S0), - ok = populate_metadata(S), + logger:set_process_metadata(#{shard_id => ShardId, domain => [ds, storage_layer, shard]}), + erase_schema_runtime(ShardId), + {ok, DB, CFRefs0} = rocksdb_open(ShardId, Options), + {Schema, CFRefs} = + case get_schema_persistent(DB) of + not_found -> + Prototype = maps:get(storage, Options), + create_new_shard_schema(ShardId, DB, CFRefs0, Prototype); + Scm -> + {Scm, CFRefs0} + end, + Shard = open_shard(ShardId, DB, CFRefs, Schema), + S = #s{ + shard_id = ShardId, + db = DB, + cf_refs = CFRefs, + schema = Schema, + shard = Shard + }, + commit_metadata(S), {ok, S}. -handle_call({create_generation, Since, Config}, _From, S) -> - case create_new_gen(Since, Config, S) of - {ok, GenId, NS} -> - {reply, {ok, GenId}, NS}; - {error, _} = Error -> - {reply, Error, S} - end; +handle_call(#call_create_generation{since = Since}, _From, S0) -> + S = add_generation(S0, Since), + commit_metadata(S), + {reply, ok, S}; handle_call(_Call, _From, S) -> {reply, {error, unknown_call}, S}. @@ -281,346 +259,182 @@ handle_cast(_Cast, S) -> handle_info(_Info, S) -> {noreply, S}. -terminate(_Reason, #s{db = DB, shard = Shard}) -> - meta_erase(Shard), +terminate(_Reason, #s{db = DB, shard_id = ShardId}) -> + erase_schema_runtime(ShardId), ok = rocksdb:close(DB). 
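+
+%% Illustrative sketch (not part of this change): a caller-side
+%% wrapper for the record-based request above might look like the
+%% following, assuming it lives next to the other API functions of
+%% this module:
+%%
+%%   -spec add_generation(shard_id(), emqx_ds:time()) -> ok.
+%%   add_generation(ShardId, Since) ->
+%%       gen_server:call(?REF(ShardId), #call_create_generation{since = Since}).
+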
+%%================================================================================ +%% Internal exports +%%================================================================================ + %%================================================================================ %% Internal functions %%================================================================================ --record(db, {handle :: rocksdb:db_handle(), cf_iterator :: rocksdb:cf_handle()}). +-spec open_shard(shard_id(), rocksdb:db_handle(), cf_refs(), shard_schema()) -> + shard(). +open_shard(ShardId, DB, CFRefs, ShardSchema) -> + %% Transform generation schemas to generation runtime data: + maps:map( + fun + ({generation, GenId}, GenSchema) -> + open_generation(ShardId, DB, CFRefs, GenId, GenSchema); + (_K, Val) -> + Val + end, + ShardSchema + ). --spec populate_metadata(state()) -> ok. -populate_metadata(S = #s{shard = Shard, db = DBHandle, cf_iterator = CFIterator}) -> - ok = meta_put(Shard, db, #db{handle = DBHandle, cf_iterator = CFIterator}), - Current = schema_get_current(DBHandle), - lists:foreach(fun(GenId) -> populate_metadata(GenId, S) end, lists:seq(0, Current)). +-spec add_generation(server_state(), emqx_ds:time()) -> server_state(). +add_generation(S0, Since) -> + #s{shard_id = ShardId, db = DB, schema = Schema0, shard = Shard0, cf_refs = CFRefs0} = S0, + {GenId, Schema, NewCFRefs} = new_generation(ShardId, DB, Schema0, Since), + CFRefs = NewCFRefs ++ CFRefs0, + Key = {generation, GenId}, + Generation = open_generation(ShardId, DB, CFRefs, GenId, maps:get(Key, Schema)), + Shard = Shard0#{Key => Generation}, + S0#s{ + cf_refs = CFRefs, + schema = Schema, + shard = Shard + }. --spec populate_metadata(gen_id(), state()) -> ok. -populate_metadata(GenId, S = #s{shard = Shard, db = DBHandle}) -> - Gen = open_gen(GenId, schema_get_gen(DBHandle, GenId), S), - meta_register_gen(Shard, GenId, Gen). +-spec open_generation(shard_id(), rocksdb:db_handle(), cf_refs(), gen_id(), generation_schema()) -> + generation(). +open_generation(ShardId, DB, CFRefs, GenId, GenSchema) -> + ?tp(debug, ds_open_generation, #{gen_id => GenId, schema => GenSchema}), + #{module := Mod, data := Schema} = GenSchema, + RuntimeData = Mod:open(ShardId, DB, GenId, CFRefs, Schema), + GenSchema#{data => RuntimeData}. --spec ensure_current_generation(state()) -> state(). -ensure_current_generation(S = #s{shard = {Keyspace, _ShardId}, db = DBHandle}) -> - case schema_get_current(DBHandle) of - undefined -> - Config = emqx_ds_conf:keyspace_config(Keyspace), - {ok, _, NS} = create_new_gen(0, Config, S), - NS; - _GenId -> - S - end. - --spec create_new_gen(emqx_ds:time(), emqx_ds_conf:backend_config(), state()) -> - {ok, gen_id(), state()} | {error, nonmonotonic}. -create_new_gen(Since, Config, S = #s{shard = Shard, db = DBHandle}) -> - GenId = get_next_id(meta_get_current(Shard)), - GenId = get_next_id(schema_get_current(DBHandle)), - case is_gen_valid(Shard, GenId, Since) of - ok -> - {ok, Gen, NS} = create_gen(GenId, Since, Config, S), - %% TODO: Transaction? Column family creation can't be transactional, anyway. - ok = schema_put_gen(DBHandle, GenId, Gen), - ok = schema_put_current(DBHandle, GenId), - ok = meta_register_gen(Shard, GenId, open_gen(GenId, Gen, NS)), - {ok, GenId, NS}; - {error, _} = Error -> - Error - end. - --spec create_gen(gen_id(), emqx_ds:time(), emqx_ds_conf:backend_config(), state()) -> - {ok, generation(), state()}. 
-create_gen(GenId, Since, {Module, Options}, S = #s{db = DBHandle, cf_generations = CFs}) -> - % TODO: Backend implementation should ensure idempotency. - {Schema, NewCFs} = Module:create_new(DBHandle, GenId, Options), - Gen = #{ - module => Module, - data => Schema, - since => Since +-spec create_new_shard_schema(shard_id(), rocksdb:db_handle(), cf_refs(), prototype()) -> + {shard_schema(), cf_refs()}. +create_new_shard_schema(ShardId, DB, CFRefs, Prototype) -> + ?tp(notice, ds_create_new_shard_schema, #{shard => ShardId, prototype => Prototype}), + %% TODO: read prototype from options/config + Schema0 = #{ + current_generation => 0, + prototype => Prototype }, - {ok, Gen, S#s{cf_generations = NewCFs ++ CFs}}. + {_NewGenId, Schema, NewCFRefs} = new_generation(ShardId, DB, Schema0, _Since = 0), + {Schema, NewCFRefs ++ CFRefs}. --spec open_db(emqx_ds:shard(), options()) -> {ok, state()} | {error, _TODO}. -open_db(Shard = {Keyspace, ShardId}, Options) -> - DefaultDir = filename:join([atom_to_binary(Keyspace), ShardId]), - DBDir = unicode:characters_to_list(maps:get(dir, Options, DefaultDir)), +-spec new_generation(shard_id(), rocksdb:db_handle(), shard_schema(), emqx_ds:time()) -> + {gen_id(), shard_schema(), cf_refs()}. +new_generation(ShardId, DB, Schema0, Since) -> + #{current_generation := PrevGenId, prototype := {Mod, ModConf}} = Schema0, + GenId = PrevGenId + 1, + {GenData, NewCFRefs} = Mod:create(ShardId, DB, GenId, ModConf), + GenSchema = #{module => Mod, data => GenData, since => Since, until => undefined}, + Schema = Schema0#{ + current_generation => GenId, + {generation, GenId} => GenSchema + }, + {GenId, Schema, NewCFRefs}. + +%% @doc Commit current state of the server to both rocksdb and the persistent term +-spec commit_metadata(server_state()) -> ok. +commit_metadata(#s{shard_id = ShardId, schema = Schema, shard = Runtime, db = DB}) -> + ok = put_schema_persistent(DB, Schema), + put_schema_runtime(ShardId, Runtime). + +-spec rocksdb_open(shard_id(), emqx_ds:builtin_db_opts()) -> + {ok, rocksdb:db_handle(), cf_refs()} | {error, _TODO}. +rocksdb_open(Shard, Options) -> DBOptions = [ {create_if_missing, true}, {create_missing_column_families, true} - | emqx_ds_conf:db_options(Keyspace) + | maps:get(db_options, Options, []) ], + DBDir = db_dir(Shard), _ = filelib:ensure_dir(DBDir), ExistingCFs = case rocksdb:list_column_families(DBDir, DBOptions) of {ok, CFs} -> - [{Name, []} || Name <- CFs, Name /= ?DEFAULT_CF, Name /= ?ITERATOR_CF]; + [{Name, []} || Name <- CFs, Name /= ?DEFAULT_CF]; % DB is not present. First start {error, {db_open, _}} -> [] end, ColumnFamilies = [ - {?DEFAULT_CF, ?DEFAULT_CF_OPTS}, - {?ITERATOR_CF, ?ITERATOR_CF_OPTS} + {?DEFAULT_CF, ?DEFAULT_CF_OPTS} | ExistingCFs ], case rocksdb:open(DBDir, DBOptions, ColumnFamilies) of - {ok, DBHandle, [_CFDefault, CFIterator | CFRefs]} -> + {ok, DBHandle, [_CFDefault | CFRefs]} -> {CFNames, _} = lists:unzip(ExistingCFs), - {ok, #s{ - shard = Shard, - db = DBHandle, - cf_iterator = CFIterator, - cf_generations = lists:zip(CFNames, CFRefs) - }}; + {ok, DBHandle, lists:zip(CFNames, CFRefs)}; Error -> Error end. --spec open_gen(gen_id(), generation(), state()) -> generation(). -open_gen( - GenId, - Gen = #{module := Mod, data := Data}, - #s{shard = Shard, db = DBHandle, cf_generations = CFs} -) -> - DB = Mod:open(Shard, DBHandle, GenId, CFs, Data), - Gen#{data := DB}. +-spec db_dir(shard_id()) -> file:filename(). +db_dir({DB, ShardId}) -> + filename:join([emqx:data_dir(), atom_to_list(DB), atom_to_list(ShardId)]). 
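+
+%% For orientation, the shard schema persisted and cached by this
+%% module is a map shaped roughly like this (illustrative values):
+%%
+%%   #{
+%%       current_generation => 2,
+%%       prototype => {emqx_ds_storage_reference, #{}},
+%%       {generation, 1} => #{module => emqx_ds_storage_reference,
+%%                            data => Schema1, since => 0, until => undefined},
+%%       {generation, 2} => #{module => emqx_ds_storage_reference,
+%%                            data => Schema2, since => 100, until => undefined}
+%%   }
+%%
+%% The runtime copy kept in persistent_term has the same shape, except
+%% that `data' holds the backend's runtime state rather than its
+%% persistent schema.
+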
--spec open_next_iterator(iterator()) -> {ok, iterator()} | {error, _Reason} | none.
-open_next_iterator(It = #it{shard = Shard, gen = GenId}) ->
-    open_next_iterator(meta_get_gen(Shard, GenId + 1), It#it{gen = GenId + 1}).
+%%--------------------------------------------------------------------------------
+%% Schema access
+%%--------------------------------------------------------------------------------

-open_next_iterator(undefined, _It) ->
-    none;
-open_next_iterator(Gen = #{}, It) ->
-    open_iterator(Gen, It).
+-spec generation_current(shard_id()) -> gen_id().
+generation_current(Shard) ->
+    #{current_generation := Current} = get_schema_runtime(Shard),
+    Current.

--spec open_iterator(generation(), iterator()) -> {ok, iterator()} | {error, _Reason}.
-open_iterator(#{module := Mod, data := Data}, It = #it{}) ->
-    case Mod:make_iterator(Data, It#it.replay) of
-        {ok, ItData} ->
-            {ok, It#it{module = Mod, data = ItData}};
-        Err ->
-            Err
-    end.
+-spec generation_get(shard_id(), gen_id()) -> generation().
+generation_get(Shard, GenId) ->
+    #{{generation, GenId} := GenData} = get_schema_runtime(Shard),
+    GenData.

--spec open_restore_iterator(generation(), iterator(), binary()) ->
-    {ok, iterator()} | {error, _Reason}.
-open_restore_iterator(#{module := Mod, data := Data}, It = #it{replay = Replay}, Serial) ->
-    case Mod:restore_iterator(Data, Replay, Serial) of
-        {ok, ItData} ->
-            {ok, It#it{module = Mod, data = ItData}};
-        Err ->
-            Err
-    end.
-
-%%
-
--define(KEY_REPLAY_STATE(IteratorId), <<(IteratorId)/binary, "rs">>).
--define(KEY_REPLAY_STATE_PAT(KeyReplayState), begin
-    <<IteratorId:(size(KeyReplayState) - 2)/binary, "rs">> = (KeyReplayState),
-    IteratorId
-end).
-
--define(ITERATION_WRITE_OPTS, []).
--define(ITERATION_READ_OPTS, []).
-
-iterator_get_state(Shard, ReplayID) ->
-    #db{handle = Handle, cf_iterator = CF} = meta_lookup(Shard, db),
-    rocksdb:get(Handle, CF, ?KEY_REPLAY_STATE(ReplayID), ?ITERATION_READ_OPTS).
-
-iterator_put_state(ID, It = #it{shard = Shard}) ->
-    #db{handle = Handle, cf_iterator = CF} = meta_lookup(Shard, db),
-    Serial = preserve_iterator_state(It),
-    rocksdb:put(Handle, CF, ?KEY_REPLAY_STATE(ID), Serial, ?ITERATION_WRITE_OPTS).
-
-iterator_delete(Shard, ID) ->
-    #db{handle = Handle, cf_iterator = CF} = meta_lookup(Shard, db),
-    rocksdb:delete(Handle, CF, ?KEY_REPLAY_STATE(ID), ?ITERATION_WRITE_OPTS).
-
-preserve_iterator_state(#it{
-    gen = Gen,
-    replay = {TopicFilter, StartTime},
-    module = Mod,
-    data = ItData
-}) ->
-    term_to_binary(#{
-        v => 1,
-        gen => Gen,
-        filter => TopicFilter,
-        start => StartTime,
-        st => Mod:preserve_iterator(ItData)
-    }).
-
-restore_iterator_state(Shard, Serial) when is_binary(Serial) ->
-    restore_iterator_state(Shard, binary_to_term(Serial));
-restore_iterator_state(
-    Shard,
-    #{
-        v := 1,
-        gen := Gen,
-        filter := TopicFilter,
-        start := StartTime,
-        st := State
-    }
-) ->
-    It = #it{shard = Shard, gen = Gen, replay = {TopicFilter, StartTime}},
-    open_restore_iterator(meta_get_gen(Shard, Gen), It, State).
-
-do_list_iterator_prefix(Shard, KeyPrefix) ->
-    Fn = fun(K0, _V, Acc) ->
-        K = ?KEY_REPLAY_STATE_PAT(K0),
-        [K | Acc]
-    end,
-    do_foldl_iterator_prefix(Shard, KeyPrefix, Fn, []).
-
-do_discard_iterator_prefix(Shard, KeyPrefix) ->
-    #db{handle = DBHandle, cf_iterator = CF} = meta_lookup(Shard, db),
-    Fn = fun(K, _V, _Acc) -> ok = rocksdb:delete(DBHandle, CF, K, ?ITERATION_WRITE_OPTS) end,
-    do_foldl_iterator_prefix(Shard, KeyPrefix, Fn, ok).
-
-do_foldl_iterator_prefix(Shard, KeyPrefix, Fn, Acc) ->
-    #db{handle = Handle, cf_iterator = CF} = meta_lookup(Shard, db),
-    case rocksdb:iterator(Handle, CF, ?ITERATION_READ_OPTS) of
-        {ok, It} ->
-            NextAction = {seek, KeyPrefix},
-            do_foldl_iterator_prefix(Handle, CF, It, KeyPrefix, NextAction, Fn, Acc);
-        Error ->
-            Error
-    end.
-
-do_foldl_iterator_prefix(DBHandle, CF, It, KeyPrefix, NextAction, Fn, Acc) ->
-    case rocksdb:iterator_move(It, NextAction) of
-        {ok, K = <<KeyPrefix:(size(KeyPrefix))/binary, _/binary>>, V} ->
-            NewAcc = Fn(K, V, Acc),
-            do_foldl_iterator_prefix(DBHandle, CF, It, KeyPrefix, next, Fn, NewAcc);
-        {ok, _K, _V} ->
-            ok = rocksdb:iterator_close(It),
-            {ok, Acc};
-        {error, invalid_iterator} ->
-            ok = rocksdb:iterator_close(It),
-            {ok, Acc};
-        Error ->
-            ok = rocksdb:iterator_close(It),
-            Error
-    end.
-
-%% Functions for dealing with the metadata stored persistently in rocksdb
-
--define(CURRENT_GEN, <<"current">>).
--define(SCHEMA_WRITE_OPTS, []).
--define(SCHEMA_READ_OPTS, []).
-
--spec schema_get_gen(rocksdb:db_handle(), gen_id()) -> generation().
-schema_get_gen(DBHandle, GenId) ->
-    {ok, Bin} = rocksdb:get(DBHandle, schema_gen_key(GenId), ?SCHEMA_READ_OPTS),
-    binary_to_term(Bin).
-
--spec schema_put_gen(rocksdb:db_handle(), gen_id(), generation()) -> ok | {error, _}.
-schema_put_gen(DBHandle, GenId, Gen) ->
-    rocksdb:put(DBHandle, schema_gen_key(GenId), term_to_binary(Gen), ?SCHEMA_WRITE_OPTS).
-
--spec schema_get_current(rocksdb:db_handle()) -> gen_id() | undefined.
-schema_get_current(DBHandle) ->
-    case rocksdb:get(DBHandle, ?CURRENT_GEN, ?SCHEMA_READ_OPTS) of
-        {ok, Bin} ->
-            binary_to_integer(Bin);
-        not_found ->
-            undefined
-    end.
-
--spec schema_put_current(rocksdb:db_handle(), gen_id()) -> ok | {error, _}.
-schema_put_current(DBHandle, GenId) ->
-    rocksdb:put(DBHandle, ?CURRENT_GEN, integer_to_binary(GenId), ?SCHEMA_WRITE_OPTS).
-
--spec schema_gen_key(integer()) -> binary().
-schema_gen_key(N) ->
-    <<"gen", N:32>>.
-
--undef(CURRENT_GEN).
--undef(SCHEMA_WRITE_OPTS).
--undef(SCHEMA_READ_OPTS).
-
-%% Functions for dealing with the runtime shard metadata:
-
--define(PERSISTENT_TERM(SHARD, GEN), {?MODULE, SHARD, GEN}).
-
--spec meta_register_gen(emqx_ds:shard(), gen_id(), generation()) -> ok.
-meta_register_gen(Shard, GenId, Gen) ->
-    Gs =
-        case GenId > 0 of
-            true -> meta_lookup(Shard, GenId - 1);
-            false -> []
+%% Note: `until' is `undefined' for the still-open generation, and in
+%% Erlang term order `undefined' compares greater than any integer, so
+%% the current generation is always included below:
+-spec generations_since(shard_id(), emqx_ds:time()) -> [gen_id()].
+generations_since(Shard, Since) ->
+    Schema = get_schema_runtime(Shard),
+    maps:fold(
+        fun
+            ({generation, GenId}, #{until := Until}, Acc) when Until >= Since ->
+                [GenId | Acc];
+            (_K, _V, Acc) ->
+                Acc
         end,
-    ok = meta_put(Shard, GenId, [Gen | Gs]),
-    ok = meta_put(Shard, current, GenId).
+        [],
+        Schema
+    ).

--spec meta_lookup_gen(emqx_ds:shard(), emqx_ds:time()) -> {gen_id(), generation()}.
-meta_lookup_gen(Shard, Time) ->
-    % TODO
-    % Is cheaper persistent term GC on update here worth extra lookup? I'm leaning
-    % towards a "no".
-    Current = meta_lookup(Shard, current),
-    Gens = meta_lookup(Shard, Current),
-    find_gen(Time, Current, Gens).
+-define(PERSISTENT_TERM(SHARD), {emqx_ds_storage_layer, SHARD}).

-find_gen(Time, GenId, [Gen = #{since := Since} | _]) when Time >= Since ->
-    {GenId, Gen};
-find_gen(Time, GenId, [_Gen | Rest]) ->
-    find_gen(Time, GenId - 1, Rest).
+-spec get_schema_runtime(shard_id()) -> shard().
+get_schema_runtime(Shard) ->
+    persistent_term:get(?PERSISTENT_TERM(Shard)).

--spec meta_get_gen(emqx_ds:shard(), gen_id()) -> generation() | undefined.
-meta_get_gen(Shard, GenId) ->
-    case meta_lookup(Shard, GenId, []) of
-        [Gen | _Older] -> Gen;
-        [] -> undefined
-    end.
+-spec put_schema_runtime(shard_id(), shard()) -> ok.
+put_schema_runtime(Shard, RuntimeSchema) ->
+    persistent_term:put(?PERSISTENT_TERM(Shard), RuntimeSchema),
+    ok.

--spec meta_get_current(emqx_ds:shard()) -> gen_id() | undefined.
-meta_get_current(Shard) ->
-    meta_lookup(Shard, current, undefined).
-
--spec meta_lookup(emqx_ds:shard(), _K) -> _V.
-meta_lookup(Shard, K) ->
-    persistent_term:get(?PERSISTENT_TERM(Shard, K)).
-
--spec meta_lookup(emqx_ds:shard(), _K, Default) -> _V | Default.
-meta_lookup(Shard, K, Default) ->
-    persistent_term:get(?PERSISTENT_TERM(Shard, K), Default).
-
--spec meta_put(emqx_ds:shard(), _K, _V) -> ok.
-meta_put(Shard, K, V) ->
-    persistent_term:put(?PERSISTENT_TERM(Shard, K), V).
-
--spec meta_erase(emqx_ds:shard()) -> ok.
-meta_erase(Shard) ->
-    [
-        persistent_term:erase(K)
-     || {K = ?PERSISTENT_TERM(Z, _), _} <- persistent_term:get(), Z =:= Shard
-    ],
+-spec erase_schema_runtime(shard_id()) -> ok.
+erase_schema_runtime(Shard) ->
+    persistent_term:erase(?PERSISTENT_TERM(Shard)),
     ok.

 -undef(PERSISTENT_TERM).

-get_next_id(undefined) -> 0;
-get_next_id(GenId) -> GenId + 1.
+-define(ROCKSDB_SCHEMA_KEY, <<"schema_v1">>).

-is_gen_valid(Shard, GenId, Since) when GenId > 0 ->
-    [GenPrev | _] = meta_lookup(Shard, GenId - 1),
-    case GenPrev of
-        #{since := SincePrev} when Since > SincePrev ->
-            ok;
-        #{} ->
-            {error, nonmonotonic}
-    end;
-is_gen_valid(_Shard, 0, 0) ->
-    ok.
+-spec get_schema_persistent(rocksdb:db_handle()) -> shard_schema() | not_found.
+get_schema_persistent(DB) ->
+    case rocksdb:get(DB, ?ROCKSDB_SCHEMA_KEY, []) of
+        {ok, Blob} ->
+            Schema = binary_to_term(Blob),
+            %% Sanity check:
+            #{current_generation := _, prototype := _} = Schema,
+            Schema;
+        not_found ->
+            not_found
+    end.

-%% -spec store_cfs(rocksdb:db_handle(), [{string(), rocksdb:cf_handle()}]) -> ok.
-%% store_cfs(DBHandle, CFRefs) ->
-%%     lists:foreach(
-%%         fun({CFName, CFRef}) ->
-%%             persistent_term:put({self(), CFName}, {DBHandle, CFRef})
-%%         end,
-%%         CFRefs).
+-spec put_schema_persistent(rocksdb:db_handle(), shard_schema()) -> ok.
+put_schema_persistent(DB, Schema) ->
+    Blob = term_to_binary(Schema),
+    rocksdb:put(DB, ?ROCKSDB_SCHEMA_KEY, Blob, []).
+
+-undef(ROCKSDB_SCHEMA_KEY).
diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer_sup.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer_sup.erl
index 56c8c760a..fac7204bf 100644
--- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer_sup.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer_sup.erl
@@ -6,7 +6,7 @@
 -behaviour(supervisor).

 %% API:
--export([start_link/0, start_shard/2, stop_shard/1]).
+-export([start_link/0, start_shard/2, stop_shard/1, ensure_shard/2]).

 %% behaviour callbacks:
 -export([init/1]).
@@ -25,7 +25,7 @@
 start_link() ->
     supervisor:start_link({local, ?SUP}, ?MODULE, []).

--spec start_shard(emqx_ds:shard(), emqx_ds_storage_layer:options()) ->
+-spec start_shard(emqx_ds_replication_layer:shard_id(), emqx_ds:create_db_opts()) ->
     supervisor:startchild_ret().
 start_shard(Shard, Options) ->
     supervisor:start_child(?SUP, shard_child_spec(Shard, Options)).
@@ -35,6 +35,17 @@
 stop_shard(Shard) ->
     ok = supervisor:terminate_child(?SUP, Shard),
     ok = supervisor:delete_child(?SUP, Shard).

+-spec ensure_shard(emqx_ds_replication_layer:shard_id(), emqx_ds:create_db_opts()) ->
+    ok | {error, _Reason}.
+ensure_shard(Shard, Options) ->
+    case start_shard(Shard, Options) of
+        {ok, _Pid} ->
+            ok;
+        {error, {already_started, _Pid}} ->
+            ok;
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
 %%================================================================================
 %% behaviour callbacks
 %%================================================================================
@@ -52,7 +63,7 @@
 %% Internal functions
 %%================================================================================

--spec shard_child_spec(emqx_ds:shard(), emqx_ds_storage_layer:options()) ->
+-spec shard_child_spec(emqx_ds_replication_layer:shard_id(), emqx_ds:create_db_opts()) ->
     supervisor:child_spec().
 shard_child_spec(Shard, Options) ->
     #{
diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_reference.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_reference.erl
new file mode 100644
index 000000000..6676faf88
--- /dev/null
+++ b/apps/emqx_durable_storage/src/emqx_ds_storage_reference.erl
@@ -0,0 +1,139 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+%% @doc Reference implementation of the storage.
+%%
+%% Trivial, extremely slow and inefficient. It also doesn't handle
+%% restart of the Erlang node properly, so obviously it's only to be
+%% used for testing.
+-module(emqx_ds_storage_reference).
+
+-behaviour(emqx_ds_storage_layer).
+
+%% API:
+-export([]).
+
+%% behavior callbacks:
+-export([create/4, open/5, store_batch/4, get_streams/4, make_iterator/5, next/4]).
+
+%% internal exports:
+-export([]).
+
+-export_type([options/0]).
+
+-include_lib("emqx_utils/include/emqx_message.hrl").
+
+%%================================================================================
+%% Type declarations
+%%================================================================================
+
+-type options() :: #{}.
+
+%% Permanent state:
+-record(schema, {}).
+
+%% Runtime state:
+-record(s, {
+    db :: rocksdb:db_handle(),
+    cf :: rocksdb:cf_handle()
+}).
+
+-record(stream, {}).
+
+-record(it, {
+    topic_filter :: emqx_ds:topic_filter(),
+    start_time :: emqx_ds:time(),
+    last_seen_message_key = first :: binary() | first
+}).
+
+%%================================================================================
+%% API functions
+%%================================================================================
+
+%%================================================================================
+%% behavior callbacks
+%%================================================================================
+
+create(_ShardId, DBHandle, GenId, _Options) ->
+    CFName = data_cf(GenId),
+    {ok, CFHandle} = rocksdb:create_column_family(DBHandle, CFName, []),
+    Schema = #schema{},
+    {Schema, [{CFName, CFHandle}]}.
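+
+%% Note: `create/4' runs once per generation; the schema term returned
+%% above is persisted by emqx_ds_storage_layer, and the column family
+%% is looked up from it again on node restart via `open/5' below.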
+
+open(_Shard, DBHandle, GenId, CFRefs, #schema{}) ->
+    {_, CF} = lists:keyfind(data_cf(GenId), 1, CFRefs),
+    #s{db = DBHandle, cf = CF}.
+
+store_batch(_ShardId, #s{db = DB, cf = CF}, Messages, _Options) ->
+    lists:foreach(
+        fun(Msg) ->
+            Id = erlang:unique_integer([monotonic]),
+            Key = <<Id:64>>,
+            Val = term_to_binary(Msg),
+            rocksdb:put(DB, CF, Key, Val, [])
+        end,
+        Messages
+    ).
+
+get_streams(_Shard, _Data, _TopicFilter, _StartTime) ->
+    [#stream{}].
+
+make_iterator(_Shard, _Data, #stream{}, TopicFilter, StartTime) ->
+    {ok, #it{
+        topic_filter = TopicFilter,
+        start_time = StartTime
+    }}.
+
+next(_Shard, #s{db = DB, cf = CF}, It0, BatchSize) ->
+    #it{topic_filter = TopicFilter, start_time = StartTime, last_seen_message_key = Key0} = It0,
+    {ok, ITHandle} = rocksdb:iterator(DB, CF, []),
+    Action =
+        case Key0 of
+            first ->
+                first;
+            _ ->
+                _ = rocksdb:iterator_move(ITHandle, Key0),
+                next
+        end,
+    {Key, Messages} = do_next(TopicFilter, StartTime, ITHandle, Action, BatchSize, Key0, []),
+    rocksdb:iterator_close(ITHandle),
+    It = It0#it{last_seen_message_key = Key},
+    {ok, It, lists:reverse(Messages)}.
+
+%%================================================================================
+%% Internal functions
+%%================================================================================
+
+do_next(_, _, _, _, 0, Key, Acc) ->
+    {Key, Acc};
+do_next(TopicFilter, StartTime, IT, Action, NLeft, Key0, Acc) ->
+    case rocksdb:iterator_move(IT, Action) of
+        {ok, Key, Blob} ->
+            Msg = #message{topic = Topic, timestamp = TS} = binary_to_term(Blob),
+            case emqx_topic:match(Topic, TopicFilter) andalso TS >= StartTime of
+                true ->
+                    do_next(TopicFilter, StartTime, IT, next, NLeft - 1, Key, [Msg | Acc]);
+                false ->
+                    do_next(TopicFilter, StartTime, IT, next, NLeft, Key, Acc)
+            end;
+        {error, invalid_iterator} ->
+            {Key0, Acc}
+    end.
+
+%% @doc Generate a column family ID for the MQTT messages
+-spec data_cf(emqx_ds_storage_layer:gen_id()) -> [char()].
+data_cf(GenId) ->
+    "emqx_ds_storage_reference" ++ integer_to_list(GenId).
diff --git a/apps/emqx_durable_storage/src/emqx_ds_sup.erl b/apps/emqx_durable_storage/src/emqx_ds_sup.erl
index ca939e892..d371a2346 100644
--- a/apps/emqx_durable_storage/src/emqx_ds_sup.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds_sup.erl
@@ -30,7 +30,7 @@ start_link() ->
 %%================================================================================

 init([]) ->
-    Children = [shard_sup()],
+    Children = [storage_layer_sup()],
     SupFlags = #{
        strategy => one_for_all,
        intensity => 0,
@@ -42,7 +42,7 @@ init([]) ->
 %% Internal functions
 %%================================================================================

-shard_sup() ->
+storage_layer_sup() ->
     #{
        id => local_store_shard_sup,
        start => {emqx_ds_storage_layer_sup, start_link, []},
diff --git a/apps/emqx_durable_storage/src/emqx_durable_storage.app.src b/apps/emqx_durable_storage/src/emqx_durable_storage.app.src
index 6edbfda9b..f106494c8 100644
--- a/apps/emqx_durable_storage/src/emqx_durable_storage.app.src
+++ b/apps/emqx_durable_storage/src/emqx_durable_storage.app.src
@@ -5,7 +5,7 @@
     {vsn, "0.1.6"},
     {modules, []},
     {registered, []},
-    {applications, [kernel, stdlib, rocksdb, gproc, mria]},
+    {applications, [kernel, stdlib, rocksdb, gproc, mria, emqx_utils]},
     {mod, {emqx_ds_app, []}},
     {env, []}
 ]}.
diff --git a/apps/emqx_durable_storage/src/proto/emqx_ds_proto_v1.erl b/apps/emqx_durable_storage/src/proto/emqx_ds_proto_v1.erl
new file mode 100644
index 000000000..17e873ecd
--- /dev/null
+++ b/apps/emqx_durable_storage/src/proto/emqx_ds_proto_v1.erl
@@ -0,0 +1,75 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+-module(emqx_ds_proto_v1).
+
+-behavior(emqx_bpapi).
+
+-include_lib("emqx_utils/include/bpapi.hrl").
+%% API:
+-export([open_shard/3, drop_shard/2, get_streams/4, make_iterator/5, next/4]).
+
+%% behavior callbacks:
+-export([introduced_in/0]).
+
+%%================================================================================
+%% API functions
+%%================================================================================
+
+-spec open_shard(node(), emqx_ds_replication_layer:shard_id(), emqx_ds:create_db_opts()) ->
+    ok.
+open_shard(Node, Shard, Opts) ->
+    erpc:call(Node, emqx_ds_replication_layer, do_open_shard_v1, [Shard, Opts]).
+
+-spec drop_shard(node(), emqx_ds_replication_layer:shard_id()) ->
+    ok.
+drop_shard(Node, Shard) ->
+    erpc:call(Node, emqx_ds_replication_layer, do_drop_shard_v1, [Shard]).
+
+-spec get_streams(
+    node(), emqx_ds_replication_layer:shard_id(), emqx_ds:topic_filter(), emqx_ds:time()
+) ->
+    [{integer(), emqx_ds_replication_layer:stream()}].
+get_streams(Node, Shard, TopicFilter, Time) ->
+    erpc:call(Node, emqx_ds_replication_layer, do_get_streams_v1, [Shard, TopicFilter, Time]).
+
+-spec make_iterator(
+    node(),
+    emqx_ds_replication_layer:shard_id(),
+    emqx_ds_storage_layer:stream(),
+    emqx_ds:topic_filter(),
+    emqx_ds:time()
+) ->
+    {ok, emqx_ds_replication_layer:iterator()} | {error, _}.
+make_iterator(Node, Shard, Stream, TopicFilter, StartTime) ->
+    erpc:call(Node, emqx_ds_replication_layer, do_make_iterator_v1, [
+        Shard, Stream, TopicFilter, StartTime
+    ]).
+
+-spec next(
+    node(), emqx_ds_replication_layer:shard_id(), emqx_ds_storage_layer:iterator(), pos_integer()
+) ->
+    {ok, emqx_ds_storage_layer:iterator(), [emqx_types:message()]}
+    | {ok, end_of_stream}
+    | {error, _}.
+next(Node, Shard, Iter, BatchSize) ->
+    erpc:call(Node, emqx_ds_replication_layer, do_next_v1, [Shard, Iter, BatchSize]).
+
+%%================================================================================
+%% behavior callbacks
+%%================================================================================
+
+introduced_in() ->
+    "5.4.0".
diff --git a/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl
new file mode 100644
index 000000000..9637431d3
--- /dev/null
+++ b/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl
@@ -0,0 +1,146 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- +-module(emqx_ds_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("emqx/include/emqx.hrl"). +-include_lib("common_test/include/ct.hrl"). +-include_lib("stdlib/include/assert.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +opts() -> + #{ + backend => builtin, + storage => {emqx_ds_storage_reference, #{}} + }. + +%% A simple smoke test that verifies that opening/closing the DB +%% doesn't crash, and not much else +t_00_smoke_open_drop(_Config) -> + DB = 'DB', + ?assertMatch(ok, emqx_ds:open_db(DB, opts())), + ?assertMatch(ok, emqx_ds:open_db(DB, opts())), + ?assertMatch(ok, emqx_ds:drop_db(DB)). + +%% A simple smoke test that verifies that storing the messages doesn't +%% crash +t_01_smoke_store(_Config) -> + DB = default, + ?assertMatch(ok, emqx_ds:open_db(DB, opts())), + Msg = message(<<"foo/bar">>, <<"foo">>, 0), + ?assertMatch(ok, emqx_ds:store_batch(DB, [Msg])). + +%% A simple smoke test that verifies that getting the list of streams +%% doesn't crash and that iterators can be opened. +t_02_smoke_get_streams_start_iter(_Config) -> + DB = ?FUNCTION_NAME, + ?assertMatch(ok, emqx_ds:open_db(DB, opts())), + StartTime = 0, + TopicFilter = ['#'], + [{Rank, Stream}] = emqx_ds:get_streams(DB, TopicFilter, StartTime), + ?assertMatch({_, _}, Rank), + ?assertMatch({ok, _Iter}, emqx_ds:make_iterator(Stream, TopicFilter, StartTime)). + +%% A simple smoke test that verifies that it's possible to iterate +%% over messages. +t_03_smoke_iterate(_Config) -> + DB = ?FUNCTION_NAME, + ?assertMatch(ok, emqx_ds:open_db(DB, opts())), + StartTime = 0, + TopicFilter = ['#'], + Msgs = [ + message(<<"foo/bar">>, <<"1">>, 0), + message(<<"foo">>, <<"2">>, 1), + message(<<"bar/bar">>, <<"3">>, 2) + ], + ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs)), + [{_, Stream}] = emqx_ds:get_streams(DB, TopicFilter, StartTime), + {ok, Iter0} = emqx_ds:make_iterator(Stream, TopicFilter, StartTime), + {ok, Iter, Batch} = iterate(Iter0, 1), + ?assertEqual(Msgs, Batch, {Iter0, Iter}). + +%% Verify that iterators survive restart of the application. This is +%% an important property, since the lifetime of the iterators is tied +%% to the external resources, such as clients' sessions, and they +%% should always be able to continue replaying the topics from where +%% they are left off. 
+t_04_restart(_Config) -> + DB = ?FUNCTION_NAME, + ?assertMatch(ok, emqx_ds:open_db(DB, opts())), + TopicFilter = ['#'], + StartTime = 0, + Msgs = [ + message(<<"foo/bar">>, <<"1">>, 0), + message(<<"foo">>, <<"2">>, 1), + message(<<"bar/bar">>, <<"3">>, 2) + ], + ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs)), + [{_, Stream}] = emqx_ds:get_streams(DB, TopicFilter, StartTime), + {ok, Iter0} = emqx_ds:make_iterator(Stream, TopicFilter, StartTime), + %% Restart the application: + ?tp(warning, emqx_ds_SUITE_restart_app, #{}), + ok = application:stop(emqx_durable_storage), + {ok, _} = application:ensure_all_started(emqx_durable_storage), + ok = emqx_ds:open_db(DB, opts()), + %% The old iterator should be still operational: + {ok, Iter, Batch} = iterate(Iter0, 1), + ?assertEqual(Msgs, Batch, {Iter0, Iter}). + +message(Topic, Payload, PublishedAt) -> + #message{ + topic = Topic, + payload = Payload, + timestamp = PublishedAt, + id = emqx_guid:gen() + }. + +iterate(It, BatchSize) -> + iterate(It, BatchSize, []). + +iterate(It0, BatchSize, Acc) -> + case emqx_ds:next(It0, BatchSize) of + {ok, It, []} -> + {ok, It, Acc}; + {ok, It, Msgs} -> + iterate(It, BatchSize, Acc ++ Msgs); + Ret -> + Ret + end. + +%% CT callbacks + +all() -> emqx_common_test_helpers:all(?MODULE). + +init_per_suite(Config) -> + Apps = emqx_cth_suite:start( + [mria, emqx_durable_storage], + #{work_dir => ?config(priv_dir, Config)} + ), + [{apps, Apps} | Config]. + +end_per_suite(Config) -> + ok = emqx_cth_suite:stop(?config(apps, Config)), + ok. + +init_per_testcase(_TC, Config) -> + %% snabbkaffe:fix_ct_logging(), + application:ensure_all_started(emqx_durable_storage), + Config. + +end_per_testcase(_TC, _Config) -> + ok = application:stop(emqx_durable_storage). diff --git a/apps/emqx_durable_storage/test/emqx_ds_message_storage_bitmask_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_message_storage_bitmask_SUITE.erl deleted file mode 100644 index 599bd6c7b..000000000 --- a/apps/emqx_durable_storage/test/emqx_ds_message_storage_bitmask_SUITE.erl +++ /dev/null @@ -1,188 +0,0 @@ -%%-------------------------------------------------------------------- -%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. -%%-------------------------------------------------------------------- --module(emqx_ds_message_storage_bitmask_SUITE). - --compile(export_all). --compile(nowarn_export_all). - --include_lib("stdlib/include/assert.hrl"). - --import(emqx_ds_message_storage_bitmask, [ - make_keymapper/1, - keymapper_info/1, - compute_topic_bitmask/2, - compute_time_bitmask/1, - compute_topic_seek/4 -]). - -all() -> emqx_common_test_helpers:all(?MODULE). - -t_make_keymapper(_) -> - ?assertMatch( - #{ - source := [ - {timestamp, 9, 23}, - {hash, level, 2}, - {hash, level, 4}, - {hash, levels, 8}, - {timestamp, 0, 9} - ], - bitsize := 46, - epoch := 512 - }, - keymapper_info( - make_keymapper(#{ - timestamp_bits => 32, - topic_bits_per_level => [2, 4, 8], - epoch => 1000 - }) - ) - ). - -t_make_keymapper_single_hash_level(_) -> - ?assertMatch( - #{ - source := [ - {timestamp, 0, 32}, - {hash, levels, 16} - ], - bitsize := 48, - epoch := 1 - }, - keymapper_info( - make_keymapper(#{ - timestamp_bits => 32, - topic_bits_per_level => [16], - epoch => 1 - }) - ) - ). 
- -t_make_keymapper_no_timestamp(_) -> - ?assertMatch( - #{ - source := [ - {hash, level, 4}, - {hash, level, 8}, - {hash, levels, 16} - ], - bitsize := 28, - epoch := 1 - }, - keymapper_info( - make_keymapper(#{ - timestamp_bits => 0, - topic_bits_per_level => [4, 8, 16], - epoch => 42 - }) - ) - ). - -t_compute_topic_bitmask(_) -> - KM = make_keymapper(#{topic_bits_per_level => [3, 4, 5, 2], timestamp_bits => 0, epoch => 1}), - ?assertEqual( - 2#111_1111_11111_11, - compute_topic_bitmask([<<"foo">>, <<"bar">>], KM) - ), - ?assertEqual( - 2#111_0000_11111_11, - compute_topic_bitmask([<<"foo">>, '+'], KM) - ), - ?assertEqual( - 2#111_0000_00000_11, - compute_topic_bitmask([<<"foo">>, '+', '+'], KM) - ), - ?assertEqual( - 2#111_0000_11111_00, - compute_topic_bitmask([<<"foo">>, '+', <<"bar">>, '+'], KM) - ). - -t_compute_topic_bitmask_wildcard(_) -> - KM = make_keymapper(#{topic_bits_per_level => [3, 4, 5, 2], timestamp_bits => 0, epoch => 1}), - ?assertEqual( - 2#000_0000_00000_00, - compute_topic_bitmask(['#'], KM) - ), - ?assertEqual( - 2#111_0000_00000_00, - compute_topic_bitmask([<<"foo">>, '#'], KM) - ), - ?assertEqual( - 2#111_1111_11111_00, - compute_topic_bitmask([<<"foo">>, <<"bar">>, <<"baz">>, '#'], KM) - ). - -t_compute_topic_bitmask_wildcard_long_tail(_) -> - KM = make_keymapper(#{topic_bits_per_level => [3, 4, 5, 2], timestamp_bits => 0, epoch => 1}), - ?assertEqual( - 2#111_1111_11111_11, - compute_topic_bitmask([<<"foo">>, <<"bar">>, <<"baz">>, <<>>, <<"xyzzy">>], KM) - ), - ?assertEqual( - 2#111_1111_11111_00, - compute_topic_bitmask([<<"foo">>, <<"bar">>, <<"baz">>, <<>>, '#'], KM) - ). - -t_compute_time_bitmask(_) -> - KM = make_keymapper(#{topic_bits_per_level => [1, 2, 3], timestamp_bits => 10, epoch => 200}), - ?assertEqual(2#111_000000_1111111, compute_time_bitmask(KM)). - -t_compute_time_bitmask_epoch_only(_) -> - KM = make_keymapper(#{topic_bits_per_level => [1, 2, 3], timestamp_bits => 10, epoch => 1}), - ?assertEqual(2#1111111111_000000, compute_time_bitmask(KM)). - -%% Filter = |123|***|678|***| -%% Mask = |123|***|678|***| -%% Key1 = |123|011|108|121| → Seek = 0 |123|011|678|000| -%% Key2 = |123|011|679|919| → Seek = 0 |123|012|678|000| -%% Key3 = |123|999|679|001| → Seek = 1 |123|000|678|000| → eos -%% Key4 = |125|011|179|017| → Seek = 1 |123|000|678|000| → eos - -t_compute_next_topic_seek(_) -> - KM = make_keymapper(#{topic_bits_per_level => [8, 8, 16, 12], timestamp_bits => 0, epoch => 1}), - ?assertMatch( - none, - compute_topic_seek( - 16#FD_42_4242_043, - 16#FD_42_4242_042, - 16#FF_FF_FFFF_FFF, - KM - ) - ), - ?assertMatch( - 16#FD_11_0678_000, - compute_topic_seek( - 16#FD_11_0108_121, - 16#FD_00_0678_000, - 16#FF_00_FFFF_000, - KM - ) - ), - ?assertMatch( - 16#FD_12_0678_000, - compute_topic_seek( - 16#FD_11_0679_919, - 16#FD_00_0678_000, - 16#FF_00_FFFF_000, - KM - ) - ), - ?assertMatch( - none, - compute_topic_seek( - 16#FD_FF_0679_001, - 16#FD_00_0678_000, - 16#FF_00_FFFF_000, - KM - ) - ), - ?assertMatch( - none, - compute_topic_seek( - 16#FE_11_0179_017, - 16#FD_00_0678_000, - 16#FF_00_FFFF_000, - KM - ) - ). diff --git a/apps/emqx_durable_storage/test/emqx_ds_storage_bitfield_lts_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_storage_bitfield_lts_SUITE.erl new file mode 100644 index 000000000..6dc24a269 --- /dev/null +++ b/apps/emqx_durable_storage/test/emqx_ds_storage_bitfield_lts_SUITE.erl @@ -0,0 +1,396 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. 
All Rights Reserved. +%%-------------------------------------------------------------------- +-module(emqx_ds_storage_bitfield_lts_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("emqx/include/emqx.hrl"). +-include_lib("common_test/include/ct.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). +-include_lib("stdlib/include/assert.hrl"). + +-define(SHARD, shard(?FUNCTION_NAME)). + +-define(DEFAULT_CONFIG, #{ + backend => builtin, + storage => {emqx_ds_storage_bitfield_lts, #{}} +}). + +-define(COMPACT_CONFIG, #{ + backend => builtin, + storage => + {emqx_ds_storage_bitfield_lts, #{ + bits_per_wildcard_level => 8 + }} +}). + +%% Smoke test for opening and reopening the database +t_open(_Config) -> + ok = emqx_ds_storage_layer_sup:stop_shard(?SHARD), + {ok, _} = emqx_ds_storage_layer_sup:start_shard(?SHARD, #{}). + +%% Smoke test of store function +t_store(_Config) -> + MessageID = emqx_guid:gen(), + PublishedAt = 1000, + Topic = <<"foo/bar">>, + Payload = <<"message">>, + Msg = #message{ + id = MessageID, + topic = Topic, + payload = Payload, + timestamp = PublishedAt + }, + ?assertMatch(ok, emqx_ds_storage_layer:store_batch(?SHARD, [Msg], #{})). + +%% Smoke test for iteration through a concrete topic +t_iterate(_Config) -> + %% Prepare data: + Topics = [<<"foo/bar">>, <<"foo/bar/baz">>, <<"a">>], + Timestamps = lists:seq(1, 10), + Batch = [ + make_message(PublishedAt, Topic, integer_to_binary(PublishedAt)) + || Topic <- Topics, PublishedAt <- Timestamps + ], + ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch, []), + %% Iterate through individual topics: + [ + begin + [{_Rank, Stream}] = emqx_ds_storage_layer:get_streams(?SHARD, parse_topic(Topic), 0), + {ok, It} = emqx_ds_storage_layer:make_iterator(?SHARD, Stream, parse_topic(Topic), 0), + {ok, NextIt, Messages} = emqx_ds_storage_layer:next(?SHARD, It, 100), + ?assertEqual( + lists:map(fun integer_to_binary/1, Timestamps), + payloads(Messages) + ), + {ok, _, []} = emqx_ds_storage_layer:next(?SHARD, NextIt, 100) + end + || Topic <- Topics + ], + ok. + +-define(assertSameSet(A, B), ?assertEqual(lists:sort(A), lists:sort(B))). + +%% Smoke test that verifies that concrete topics are mapped to +%% individual streams, unless there's too many of them. 
+t_get_streams(_Config) -> + %% Prepare data (without wildcards): + Topics = [<<"foo/bar">>, <<"foo/bar/baz">>, <<"a">>], + Timestamps = lists:seq(1, 10), + Batch = [ + make_message(PublishedAt, Topic, integer_to_binary(PublishedAt)) + || Topic <- Topics, PublishedAt <- Timestamps + ], + ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch, []), + GetStream = fun(Topic) -> + StartTime = 0, + emqx_ds_storage_layer:get_streams(?SHARD, parse_topic(Topic), StartTime) + end, + %% Get streams for individual topics to use as a reference for later: + [FooBar = {_, _}] = GetStream(<<"foo/bar">>), + [FooBarBaz] = GetStream(<<"foo/bar/baz">>), + [A] = GetStream(<<"a">>), + %% Restart shard to make sure trie is persisted and restored: + ok = emqx_ds_storage_layer_sup:stop_shard(?SHARD), + {ok, _} = emqx_ds_storage_layer_sup:start_shard(?SHARD, #{}), + %% Verify that there are no "ghost streams" for topics that don't + %% have any messages: + [] = GetStream(<<"bar/foo">>), + %% Test some wildcard patterns: + ?assertEqual([FooBar], GetStream("+/+")), + ?assertSameSet([FooBar, FooBarBaz], GetStream(<<"foo/#">>)), + ?assertSameSet([FooBar, FooBarBaz, A], GetStream(<<"#">>)), + %% Now insert a bunch of messages with different topics to create wildcards: + NewBatch = [ + begin + B = integer_to_binary(I), + make_message(100, <<"foo/bar/", B/binary>>, <<"filler", B/binary>>) + end + || I <- lists:seq(1, 200) + ], + ok = emqx_ds_storage_layer:store_batch(?SHARD, NewBatch, []), + %% Check that "foo/bar/baz" topic now appears in two streams: + %% "foo/bar/baz" and "foo/bar/+": + NewStreams = lists:sort(GetStream("foo/bar/baz")), + ?assertMatch([_, _], NewStreams), + ?assert(lists:member(FooBarBaz, NewStreams)), + %% Verify that size of the trie is still relatively small, even + %% after processing 200+ topics: + AllStreams = GetStream("#"), + NTotal = length(AllStreams), + ?assert(NTotal < 30, {NTotal, '<', 30}), + ?assert(lists:member(FooBar, AllStreams)), + ?assert(lists:member(FooBarBaz, AllStreams)), + ?assert(lists:member(A, AllStreams)), + ok. 
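+
+%% Exhaustive replay check: store messages under concrete and learned
+%% wildcard topics, then verify that every matching message (and no
+%% ghost message) is replayed for a variety of topic filters, both
+%% before and after a shard restart.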
+ +t_replay(_Config) -> + %% Create concrete topics: + Topics = [<<"foo/bar">>, <<"foo/bar/baz">>], + Timestamps = lists:seq(1, 10_000, 100), + Batch1 = [ + make_message(PublishedAt, Topic, integer_to_binary(PublishedAt)) + || Topic <- Topics, PublishedAt <- Timestamps + ], + ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch1, []), + %% Create wildcard topics `wildcard/+/suffix/foo' and `wildcard/+/suffix/bar': + Batch2 = [ + begin + B = integer_to_binary(I), + make_message( + TS, <<"wildcard/", B/binary, "/suffix/", Suffix/binary>>, integer_to_binary(TS) + ) + end + || I <- lists:seq(1, 200), TS <- Timestamps, Suffix <- [<<"foo">>, <<"bar">>] + ], + ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch2, []), + %% Check various topic filters: + Messages = Batch1 ++ Batch2, + %% Missing topics (no ghost messages): + ?assertNot(check(?SHARD, <<"missing/foo/bar">>, 0, Messages)), + %% Regular topics: + ?assert(check(?SHARD, <<"foo/bar">>, 0, Messages)), + ?assert(check(?SHARD, <<"foo/bar/baz">>, 0, Messages)), + ?assert(check(?SHARD, <<"foo/#">>, 0, Messages)), + ?assert(check(?SHARD, <<"foo/+">>, 0, Messages)), + ?assert(check(?SHARD, <<"foo/+/+">>, 0, Messages)), + ?assert(check(?SHARD, <<"+/+/+">>, 0, Messages)), + ?assert(check(?SHARD, <<"+/+/baz">>, 0, Messages)), + %% Restart shard to make sure trie is persisted and restored: + ok = emqx_ds_storage_layer_sup:stop_shard(?SHARD), + {ok, _} = emqx_ds_storage_layer_sup:start_shard(?SHARD, #{}), + %% Learned wildcard topics: + ?assertNot(check(?SHARD, <<"wildcard/1000/suffix/foo">>, 0, [])), + ?assert(check(?SHARD, <<"wildcard/1/suffix/foo">>, 0, Messages)), + ?assert(check(?SHARD, <<"wildcard/100/suffix/foo">>, 0, Messages)), + ?assert(check(?SHARD, <<"wildcard/+/suffix/foo">>, 0, Messages)), + ?assert(check(?SHARD, <<"wildcard/1/suffix/+">>, 0, Messages)), + ?assert(check(?SHARD, <<"wildcard/100/suffix/+">>, 0, Messages)), + ?assert(check(?SHARD, <<"wildcard/#">>, 0, Messages)), + ?assert(check(?SHARD, <<"wildcard/1/#">>, 0, Messages)), + ?assert(check(?SHARD, <<"wildcard/100/#">>, 0, Messages)), + ?assert(check(?SHARD, <<"#">>, 0, Messages)), + ok. + +check(Shard, TopicFilter, StartTime, ExpectedMessages) -> + ExpectedFiltered = lists:filter( + fun(#message{topic = Topic, timestamp = TS}) -> + emqx_topic:match(Topic, TopicFilter) andalso TS >= StartTime + end, + ExpectedMessages + ), + ?check_trace( + #{timetrap => 10_000}, + begin + Dump = dump_messages(Shard, TopicFilter, StartTime), + verify_dump(TopicFilter, StartTime, Dump), + Missing = ExpectedFiltered -- Dump, + Extras = Dump -- ExpectedFiltered, + ?assertMatch( + #{missing := [], unexpected := []}, + #{ + missing => Missing, + unexpected => Extras, + topic_filter => TopicFilter, + start_time => StartTime + } + ) + end, + [] + ), + length(ExpectedFiltered) > 0. 
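+
+%% Note: check/4 above returns true only when the topic filter matched
+%% at least one stored message, so `?assertNot(check(...))' asserts
+%% that the replay is empty rather than that it merely succeeded.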
+
+verify_dump(TopicFilter, StartTime, Dump) ->
+    lists:foldl(
+        fun(#message{topic = Topic, timestamp = TS}, Acc) ->
+            %% Verify that the topic of the message returned by the
+            %% iterator matches the expected topic filter:
+            ?assert(emqx_topic:match(Topic, TopicFilter), {unexpected_topic, Topic, TopicFilter}),
+            %% Verify that timestamp of the message is greater than
+            %% the StartTime of the iterator:
+            ?assert(TS >= StartTime, {start_time, TopicFilter, TS, StartTime}),
+            %% Verify that iterator didn't reorder messages
+            %% (timestamps for each topic are growing):
+            LastTopicTs = maps:get(Topic, Acc, -1),
+            ?assert(TS >= LastTopicTs, {topic_ts_reordering, Topic, TS, LastTopicTs}),
+            Acc#{Topic => TS}
+        end,
+        #{},
+        Dump
+    ).
+
+dump_messages(Shard, TopicFilter, StartTime) ->
+    Streams = emqx_ds_storage_layer:get_streams(Shard, parse_topic(TopicFilter), StartTime),
+    lists:flatmap(
+        fun({_Rank, Stream}) ->
+            dump_stream(Shard, Stream, TopicFilter, StartTime)
+        end,
+        Streams
+    ).
+
+dump_stream(Shard, Stream, TopicFilter, StartTime) ->
+    BatchSize = 100,
+    {ok, Iterator} = emqx_ds_storage_layer:make_iterator(
+        Shard, Stream, parse_topic(TopicFilter), StartTime
+    ),
+    Loop = fun
+        F(It, 0) ->
+            error({too_many_iterations, It});
+        F(It, N) ->
+            case emqx_ds_storage_layer:next(Shard, It, BatchSize) of
+                {ok, end_of_stream} ->
+                    [];
+                {ok, _NextIt, []} ->
+                    [];
+                {ok, NextIt, Batch} ->
+                    Batch ++ F(NextIt, N - 1)
+            end
+    end,
+    MaxIterations = 1000000,
+    Loop(Iterator, MaxIterations).
+
+%% t_create_gen(_Config) ->
+%%     {ok, 1} = emqx_ds_storage_layer:create_generation(?SHARD, 5, ?DEFAULT_CONFIG),
+%%     ?assertEqual(
+%%         {error, nonmonotonic},
+%%         emqx_ds_storage_layer:create_generation(?SHARD, 1, ?DEFAULT_CONFIG)
+%%     ),
+%%     ?assertEqual(
+%%         {error, nonmonotonic},
+%%         emqx_ds_storage_layer:create_generation(?SHARD, 5, ?DEFAULT_CONFIG)
+%%     ),
+%%     {ok, 2} = emqx_ds_storage_layer:create_generation(?SHARD, 10, ?COMPACT_CONFIG),
+%%     Topics = ["foo/bar", "foo/bar/baz"],
+%%     Timestamps = lists:seq(1, 100),
+%%     [
+%%         ?assertMatch({ok, [_]}, store(?SHARD, PublishedAt, Topic, <<>>))
+%%      || Topic <- Topics, PublishedAt <- Timestamps
+%%     ].

+%% t_iterate_multigen(_Config) ->
+%%     {ok, 1} = emqx_ds_storage_layer:create_generation(?SHARD, 10, ?COMPACT_CONFIG),
+%%     {ok, 2} = emqx_ds_storage_layer:create_generation(?SHARD, 50, ?DEFAULT_CONFIG),
+%%     {ok, 3} = emqx_ds_storage_layer:create_generation(?SHARD, 1000, ?DEFAULT_CONFIG),
+%%     Topics = ["foo/bar", "foo/bar/baz", "a", "a/bar"],
+%%     Timestamps = lists:seq(1, 100),
+%%     _ = [
+%%         store(?SHARD, PublishedAt, Topic, term_to_binary({Topic, PublishedAt}))
+%%      || Topic <- Topics, PublishedAt <- Timestamps
+%%     ],
+%%     ?assertEqual(
+%%         lists:sort([
+%%             {Topic, PublishedAt}
+%%          || Topic <- ["foo/bar", "foo/bar/baz"], PublishedAt <- Timestamps
+%%         ]),
+%%         lists:sort([binary_to_term(Payload) || Payload <- iterate(?SHARD, "foo/#", 0)])
+%%     ),
+%%     ?assertEqual(
+%%         lists:sort([
+%%             {Topic, PublishedAt}
+%%          || Topic <- ["a", "a/bar"], PublishedAt <- lists:seq(60, 100)
+%%         ]),
+%%         lists:sort([binary_to_term(Payload) || Payload <- iterate(?SHARD, "a/#", 60)])
+%%     ).
+ +%% t_iterate_multigen_preserve_restore(_Config) -> +%% ReplayID = atom_to_binary(?FUNCTION_NAME), +%% {ok, 1} = emqx_ds_storage_layer:create_generation(?SHARD, 10, ?COMPACT_CONFIG), +%% {ok, 2} = emqx_ds_storage_layer:create_generation(?SHARD, 50, ?DEFAULT_CONFIG), +%% {ok, 3} = emqx_ds_storage_layer:create_generation(?SHARD, 100, ?DEFAULT_CONFIG), +%% Topics = ["foo/bar", "foo/bar/baz", "a/bar"], +%% Timestamps = lists:seq(1, 100), +%% TopicFilter = "foo/#", +%% TopicsMatching = ["foo/bar", "foo/bar/baz"], +%% _ = [ +%% store(?SHARD, TS, Topic, term_to_binary({Topic, TS})) +%% || Topic <- Topics, TS <- Timestamps +%% ], +%% It0 = iterator(?SHARD, TopicFilter, 0), +%% {It1, Res10} = iterate(It0, 10), +%% % preserve mid-generation +%% ok = emqx_ds_storage_layer:preserve_iterator(It1, ReplayID), +%% {ok, It2} = emqx_ds_storage_layer:restore_iterator(?SHARD, ReplayID), +%% {It3, Res100} = iterate(It2, 88), +%% % preserve on the generation boundary +%% ok = emqx_ds_storage_layer:preserve_iterator(It3, ReplayID), +%% {ok, It4} = emqx_ds_storage_layer:restore_iterator(?SHARD, ReplayID), +%% {It5, Res200} = iterate(It4, 1000), +%% ?assertEqual({end_of_stream, []}, iterate(It5, 1)), +%% ?assertEqual( +%% lists:sort([{Topic, TS} || Topic <- TopicsMatching, TS <- Timestamps]), +%% lists:sort([binary_to_term(Payload) || Payload <- Res10 ++ Res100 ++ Res200]) +%% ), +%% ?assertEqual( +%% ok, +%% emqx_ds_storage_layer:discard_iterator(?SHARD, ReplayID) +%% ), +%% ?assertEqual( +%% {error, not_found}, +%% emqx_ds_storage_layer:restore_iterator(?SHARD, ReplayID) +%% ). + +make_message(PublishedAt, Topic, Payload) when is_list(Topic) -> + make_message(PublishedAt, list_to_binary(Topic), Payload); +make_message(PublishedAt, Topic, Payload) when is_binary(Topic) -> + ID = emqx_guid:gen(), + #message{ + id = ID, + topic = Topic, + timestamp = PublishedAt, + payload = Payload + }. + +store(Shard, PublishedAt, TopicL, Payload) when is_list(TopicL) -> + store(Shard, PublishedAt, list_to_binary(TopicL), Payload); +store(Shard, PublishedAt, Topic, Payload) -> + ID = emqx_guid:gen(), + Msg = #message{ + id = ID, + topic = Topic, + timestamp = PublishedAt, + payload = Payload + }, + emqx_ds_storage_layer:message_store(Shard, [Msg], #{}). + +payloads(Messages) -> + lists:map( + fun(#message{payload = P}) -> + P + end, + Messages + ). + +parse_topic(Topic = [L | _]) when is_binary(L); is_atom(L) -> + Topic; +parse_topic(Topic) -> + emqx_topic:words(iolist_to_binary(Topic)). + +%% CT callbacks + +all() -> emqx_common_test_helpers:all(?MODULE). +suite() -> [{timetrap, {seconds, 20}}]. + +init_per_suite(Config) -> + {ok, _} = application:ensure_all_started(emqx_durable_storage), + Config. + +end_per_suite(_Config) -> + ok = application:stop(emqx_durable_storage). + +init_per_testcase(TC, Config) -> + {ok, _} = emqx_ds_storage_layer_sup:start_shard(shard(TC), ?DEFAULT_CONFIG), + Config. + +end_per_testcase(TC, _Config) -> + ok = emqx_ds_storage_layer_sup:stop_shard(shard(TC)). + +shard(TC) -> + {?MODULE, TC}. + +keyspace(TC) -> + TC. + +set_keyspace_config(Keyspace, Config) -> + ok = application:set_env(emqx_ds, keyspace_config, #{Keyspace => Config}). 
diff --git a/apps/emqx_durable_storage/test/emqx_ds_storage_layer_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_storage_layer_SUITE.erl
deleted file mode 100644
index 3a872934f..000000000
--- a/apps/emqx_durable_storage/test/emqx_ds_storage_layer_SUITE.erl
+++ /dev/null
@@ -1,282 +0,0 @@
-%%--------------------------------------------------------------------
-%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
-%%--------------------------------------------------------------------
--module(emqx_ds_storage_layer_SUITE).
-
--compile(export_all).
--compile(nowarn_export_all).
-
--include_lib("common_test/include/ct.hrl").
--include_lib("stdlib/include/assert.hrl").
-
--define(SHARD, shard(?FUNCTION_NAME)).
-
--define(DEFAULT_CONFIG,
-    {emqx_ds_message_storage_bitmask, #{
-        timestamp_bits => 64,
-        topic_bits_per_level => [8, 8, 32, 16],
-        epoch => 5,
-        iteration => #{
-            iterator_refresh => {every, 5}
-        }
-    }}
-).
-
--define(COMPACT_CONFIG,
-    {emqx_ds_message_storage_bitmask, #{
-        timestamp_bits => 16,
-        topic_bits_per_level => [16, 16],
-        epoch => 10
-    }}
-).
-
-%% Smoke test for opening and reopening the database
-t_open(_Config) ->
-    ok = emqx_ds_storage_layer_sup:stop_shard(?SHARD),
-    {ok, _} = emqx_ds_storage_layer_sup:start_shard(?SHARD, #{}).
-
-%% Smoke test of store function
-t_store(_Config) ->
-    MessageID = emqx_guid:gen(),
-    PublishedAt = 1000,
-    Topic = [<<"foo">>, <<"bar">>],
-    Payload = <<"message">>,
-    ?assertMatch(ok, emqx_ds_storage_layer:store(?SHARD, MessageID, PublishedAt, Topic, Payload)).
-
-%% Smoke test for iteration through a concrete topic
-t_iterate(_Config) ->
-    %% Prepare data:
-    Topics = [[<<"foo">>, <<"bar">>], [<<"foo">>, <<"bar">>, <<"baz">>], [<<"a">>]],
-    Timestamps = lists:seq(1, 10),
-    [
-        emqx_ds_storage_layer:store(
-            ?SHARD,
-            emqx_guid:gen(),
-            PublishedAt,
-            Topic,
-            integer_to_binary(PublishedAt)
-        )
-     || Topic <- Topics, PublishedAt <- Timestamps
-    ],
-    %% Iterate through individual topics:
-    [
-        begin
-            {ok, It} = emqx_ds_storage_layer:make_iterator(?SHARD, {Topic, 0}),
-            Values = iterate(It),
-            ?assertEqual(lists:map(fun integer_to_binary/1, Timestamps), Values)
-        end
-     || Topic <- Topics
-    ],
-    ok.
-
-%% Smoke test for iteration with wildcard topic filter
-t_iterate_wildcard(_Config) ->
-    %% Prepare data:
-    Topics = ["foo/bar", "foo/bar/baz", "a", "a/bar"],
-    Timestamps = lists:seq(1, 10),
-    _ = [
-        store(?SHARD, PublishedAt, Topic, term_to_binary({Topic, PublishedAt}))
-     || Topic <- Topics, PublishedAt <- Timestamps
-    ],
-    ?assertEqual(
-        lists:sort([{Topic, PublishedAt} || Topic <- Topics, PublishedAt <- Timestamps]),
-        lists:sort([binary_to_term(Payload) || Payload <- iterate(?SHARD, "#", 0)])
-    ),
-    ?assertEqual(
-        [],
-        lists:sort([binary_to_term(Payload) || Payload <- iterate(?SHARD, "#", 10 + 1)])
-    ),
-    ?assertEqual(
-        lists:sort([{Topic, PublishedAt} || Topic <- Topics, PublishedAt <- lists:seq(5, 10)]),
-        lists:sort([binary_to_term(Payload) || Payload <- iterate(?SHARD, "#", 5)])
-    ),
-    ?assertEqual(
-        lists:sort([
-            {Topic, PublishedAt}
-         || Topic <- ["foo/bar", "foo/bar/baz"], PublishedAt <- Timestamps
-        ]),
-        lists:sort([binary_to_term(Payload) || Payload <- iterate(?SHARD, "foo/#", 0)])
-    ),
-    ?assertEqual(
-        lists:sort([{"foo/bar", PublishedAt} || PublishedAt <- Timestamps]),
-        lists:sort([binary_to_term(Payload) || Payload <- iterate(?SHARD, "foo/+", 0)])
-    ),
-    ?assertEqual(
-        [],
-        lists:sort([binary_to_term(Payload) || Payload <- iterate(?SHARD, "foo/+/bar", 0)])
-    ),
-    ?assertEqual(
-        lists:sort([
-            {Topic, PublishedAt}
-         || Topic <- ["foo/bar", "foo/bar/baz", "a/bar"], PublishedAt <- Timestamps
-        ]),
-        lists:sort([binary_to_term(Payload) || Payload <- iterate(?SHARD, "+/bar/#", 0)])
-    ),
-    ?assertEqual(
-        lists:sort([{Topic, PublishedAt} || Topic <- ["a", "a/bar"], PublishedAt <- Timestamps]),
-        lists:sort([binary_to_term(Payload) || Payload <- iterate(?SHARD, "a/#", 0)])
-    ),
-    ?assertEqual(
-        [],
-        lists:sort([binary_to_term(Payload) || Payload <- iterate(?SHARD, "a/+/+", 0)])
-    ),
-    ok.
-
-t_iterate_long_tail_wildcard(_Config) ->
-    Topic = "b/c/d/e/f/g",
-    TopicFilter = "b/c/d/e/+/+",
-    Timestamps = lists:seq(1, 100),
-    _ = [
-        store(?SHARD, PublishedAt, Topic, term_to_binary({Topic, PublishedAt}))
-     || PublishedAt <- Timestamps
-    ],
-    ?assertEqual(
-        lists:sort([{"b/c/d/e/f/g", PublishedAt} || PublishedAt <- lists:seq(50, 100)]),
-        lists:sort([binary_to_term(Payload) || Payload <- iterate(?SHARD, TopicFilter, 50)])
-    ).
-
-t_create_gen(_Config) ->
-    {ok, 1} = emqx_ds_storage_layer:create_generation(?SHARD, 5, ?DEFAULT_CONFIG),
-    ?assertEqual(
-        {error, nonmonotonic},
-        emqx_ds_storage_layer:create_generation(?SHARD, 1, ?DEFAULT_CONFIG)
-    ),
-    ?assertEqual(
-        {error, nonmonotonic},
-        emqx_ds_storage_layer:create_generation(?SHARD, 5, ?DEFAULT_CONFIG)
-    ),
-    {ok, 2} = emqx_ds_storage_layer:create_generation(?SHARD, 10, ?COMPACT_CONFIG),
-    Topics = ["foo/bar", "foo/bar/baz"],
-    Timestamps = lists:seq(1, 100),
-    [
-        ?assertEqual(ok, store(?SHARD, PublishedAt, Topic, <<>>))
-     || Topic <- Topics, PublishedAt <- Timestamps
-    ].
-
-t_iterate_multigen(_Config) ->
-    {ok, 1} = emqx_ds_storage_layer:create_generation(?SHARD, 10, ?COMPACT_CONFIG),
-    {ok, 2} = emqx_ds_storage_layer:create_generation(?SHARD, 50, ?DEFAULT_CONFIG),
-    {ok, 3} = emqx_ds_storage_layer:create_generation(?SHARD, 1000, ?DEFAULT_CONFIG),
-    Topics = ["foo/bar", "foo/bar/baz", "a", "a/bar"],
-    Timestamps = lists:seq(1, 100),
-    _ = [
-        store(?SHARD, PublishedAt, Topic, term_to_binary({Topic, PublishedAt}))
-     || Topic <- Topics, PublishedAt <- Timestamps
-    ],
-    ?assertEqual(
-        lists:sort([
-            {Topic, PublishedAt}
-         || Topic <- ["foo/bar", "foo/bar/baz"], PublishedAt <- Timestamps
-        ]),
-        lists:sort([binary_to_term(Payload) || Payload <- iterate(?SHARD, "foo/#", 0)])
-    ),
-    ?assertEqual(
-        lists:sort([
-            {Topic, PublishedAt}
-         || Topic <- ["a", "a/bar"], PublishedAt <- lists:seq(60, 100)
-        ]),
-        lists:sort([binary_to_term(Payload) || Payload <- iterate(?SHARD, "a/#", 60)])
-    ).
-
-t_iterate_multigen_preserve_restore(_Config) ->
-    ReplayID = atom_to_binary(?FUNCTION_NAME),
-    {ok, 1} = emqx_ds_storage_layer:create_generation(?SHARD, 10, ?COMPACT_CONFIG),
-    {ok, 2} = emqx_ds_storage_layer:create_generation(?SHARD, 50, ?DEFAULT_CONFIG),
-    {ok, 3} = emqx_ds_storage_layer:create_generation(?SHARD, 100, ?DEFAULT_CONFIG),
-    Topics = ["foo/bar", "foo/bar/baz", "a/bar"],
-    Timestamps = lists:seq(1, 100),
-    TopicFilter = "foo/#",
-    TopicsMatching = ["foo/bar", "foo/bar/baz"],
-    _ = [
-        store(?SHARD, TS, Topic, term_to_binary({Topic, TS}))
-     || Topic <- Topics, TS <- Timestamps
-    ],
-    It0 = iterator(?SHARD, TopicFilter, 0),
-    {It1, Res10} = iterate(It0, 10),
-    % preserve mid-generation
-    ok = emqx_ds_storage_layer:preserve_iterator(It1, ReplayID),
-    {ok, It2} = emqx_ds_storage_layer:restore_iterator(?SHARD, ReplayID),
-    {It3, Res100} = iterate(It2, 88),
-    % preserve on the generation boundary
-    ok = emqx_ds_storage_layer:preserve_iterator(It3, ReplayID),
-    {ok, It4} = emqx_ds_storage_layer:restore_iterator(?SHARD, ReplayID),
-    {It5, Res200} = iterate(It4, 1000),
-    ?assertEqual(none, It5),
-    ?assertEqual(
-        lists:sort([{Topic, TS} || Topic <- TopicsMatching, TS <- Timestamps]),
-        lists:sort([binary_to_term(Payload) || Payload <- Res10 ++ Res100 ++ Res200])
-    ),
-    ?assertEqual(
-        ok,
-        emqx_ds_storage_layer:discard_iterator(?SHARD, ReplayID)
-    ),
-    ?assertEqual(
-        {error, not_found},
-        emqx_ds_storage_layer:restore_iterator(?SHARD, ReplayID)
-    ).
-
-store(Shard, PublishedAt, Topic, Payload) ->
-    ID = emqx_guid:gen(),
-    emqx_ds_storage_layer:store(Shard, ID, PublishedAt, parse_topic(Topic), Payload).
-
-iterate(DB, TopicFilter, StartTime) ->
-    iterate(iterator(DB, TopicFilter, StartTime)).
-
-iterate(It) ->
-    case emqx_ds_storage_layer:next(It) of
-        {value, Payload, ItNext} ->
-            [Payload | iterate(ItNext)];
-        none ->
-            []
-    end.
-
-iterate(It, 0) ->
-    {It, []};
-iterate(It, N) ->
-    case emqx_ds_storage_layer:next(It) of
-        {value, Payload, ItNext} ->
-            {ItFinal, Ps} = iterate(ItNext, N - 1),
-            {ItFinal, [Payload | Ps]};
-        none ->
-            {none, []}
-    end.
-
-iterator(DB, TopicFilter, StartTime) ->
-    {ok, It} = emqx_ds_storage_layer:make_iterator(DB, {parse_topic(TopicFilter), StartTime}),
-    It.
-
-parse_topic(Topic = [L | _]) when is_binary(L); is_atom(L) ->
-    Topic;
-parse_topic(Topic) ->
-    emqx_topic:words(iolist_to_binary(Topic)).
-
-%% CT callbacks
-
-all() -> emqx_common_test_helpers:all(?MODULE).
-
-init_per_suite(Config) ->
-    {ok, _} = application:ensure_all_started(emqx_durable_storage),
-    Config.
-
-end_per_suite(_Config) ->
-    ok = application:stop(emqx_durable_storage).
-
-init_per_testcase(TC, Config) ->
-    ok = set_keyspace_config(keyspace(TC), ?DEFAULT_CONFIG),
-    {ok, _} = emqx_ds_storage_layer_sup:start_shard(shard(TC), #{}),
-    Config.
-
-end_per_testcase(TC, _Config) ->
-    ok = emqx_ds_storage_layer_sup:stop_shard(shard(TC)).
-
-keyspace(TC) ->
-    list_to_atom(lists:concat([?MODULE, "_", TC])).
-
-shard_id(_TC) ->
-    <<"shard">>.
-
-shard(TC) ->
-    {keyspace(TC), shard_id(TC)}.
-
-set_keyspace_config(Keyspace, Config) ->
-    ok = application:set_env(emqx_ds, keyspace_config, #{Keyspace => Config}).
diff --git a/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl b/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl
index e9daf2581..9b5af9428 100644
--- a/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl
+++ b/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl
@@ -4,9 +4,11 @@
 
 -module(emqx_ds_message_storage_bitmask_shim).
 
+-include_lib("emqx/include/emqx.hrl").
+
 -export([open/0]).
 -export([close/1]).
--export([store/5]).
+-export([store/2]).
 -export([iterate/2]).
 
 -type topic() :: list(binary()).
@@ -25,20 +27,21 @@ close(Tab) ->
     true = ets:delete(Tab),
     ok.
 
--spec store(t(), emqx_guid:guid(), time(), topic(), binary()) ->
+-spec store(t(), emqx_types:message()) ->
     ok | {error, _TODO}.
-store(Tab, MessageID, PublishedAt, Topic, Payload) ->
-    true = ets:insert(Tab, {{PublishedAt, MessageID}, Topic, Payload}),
+store(Tab, Msg = #message{id = MessageID, timestamp = PublishedAt}) ->
+    true = ets:insert(Tab, {{PublishedAt, MessageID}, Msg}),
     ok.
 
--spec iterate(t(), emqx_ds:replay()) -> [binary()].
-iterate(Tab, {TopicFilter, StartTime}) ->
+-spec iterate(t(), emqx_ds:replay()) -> [emqx_types:message()].
+iterate(Tab, {TopicFilter0, StartTime}) ->
+    TopicFilter = iolist_to_binary(lists:join("/", TopicFilter0)),
     ets:foldr(
-        fun({{PublishedAt, _}, Topic, Payload}, Acc) ->
+        fun({{PublishedAt, _}, Msg = #message{topic = Topic}}, Acc) ->
             case emqx_topic:match(Topic, TopicFilter) of
                 true when PublishedAt >= StartTime ->
-                    [Payload | Acc];
+                    [Msg | Acc];
                 _ ->
                     Acc
             end
diff --git a/apps/emqx_durable_storage/test/props/prop_replay_message_storage.erl b/apps/emqx_durable_storage/test/props/prop_replay_message_storage.erl
deleted file mode 100644
index f9964bebe..000000000
--- a/apps/emqx_durable_storage/test/props/prop_replay_message_storage.erl
+++ /dev/null
@@ -1,466 +0,0 @@
-%%--------------------------------------------------------------------
-%% Copyright (c) 2020-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
-%%--------------------------------------------------------------------
-
--module(prop_replay_message_storage).
-
--include_lib("proper/include/proper.hrl").
--include_lib("eunit/include/eunit.hrl").
-
--define(WORK_DIR, ["_build", "test"]).
--define(RUN_ID, {?MODULE, testrun_id}).
-
--define(KEYSPACE, ?MODULE).
--define(SHARD_ID, <<"shard">>).
--define(SHARD, {?KEYSPACE, ?SHARD_ID}).
--define(GEN_ID, 42).
-
-%%--------------------------------------------------------------------
-%% Properties
-%%--------------------------------------------------------------------
-
-prop_bitstring_computes() ->
-    ?FORALL(
-        Keymapper,
-        keymapper(),
-        ?FORALL({Topic, Timestamp}, {topic(), integer()}, begin
-            BS = emqx_ds_message_storage_bitmask:compute_bitstring(Topic, Timestamp, Keymapper),
-            is_integer(BS) andalso (BS < (1 bsl get_keymapper_bitsize(Keymapper)))
-        end)
-    ).
-
-prop_topic_bitmask_computes() ->
    Keymapper = make_keymapper(16, [8, 12, 16], 100),
-    ?FORALL(TopicFilter, topic_filter(), begin
-        Mask = emqx_ds_message_storage_bitmask:compute_topic_bitmask(TopicFilter, Keymapper),
-        % topic bits + timestamp LSBs
-        is_integer(Mask) andalso (Mask < (1 bsl (36 + 6)))
-    end).
-
-prop_next_seek_monotonic() ->
-    ?FORALL(
-        {TopicFilter, StartTime, Keymapper},
-        {topic_filter(), pos_integer(), keymapper()},
-        begin
-            Filter = emqx_ds_message_storage_bitmask:make_keyspace_filter(
-                {TopicFilter, StartTime},
-                Keymapper
-            ),
-            ?FORALL(
-                Bitstring,
-                bitstr(get_keymapper_bitsize(Keymapper)),
-                emqx_ds_message_storage_bitmask:compute_next_seek(Bitstring, Filter) >= Bitstring
-            )
-        end
-    ).
-
-prop_next_seek_eq_initial_seek() ->
-    ?FORALL(
-        Filter,
-        keyspace_filter(),
-        emqx_ds_message_storage_bitmask:compute_initial_seek(Filter) =:=
-            emqx_ds_message_storage_bitmask:compute_next_seek(0, Filter)
-    ).
-
-prop_iterate_messages() ->
-    TBPL = [4, 8, 12],
-    Options = #{
-        timestamp_bits => 32,
-        topic_bits_per_level => TBPL,
-        epoch => 200
-    },
-    % TODO
-    % Shrinking is too unpredictable and leaves a LOT of garbage in the scratch dit.
-    ?FORALL(Stream, noshrink(non_empty(messages(topic(TBPL)))), begin
-        Filepath = make_filepath(?FUNCTION_NAME, erlang:system_time(microsecond)),
-        {DB, Handle} = open_db(Filepath, Options),
-        Shim = emqx_ds_message_storage_bitmask_shim:open(),
-        ok = store_db(DB, Stream),
-        ok = store_shim(Shim, Stream),
-        ?FORALL(
-            {
-                {Topic, _},
-                Pattern,
-                StartTime
-            },
-            {
-                nth(Stream),
-                topic_filter_pattern(),
-                start_time()
-            },
-            begin
-                TopicFilter = make_topic_filter(Pattern, Topic),
-                Iteration = {TopicFilter, StartTime},
-                Messages = iterate_db(DB, Iteration),
-                Reference = iterate_shim(Shim, Iteration),
-                ok = close_db(Handle),
-                ok = emqx_ds_message_storage_bitmask_shim:close(Shim),
-                ?WHENFAIL(
-                    begin
-                        io:format(user, " *** Filepath = ~s~n", [Filepath]),
-                        io:format(user, " *** TopicFilter = ~p~n", [TopicFilter]),
-                        io:format(user, " *** StartTime = ~p~n", [StartTime])
-                    end,
-                    is_list(Messages) andalso equals(Messages -- Reference, Reference -- Messages)
-                )
-            end
-        )
-    end).
-
-prop_iterate_eq_iterate_with_preserve_restore() ->
-    TBPL = [4, 8, 16, 12],
-    Options = #{
-        timestamp_bits => 32,
-        topic_bits_per_level => TBPL,
-        epoch => 500
-    },
-    {DB, _Handle} = open_db(make_filepath(?FUNCTION_NAME), Options),
-    ?FORALL(Stream, non_empty(messages(topic(TBPL))), begin
-        % TODO
-        % This proptest is impure because messages from testruns assumed to be
-        % independent of each other are accumulated in the same storage. This
-        % would probably confuse shrinker in the event a testrun fails.
-        ok = store_db(DB, Stream),
-        ?FORALL(
-            {
-                {Topic, _},
-                Pat,
-                StartTime,
-                Commands
-            },
-            {
-                nth(Stream),
-                topic_filter_pattern(),
-                start_time(),
-                shuffled(flat([non_empty(list({preserve, restore})), list(iterate)]))
-            },
-            begin
-                Replay = {make_topic_filter(Pat, Topic), StartTime},
-                Iterator = make_iterator(DB, Replay),
-                Ctx = #{db => DB, replay => Replay},
-                Messages = run_iterator_commands(Commands, Iterator, Ctx),
-                equals(Messages, iterate_db(DB, Replay))
-            end
-        )
-    end).
-
-prop_iterate_eq_iterate_with_refresh() ->
-    TBPL = [4, 8, 16, 12],
-    Options = #{
-        timestamp_bits => 32,
-        topic_bits_per_level => TBPL,
-        epoch => 500
-    },
-    {DB, _Handle} = open_db(make_filepath(?FUNCTION_NAME), Options),
-    ?FORALL(Stream, non_empty(messages(topic(TBPL))), begin
-        % TODO
-        % This proptest is also impure, see above.
-        ok = store_db(DB, Stream),
-        ?FORALL(
-            {
-                {Topic, _},
-                Pat,
-                StartTime,
-                RefreshEvery
-            },
-            {
-                nth(Stream),
-                topic_filter_pattern(),
-                start_time(),
-                pos_integer()
-            },
-            ?TIMEOUT(5000, begin
-                Replay = {make_topic_filter(Pat, Topic), StartTime},
-                IterationOptions = #{iterator_refresh => {every, RefreshEvery}},
-                Iterator = make_iterator(DB, Replay, IterationOptions),
-                Messages = iterate_db(Iterator),
-                equals(Messages, iterate_db(DB, Replay))
-            end)
-        )
-    end).
-
-% store_message_stream(DB, [{Topic, {Payload, ChunkNum, _ChunkCount}} | Rest]) ->
-%     MessageID = emqx_guid:gen(),
-%     PublishedAt = ChunkNum,
-%     MessageID, PublishedAt, Topic
-%     ]),
-%     ok = emqx_ds_message_storage_bitmask:store(DB, MessageID, PublishedAt, Topic, Payload),
-%     store_message_stream(DB, payload_gen:next(Rest));
-% store_message_stream(_Zone, []) ->
-%     ok.
-
-store_db(DB, Messages) ->
-    lists:foreach(
-        fun({Topic, Payload = {MessageID, Timestamp, _}}) ->
-            Bin = term_to_binary(Payload),
-            emqx_ds_message_storage_bitmask:store(DB, MessageID, Timestamp, Topic, Bin)
-        end,
-        Messages
-    ).
-
-iterate_db(DB, Iteration) ->
-    iterate_db(make_iterator(DB, Iteration)).
-
-iterate_db(It) ->
-    case emqx_ds_message_storage_bitmask:next(It) of
-        {value, Payload, ItNext} ->
-            [binary_to_term(Payload) | iterate_db(ItNext)];
-        none ->
-            []
-    end.
-
-make_iterator(DB, Replay) ->
-    {ok, It} = emqx_ds_message_storage_bitmask:make_iterator(DB, Replay),
-    It.
-
-make_iterator(DB, Replay, Options) ->
-    {ok, It} = emqx_ds_message_storage_bitmask:make_iterator(DB, Replay, Options),
-    It.
-
-run_iterator_commands([iterate | Rest], It, Ctx) ->
-    case emqx_ds_message_storage_bitmask:next(It) of
-        {value, Payload, ItNext} ->
-            [binary_to_term(Payload) | run_iterator_commands(Rest, ItNext, Ctx)];
-        none ->
-            []
-    end;
-run_iterator_commands([{preserve, restore} | Rest], It, Ctx) ->
-    #{
-        db := DB,
-        replay := Replay
-    } = Ctx,
-    Serial = emqx_ds_message_storage_bitmask:preserve_iterator(It),
-    {ok, ItNext} = emqx_ds_message_storage_bitmask:restore_iterator(DB, Replay, Serial),
-    run_iterator_commands(Rest, ItNext, Ctx);
-run_iterator_commands([], It, _Ctx) ->
-    iterate_db(It).
-
-store_shim(Shim, Messages) ->
-    lists:foreach(
-        fun({Topic, Payload = {MessageID, Timestamp, _}}) ->
-            Bin = term_to_binary(Payload),
-            emqx_ds_message_storage_bitmask_shim:store(Shim, MessageID, Timestamp, Topic, Bin)
-        end,
-        Messages
-    ).
-
-iterate_shim(Shim, Iteration) ->
-    lists:map(
-        fun binary_to_term/1,
-        emqx_ds_message_storage_bitmask_shim:iterate(Shim, Iteration)
-    ).
-
-%%--------------------------------------------------------------------
-%% Setup / teardown
-%%--------------------------------------------------------------------
-
-open_db(Filepath, Options) ->
-    {ok, Handle} = rocksdb:open(Filepath, [{create_if_missing, true}]),
-    {Schema, CFRefs} = emqx_ds_message_storage_bitmask:create_new(Handle, ?GEN_ID, Options),
-    DB = emqx_ds_message_storage_bitmask:open(?SHARD, Handle, ?GEN_ID, CFRefs, Schema),
-    {DB, Handle}.
-
-close_db(Handle) ->
-    rocksdb:close(Handle).
-
-make_filepath(TC) ->
-    make_filepath(TC, 0).
-
-make_filepath(TC, InstID) ->
-    Name = io_lib:format("~0p.~0p", [TC, InstID]),
-    Path = filename:join(?WORK_DIR ++ ["proper", "runs", get_run_id(), ?MODULE_STRING, Name]),
-    ok = filelib:ensure_dir(Path),
-    Path.
-
-get_run_id() ->
-    case persistent_term:get(?RUN_ID, undefined) of
-        RunID when RunID /= undefined ->
-            RunID;
-        undefined ->
-            RunID = make_run_id(),
-            ok = persistent_term:put(?RUN_ID, RunID),
-            RunID
-    end.
-
-make_run_id() ->
-    calendar:system_time_to_rfc3339(erlang:system_time(second), [{offset, "Z"}]).
-
-%%--------------------------------------------------------------------
-%% Type generators
-%%--------------------------------------------------------------------
-
-topic() ->
-    non_empty(list(topic_level())).
-
-topic(EntropyWeights) ->
-    ?LET(L, scaled(1 / 4, list(1)), begin
-        EWs = lists:sublist(EntropyWeights ++ L, length(L)),
-        ?SIZED(S, [oneof([topic_level(S * EW), topic_level_fixed()]) || EW <- EWs])
-    end).
-
-topic_filter() ->
-    ?SUCHTHAT(
-        L,
-        non_empty(
-            list(
-                frequency([
-                    {5, topic_level()},
-                    {2, '+'},
-                    {1, '#'}
-                ])
-            )
-        ),
-        not lists:member('#', L) orelse lists:last(L) == '#'
-    ).
-
-topic_level_pattern() ->
-    frequency([
-        {5, level},
-        {2, '+'},
-        {1, '#'}
-    ]).
-
-topic_filter_pattern() ->
-    list(topic_level_pattern()).
-
-topic_filter(Topic) ->
-    ?LET({T, Pat}, {Topic, topic_filter_pattern()}, make_topic_filter(Pat, T)).
-
-make_topic_filter([], _) ->
-    [];
-make_topic_filter(_, []) ->
-    [];
-make_topic_filter(['#' | _], _) ->
-    ['#'];
-make_topic_filter(['+' | Rest], [_ | Levels]) ->
-    ['+' | make_topic_filter(Rest, Levels)];
-make_topic_filter([level | Rest], [L | Levels]) ->
-    [L | make_topic_filter(Rest, Levels)].
-
-% topic() ->
-%     ?LAZY(?SIZED(S, frequency([
-%         {S, [topic_level() | topic()]},
-%         {1, []}
-%     ]))).
-
-% topic_filter() ->
-%     ?LAZY(?SIZED(S, frequency([
-%         {round(S / 3 * 2), [topic_level() | topic_filter()]},
-%         {round(S / 3 * 1), ['+' | topic_filter()]},
-%         {1, []},
-%         {1, ['#']}
-%     ]))).
-
-topic_level() ->
-    ?LET(L, list(oneof([range($a, $z), range($0, $9)])), iolist_to_binary(L)).
-
-topic_level(Entropy) ->
-    S = floor(1 + math:log2(Entropy) / 4),
-    ?LET(I, range(1, Entropy), iolist_to_binary(io_lib:format("~*.16.0B", [S, I]))).
-
-topic_level_fixed() ->
-    oneof([
-        <<"foo">>,
-        <<"bar">>,
-        <<"baz">>,
-        <<"xyzzy">>
-    ]).
-
-keymapper() ->
-    ?LET(
-        {TimestampBits, TopicBits, Epoch},
-        {
-            range(0, 128),
-            non_empty(list(range(1, 32))),
-            pos_integer()
-        },
-        make_keymapper(TimestampBits, TopicBits, Epoch * 100)
-    ).
-
-keyspace_filter() ->
-    ?LET(
-        {TopicFilter, StartTime, Keymapper},
-        {topic_filter(), pos_integer(), keymapper()},
-        emqx_ds_message_storage_bitmask:make_keyspace_filter({TopicFilter, StartTime}, Keymapper)
-    ).
-
-messages(Topic) ->
-    ?LET(
-        Ts,
-        list(Topic),
-        interleaved(
-            ?LET(Messages, vector(length(Ts), scaled(4, list(message()))), lists:zip(Ts, Messages))
-        )
-    ).
-
-message() ->
-    ?LET({Timestamp, Payload}, {timestamp(), binary()}, {emqx_guid:gen(), Timestamp, Payload}).
-
-message_streams(Topic) ->
-    ?LET(Topics, list(Topic), [{T, payload_gen:binary_stream_gen(64)} || T <- Topics]).
-
-timestamp() ->
-    scaled(20, pos_integer()).
-
-start_time() ->
-    scaled(10, pos_integer()).
-
-bitstr(Size) ->
-    ?LET(B, binary(1 + (Size div 8)), binary:decode_unsigned(B) band (1 bsl Size - 1)).
-
-nth(L) ->
-    ?LET(I, range(1, length(L)), lists:nth(I, L)).
-
-scaled(Factor, T) ->
-    ?SIZED(S, resize(ceil(S * Factor), T)).
-
-interleaved(T) ->
-    ?LET({L, Seed}, {T, integer()}, interleave(L, rand:seed_s(exsss, Seed))).
-
-shuffled(T) ->
-    ?LET({L, Seed}, {T, integer()}, shuffle(L, rand:seed_s(exsss, Seed))).
-
-flat(T) ->
-    ?LET(L, T, lists:flatten(L)).
-
-%%--------------------------------------------------------------------
-%% Internal functions
-%%--------------------------------------------------------------------
-
-make_keymapper(TimestampBits, TopicBits, MaxEpoch) ->
-    emqx_ds_message_storage_bitmask:make_keymapper(#{
-        timestamp_bits => TimestampBits,
-        topic_bits_per_level => TopicBits,
-        epoch => MaxEpoch
-    }).
-
-get_keymapper_bitsize(Keymapper) ->
-    maps:get(bitsize, emqx_ds_message_storage_bitmask:keymapper_info(Keymapper)).
-
--spec interleave(list({Tag, list(E)}), rand:state()) -> list({Tag, E}).
-interleave(Seqs, Rng) ->
-    interleave(Seqs, length(Seqs), Rng).
-
-interleave(Seqs, L, Rng) when L > 0 ->
-    {N, RngNext} = rand:uniform_s(L, Rng),
-    {SeqHead, SeqTail} = lists:split(N - 1, Seqs),
-    case SeqTail of
-        [{Tag, [M | Rest]} | SeqRest] ->
-            [{Tag, M} | interleave(SeqHead ++ [{Tag, Rest} | SeqRest], L, RngNext)];
-        [{_, []} | SeqRest] ->
-            interleave(SeqHead ++ SeqRest, L - 1, RngNext)
-    end;
-interleave([], 0, _) ->
-    [].
-
--spec shuffle(list(E), rand:state()) -> list(E).
-shuffle(L, Rng) ->
-    {Rands, _} = randoms(length(L), Rng),
-    [E || {_, E} <- lists:sort(lists:zip(Rands, L))].
-
-randoms(N, Rng) when N > 0 ->
-    {Rand, RngNext} = rand:uniform_s(Rng),
-    {Tail, RngFinal} = randoms(N - 1, RngNext),
-    {[Rand | Tail], RngFinal};
-randoms(_, Rng) ->
-    {[], Rng}.
diff --git a/apps/emqx_utils/include/bpapi.hrl b/apps/emqx_utils/include/bpapi.hrl
new file mode 100644
index 000000000..1373e0381
--- /dev/null
+++ b/apps/emqx_utils/include/bpapi.hrl
@@ -0,0 +1,22 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2017-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-ifndef(EMQX_BPAPI_HRL).
+-define(EMQX_BPAPI_HRL, true).
+
+-compile({parse_transform, emqx_bpapi_trans}).
+
+-endif.
diff --git a/apps/emqx_utils/include/emqx_message.hrl b/apps/emqx_utils/include/emqx_message.hrl
new file mode 100644
index 000000000..a0d196fa9
--- /dev/null
+++ b/apps/emqx_utils/include/emqx_message.hrl
@@ -0,0 +1,43 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+-ifndef(EMQX_MESSAGE_HRL).
+-define(EMQX_MESSAGE_HRL, true).
+
+%% See 'Application Message' in MQTT Version 5.0
+-record(message, {
+    %% Globally unique message ID
+    id :: binary(),
+    %% Message QoS
+    qos = 0,
+    %% Message from
+    from :: atom() | binary(),
+    %% Message flags
+    flags = #{} :: emqx_types:flags(),
+    %% Message headers. May contain any metadata, e.g. the
+    %% protocol version number, username, peerhost or
+    %% the PUBLISH properties (MQTT 5.0).
+    headers = #{} :: emqx_types:headers(),
+    %% Topic that the message is published to
+    topic :: emqx_types:topic(),
+    %% Message Payload
+    payload :: emqx_types:payload(),
+    %% Timestamp (Unit: millisecond)
+    timestamp :: integer(),
+    %% Not used so far; reserved for future extension
+    extra = [] :: term()
+}).
+
+-endif.
diff --git a/apps/emqx/src/bpapi/emqx_bpapi_trans.erl b/apps/emqx_utils/src/bpapi/emqx_bpapi_trans.erl
similarity index 100%
rename from apps/emqx/src/bpapi/emqx_bpapi_trans.erl
rename to apps/emqx_utils/src/bpapi/emqx_bpapi_trans.erl
diff --git a/rebar.config b/rebar.config
index d2512ece3..c7101abc3 100644
--- a/rebar.config
+++ b/rebar.config
@@ -106,6 +106,10 @@
     emqx_exproto_pb % generated code for protobuf
 ]}.
 
+{eunit_opts,
+    [ verbose
+    ]}.
+
 {project_plugins, [
     erlfmt,
     {rebar3_hex, "7.0.2"},