Merge pull request #13370 from ieQu1/dev/skip-streams

New durable storage layout with explicit index for LTS wildcards
ieQu1 2024-07-09 20:27:21 +02:00 committed by GitHub
commit 92dc059908
24 changed files with 2477 additions and 268 deletions

View File

@ -234,6 +234,42 @@ fields(layout_builtin_wildcard_optimized) ->
}
)}
];
fields(layout_builtin_wildcard_optimized_v2) ->
[
{type,
sc(
wildcard_optimized_v2,
#{
'readOnly' => true,
default => wildcard_optimized_v2,
desc => ?DESC(layout_builtin_wildcard_optimized_type)
}
)},
{bytes_per_topic_level,
sc(
range(1, 16),
#{
default => 8,
importance => ?IMPORTANCE_HIDDEN
}
)},
{topic_index_bytes,
sc(
pos_integer(),
#{
default => 8,
importance => ?IMPORTANCE_HIDDEN
}
)},
{serialization_schema,
sc(
emqx_ds_msg_serializer:schema(),
#{
default => v1,
importance => ?IMPORTANCE_HIDDEN
}
)}
];
fields(layout_builtin_reference) ->
[
{type,
@ -242,6 +278,7 @@ fields(layout_builtin_reference) ->
#{
'readOnly' => true,
importance => ?IMPORTANCE_LOW,
default => reference,
desc => ?DESC(layout_builtin_reference_type)
}
)}
@ -284,7 +321,7 @@ common_builtin_fields() ->
importance => ?IMPORTANCE_MEDIUM,
default =>
#{
<<"type">> => wildcard_optimized
<<"type">> => wildcard_optimized_v2
}
}
)}
@ -298,6 +335,8 @@ desc(builtin_write_buffer) ->
?DESC(builtin_write_buffer);
desc(layout_builtin_wildcard_optimized) ->
?DESC(layout_builtin_wildcard_optimized);
desc(layout_builtin_wildcard_optimized_v2) ->
?DESC(layout_builtin_wildcard_optimized);
desc(layout_builtin_reference) ->
?DESC(layout_builtin_reference);
desc(_) ->
@ -307,6 +346,19 @@ desc(_) ->
%% Internal functions
%%================================================================================
translate_layout(
#{
type := wildcard_optimized_v2,
bytes_per_topic_level := BytesPerTopicLevel,
topic_index_bytes := TopicIndexBytes,
serialization_schema := SSchema
}
) ->
{emqx_ds_storage_skipstream_lts, #{
wildcard_hash_bytes => BytesPerTopicLevel,
topic_index_bytes => TopicIndexBytes,
serialization_schema => SSchema
}};
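%% For example (values are the schema defaults shown above; illustrative):
%%   translate_layout(#{type => wildcard_optimized_v2,
%%                      bytes_per_topic_level => 8,
%%                      topic_index_bytes => 8,
%%                      serialization_schema => v1})
%% returns
%%   {emqx_ds_storage_skipstream_lts,
%%    #{wildcard_hash_bytes => 8, topic_index_bytes => 8,
%%      serialization_schema => v1}}.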
translate_layout(
#{
type := wildcard_optimized,
@ -336,7 +388,11 @@ builtin_layouts() ->
%% suitable for production use. However, it's very simple and
%% produces a very predictable replay order, which can be useful
%% for testing and debugging:
[ref(layout_builtin_wildcard_optimized), ref(layout_builtin_reference)].
[
ref(layout_builtin_wildcard_optimized_v2),
ref(layout_builtin_wildcard_optimized),
ref(layout_builtin_reference)
].
sc(Type, Meta) -> hoconsc:mk(Type, Meta).

View File

@ -64,6 +64,7 @@
-export([work_dir/1]).
-export([work_dir/2]).
-export([clean_work_dir/1]).
-export([load_apps/1]).
-export([start_apps/2]).
@ -162,6 +163,7 @@ start(Apps, SuiteOpts = #{work_dir := WorkDir}) ->
% 4. Setup isolated mnesia directory
ok = emqx_common_test_helpers:load(mnesia),
ok = application:set_env(mnesia, dir, filename:join([WorkDir, mnesia])),
ok = application:set_env(emqx_durable_storage, db_data_dir, filename:join([WorkDir, ds])),
% 5. Start ekka separately.
% For some reason it's designed to be started in non-regular way, so we have to track
% applications started in the process manually.
@ -432,6 +434,16 @@ work_dir(TCName, CTConfig) ->
WorkDir = work_dir(CTConfig),
filename:join(WorkDir, TCName).
%% @doc Delete contents of the workdir.
clean_work_dir(WorkDir) ->
ct:pal("Cleaning workdir ~p", [WorkDir]),
case re:run(WorkDir, "./_build/test/logs/") of
{match, _} ->
file:del_dir_r(WorkDir);
nomatch ->
error({unsafe_workdir, WorkDir})
end.
%%
start_ekka() ->

View File

@ -599,6 +599,9 @@ init_per_suite(Config) ->
end_per_suite(_Config) ->
ok.
suite() ->
[{timetrap, 50_000}].
init_per_testcase(TC, Config) ->
Apps = emqx_cth_suite:start(
[emqx_durable_storage, emqx_ds_backends],

View File

@ -46,6 +46,12 @@
shard_of_message/4
]).
%% Internal exports:
-export([
do_next/3,
do_delete_next/4
]).
-export_type([db_opts/0, shard/0, iterator/0, delete_iterator/0]).
-include_lib("emqx_utils/include/emqx_message.hrl").
@ -295,19 +301,8 @@ update_iterator(DB, Iter0 = #{?tag := ?IT, ?shard := Shard, ?enc := StorageIter0
end.
-spec next(emqx_ds:db(), iterator(), pos_integer()) -> emqx_ds:next_result(iterator()).
next(DB, Iter0 = #{?tag := ?IT, ?shard := Shard, ?enc := StorageIter0}, N) ->
ShardId = {DB, Shard},
T0 = erlang:monotonic_time(microsecond),
Result = emqx_ds_storage_layer:next(ShardId, StorageIter0, N, current_timestamp(ShardId)),
T1 = erlang:monotonic_time(microsecond),
emqx_ds_builtin_metrics:observe_next_time(DB, T1 - T0),
case Result of
{ok, StorageIter, Batch} ->
Iter = Iter0#{?enc := StorageIter},
{ok, Iter, Batch};
Other ->
Other
end.
next(DB, Iter, N) ->
with_worker(do_next, [DB, Iter, N]).
-spec get_delete_streams(emqx_ds:db(), emqx_ds:topic_filter(), emqx_ds:time()) ->
[emqx_ds:ds_specific_delete_stream()].
@ -347,7 +342,36 @@ make_delete_iterator(DB, ?delete_stream(Shard, InnerStream), TopicFilter, StartT
-spec delete_next(emqx_ds:db(), delete_iterator(), emqx_ds:delete_selector(), pos_integer()) ->
emqx_ds:delete_next_result(emqx_ds:delete_iterator()).
delete_next(DB, Iter = #{?tag := ?DELETE_IT, ?shard := Shard, ?enc := StorageIter0}, Selector, N) ->
delete_next(DB, Iter, Selector, N) ->
with_worker(do_delete_next, [DB, Iter, Selector, N]).
%%================================================================================
%% Internal exports
%%================================================================================
current_timestamp(ShardId) ->
emqx_ds_builtin_local_meta:current_timestamp(ShardId).
-spec do_next(emqx_ds:db(), iterator(), pos_integer()) -> emqx_ds:next_result(iterator()).
do_next(DB, Iter0 = #{?tag := ?IT, ?shard := Shard, ?enc := StorageIter0}, N) ->
ShardId = {DB, Shard},
T0 = erlang:monotonic_time(microsecond),
Result = emqx_ds_storage_layer:next(ShardId, StorageIter0, N, current_timestamp(ShardId)),
T1 = erlang:monotonic_time(microsecond),
emqx_ds_builtin_metrics:observe_next_time(DB, T1 - T0),
case Result of
{ok, StorageIter, Batch} ->
Iter = Iter0#{?enc := StorageIter},
{ok, Iter, Batch};
Other ->
Other
end.
-spec do_delete_next(emqx_ds:db(), delete_iterator(), emqx_ds:delete_selector(), pos_integer()) ->
emqx_ds:delete_next_result(emqx_ds:delete_iterator()).
do_delete_next(
DB, Iter = #{?tag := ?DELETE_IT, ?shard := Shard, ?enc := StorageIter0}, Selector, N
) ->
ShardId = {DB, Shard},
case
emqx_ds_storage_layer:delete_next(
@ -362,13 +386,6 @@ delete_next(DB, Iter = #{?tag := ?DELETE_IT, ?shard := Shard, ?enc := StorageIte
Error
end.
%%================================================================================
%% Internal exports
%%================================================================================
current_timestamp(ShardId) ->
emqx_ds_builtin_local_meta:current_timestamp(ShardId).
%%================================================================================
%% Internal functions
%%================================================================================
@ -380,3 +397,20 @@ timeus_to_timestamp(undefined) ->
undefined;
timeus_to_timestamp(TimestampUs) ->
TimestampUs div 1000.
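%% Run the storage layer call in a short-lived, monitored worker process
%% (presumably to keep transient garbage produced by the traversal off the
%% caller's heap); a crash of the worker is reported as an unrecoverable error.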
with_worker(F, A) ->
Parent = self(),
Ref = make_ref(),
{_Pid, MRef} = spawn_opt(
fun() ->
Parent ! {Ref, apply(?MODULE, F, A)}
end,
[monitor, {min_heap_size, 10000}]
),
receive
{Ref, Result} ->
erlang:demonitor(MRef, [flush]),
Result;
{'DOWN', MRef, _, _, Info} ->
{error, unrecoverable, Info}
end.

View File

@ -1,3 +1,36 @@
# `emqx_ds_builtin_raft`
Replication layer for the builtin EMQX durable storage backend that uses Raft algorithm.
The Raft backend introduces the concept of a **site** to alleviate the problem of changing node names.
Site IDs are persistent and randomly generated at the first startup of the node.
Each node in the cluster has a unique site ID that is independent of the Erlang node name (`emqx@...`).
## Configurations
OTP application environment variables:
- `emqx_durable_storage.reads`: `leader_preferred` | `local_preferred`.
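A minimal sketch of setting this option through the standard OTP application environment (e.g. from the remote shell; the value shown is one of the documented choices):

```erlang
%% Prefer reading from the shard leader:
application:set_env(emqx_durable_storage, reads, leader_preferred).
```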
## CLI
Runtime settings for the durable storages can be modified via CLI as well as the REST API.
The following CLI commands are available:
- `emqx ctl ds info` — get a quick overview of the durable storage state
- `emqx ctl ds set_replicas <DS> <Site1> <Site2> ...` — update the list of replicas for a durable storage.
- `emqx ctl ds join <DS> <Site>` — add a replica of the durable storage on the site
- `emqx ctl ds leave <DS> <Site>` — remove a replica of a durable storage from the site
## HTTP APIs
The following REST APIs are available for managing the builtin durable storages:
- `/ds/sites` — list known sites.
- `/ds/sites/:site` — get information about the site (its status, current EMQX node name managing the site, etc.)
- `/ds/storages` — list durable storages
- `/ds/storages/:ds` — get information about the durable storage and its shards
- `/ds/storages/:ds/replicas` — list or update sites that contain replicas of a durable storage
- `/ds/storages/:ds/replicas/:site` — add or remove replica of the durable storage on the site

View File

@ -29,15 +29,12 @@
emqx_ds_test_helpers:on(NODES, fun() -> BODY end)
).
opts() ->
opts(#{}).
opts(Overrides) ->
opts(Config, Overrides) ->
Layout = ?config(layout, Config),
maps:merge(
#{
backend => builtin_raft,
%% storage => {emqx_ds_storage_reference, #{}},
storage => {emqx_ds_storage_bitfield_lts, #{epoch_bits => 10}},
storage => Layout,
n_shards => 16,
n_sites => 1,
replication_factor => 3,
@ -58,7 +55,7 @@ appspec(emqx_durable_storage) ->
t_metadata(init, Config) ->
Apps = emqx_cth_suite:start([emqx_ds_builtin_raft], #{
work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)
work_dir => ?config(work_dir, Config)
}),
[{apps, Apps} | Config];
t_metadata('end', Config) ->
@ -108,7 +105,7 @@ t_replication_transfers_snapshots(init, Config) ->
{t_replication_transfers_snapshots2, #{apps => Apps}},
{t_replication_transfers_snapshots3, #{apps => Apps}}
],
#{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)}
#{work_dir => ?config(work_dir, Config)}
),
Nodes = emqx_cth_cluster:start(NodeSpecs),
[{nodes, Nodes}, {specs, NodeSpecs} | Config];
@ -125,9 +122,10 @@ t_replication_transfers_snapshots(Config) ->
Nodes = [Node, NodeOffline | _] = ?config(nodes, Config),
_Specs = [_, SpecOffline | _] = ?config(specs, Config),
?check_trace(
#{timetrap => 30_000},
begin
%% Initialize DB on all nodes and wait for it to be online.
Opts = opts(#{n_shards => 1, n_sites => 3}),
Opts = opts(Config, #{n_shards => 1, n_sites => 3}),
?assertEqual(
[{ok, ok} || _ <- Nodes],
erpc:multicall(Nodes, emqx_ds, open_db, [?DB, Opts])
@ -139,8 +137,11 @@ t_replication_transfers_snapshots(Config) ->
),
%% Stop the DB on the "offline" node.
ok = emqx_cth_cluster:stop_node(NodeOffline),
_ = ?block_until(#{?snk_kind := ds_ra_state_enter, state := leader}, 500, 0),
?wait_async_action(
ok = emqx_cth_cluster:stop_node(NodeOffline),
#{?snk_kind := ds_ra_state_enter, state := leader},
5_000
),
%% Fill the storage with messages and a few additional generations.
emqx_ds_test_helpers:apply_stream(?DB, Nodes -- [NodeOffline], Stream),
@ -153,9 +154,10 @@ t_replication_transfers_snapshots(Config) ->
?snk_meta := #{node := NodeOffline}
})
),
?assertEqual(
ok,
erpc:call(NodeOffline, emqx_ds, open_db, [?DB, opts()])
ok = ?ON(
NodeOffline,
emqx_ds:open_db(?DB, opts(Config, #{}))
),
%% Trigger a storage operation and wait for the replica to be restored.
@ -183,7 +185,7 @@ t_rebalance(init, Config) ->
{t_rebalance3, #{apps => Apps}},
{t_rebalance4, #{apps => Apps}}
],
#{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)}
#{work_dir => ?config(work_dir, Config)}
),
[{nodes, Nodes} | Config];
t_rebalance('end', Config) ->
@ -206,7 +208,7 @@ t_rebalance(Config) ->
begin
Sites = [S1, S2 | _] = [ds_repl_meta(N, this_site) || N <- Nodes],
%% 1. Initialize DB on the first node.
Opts = opts(#{n_shards => 16, n_sites => 1, replication_factor => 3}),
Opts = opts(Config, #{n_shards => 16, n_sites => 1, replication_factor => 3}),
[
?assertEqual(ok, ?ON(Node, emqx_ds:open_db(?DB, Opts)))
|| Node <- Nodes
@ -218,7 +220,7 @@ t_rebalance(Config) ->
{ok, [_]},
?ON(N1, emqx_ds_replication_layer_meta:assign_db_sites(?DB, [S1]))
),
?retry(1000, 10, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB))),
?retry(1000, 20, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB))),
?retry(500, 10, ?assertMatch(Shards when length(Shards) == 16, shards_online(N1, ?DB))),
ct:pal("Sites: ~p~n", [Sites]),
@ -293,7 +295,7 @@ t_rebalance(Config) ->
ct:pal("Transitions (~p -> ~p): ~p~n", [
Sites, tl(Sites), emqx_ds_test_helpers:transitions(N1, ?DB)
]),
?retry(1000, 10, ?assertEqual([], emqx_ds_test_helpers:transitions(N2, ?DB))),
?retry(1000, 20, ?assertEqual([], emqx_ds_test_helpers:transitions(N2, ?DB))),
%% Verify that in the end each node is responsible for each shard.
?defer_assert(
@ -316,7 +318,7 @@ t_join_leave_errors(init, Config) ->
{t_join_leave_errors1, #{apps => Apps}},
{t_join_leave_errors2, #{apps => Apps}}
],
#{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)}
#{work_dir => ?config(work_dir, Config)}
),
[{nodes, Nodes} | Config];
t_join_leave_errors('end', Config) ->
@ -327,7 +329,7 @@ t_join_leave_errors(Config) ->
%% join/leave operations are reported correctly.
[N1, N2] = ?config(nodes, Config),
Opts = opts(#{n_shards => 16, n_sites => 1, replication_factor => 3}),
Opts = opts(Config, #{n_shards => 16, n_sites => 1, replication_factor => 3}),
?assertEqual(ok, erpc:call(N1, emqx_ds, open_db, [?FUNCTION_NAME, Opts])),
?assertEqual(ok, erpc:call(N2, emqx_ds, open_db, [?FUNCTION_NAME, Opts])),
@ -370,7 +372,7 @@ t_join_leave_errors(Config) ->
?assertMatch({ok, _}, ds_repl_meta(N2, leave_db_site, [?FUNCTION_NAME, S1])),
?assertMatch([_ | _], emqx_ds_test_helpers:transitions(N1, ?FUNCTION_NAME)),
?retry(
1000, 10, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?FUNCTION_NAME))
1000, 20, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?FUNCTION_NAME))
),
%% Should be no-op.
@ -385,7 +387,7 @@ t_rebalance_chaotic_converges(init, Config) ->
{t_rebalance_chaotic_converges2, #{apps => Apps}},
{t_rebalance_chaotic_converges3, #{apps => Apps}}
],
#{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)}
#{work_dir => ?config(work_dir, Config)}
),
[{nodes, Nodes} | Config];
t_rebalance_chaotic_converges('end', Config) ->
@ -411,7 +413,7 @@ t_rebalance_chaotic_converges(Config) ->
ct:pal("Sites: ~p~n", [Sites]),
%% Initialize DB on first two nodes.
Opts = opts(#{n_shards => 16, n_sites => 2, replication_factor => 3}),
Opts = opts(Config, #{n_shards => 16, n_sites => 2, replication_factor => 3}),
%% Open DB:
?assertEqual(
@ -456,7 +458,7 @@ t_rebalance_chaotic_converges(Config) ->
emqx_ds_test_helpers:apply_stream(?DB, Nodes, Stream),
%% Wait for the last transition to complete.
?retry(500, 20, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB))),
?retry(1000, 30, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB))),
?defer_assert(
?assertEqual(
@ -482,7 +484,7 @@ t_rebalance_offline_restarts(init, Config) ->
{t_rebalance_offline_restarts2, #{apps => Apps}},
{t_rebalance_offline_restarts3, #{apps => Apps}}
],
#{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)}
#{work_dir => ?config(work_dir, Config)}
),
Nodes = emqx_cth_cluster:start(Specs),
[{nodes, Nodes}, {nodespecs, Specs} | Config];
@ -498,7 +500,7 @@ t_rebalance_offline_restarts(Config) ->
_Specs = [NS1, NS2, _] = ?config(nodespecs, Config),
%% Initialize DB on all 3 nodes.
Opts = opts(#{n_shards => 8, n_sites => 3, replication_factor => 3}),
Opts = opts(Config, #{n_shards => 8, n_sites => 3, replication_factor => 3}),
?assertEqual(
[{ok, ok} || _ <- Nodes],
erpc:multicall(Nodes, emqx_ds, open_db, [?DB, Opts])
@ -544,7 +546,7 @@ t_drop_generation(Config) ->
{t_drop_generation3, #{apps => Apps}}
],
#{
work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)
work_dir => ?config(work_dir, Config)
}
),
@ -552,7 +554,7 @@ t_drop_generation(Config) ->
?check_trace(
try
%% Initialize DB on all 3 nodes.
Opts = opts(#{n_shards => 1, n_sites => 3, replication_factor => 3}),
Opts = opts(Config, #{n_shards => 1, n_sites => 3, replication_factor => 3}),
?assertEqual(
[{ok, ok} || _ <- Nodes],
erpc:multicall(Nodes, emqx_ds, open_db, [?DB, Opts])
@ -614,21 +616,21 @@ t_drop_generation(Config) ->
t_error_mapping_replication_layer(init, Config) ->
Apps = emqx_cth_suite:start([emqx_ds_builtin_raft], #{
work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)
work_dir => ?config(work_dir, Config)
}),
[{apps, Apps} | Config];
t_error_mapping_replication_layer('end', Config) ->
emqx_cth_suite:stop(?config(apps, Config)),
Config.
t_error_mapping_replication_layer(_Config) ->
t_error_mapping_replication_layer(Config) ->
%% This checks that the replication layer maps recoverable errors correctly.
ok = emqx_ds_test_helpers:mock_rpc(),
ok = snabbkaffe:start_trace(),
DB = ?FUNCTION_NAME,
?assertMatch(ok, emqx_ds:open_db(DB, (opts())#{n_shards => 2})),
?assertMatch(ok, emqx_ds:open_db(DB, opts(Config, #{n_shards => 2}))),
[Shard1, Shard2] = emqx_ds_replication_layer_meta:shards(DB),
TopicFilter = emqx_topic:words(<<"foo/#">>),
@ -695,7 +697,7 @@ t_error_mapping_replication_layer(_Config) ->
Results2 = lists:map(
fun(Iter) ->
case emqx_ds:next(DB, Iter, _BatchSize = 42) of
Ok = {ok, _Iter, [_ | _]} ->
Ok = {ok, _Iter, _} ->
Ok;
Error = {error, recoverable, {badrpc, _}} ->
Error;
@ -716,20 +718,20 @@ t_error_mapping_replication_layer(_Config) ->
%% problems.
t_store_batch_fail(init, Config) ->
Apps = emqx_cth_suite:start([emqx_ds_builtin_raft], #{
work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)
work_dir => ?config(work_dir, Config)
}),
[{apps, Apps} | Config];
t_store_batch_fail('end', Config) ->
emqx_cth_suite:stop(?config(apps, Config)),
Config.
t_store_batch_fail(_Config) ->
t_store_batch_fail(Config) ->
DB = ?FUNCTION_NAME,
?check_trace(
#{timetrap => 15_000},
try
meck:new(emqx_ds_storage_layer, [passthrough, no_history]),
DB = ?FUNCTION_NAME,
?assertMatch(ok, emqx_ds:open_db(DB, (opts())#{n_shards => 2})),
ok = meck:new(emqx_ds_storage_layer, [passthrough, no_history]),
?assertMatch(ok, emqx_ds:open_db(DB, opts(Config, #{n_shards => 2}))),
%% Success:
Batch1 = [
message(<<"C1">>, <<"foo/bar">>, <<"1">>, 1),
@ -737,7 +739,7 @@ t_store_batch_fail(_Config) ->
],
?assertMatch(ok, emqx_ds:store_batch(DB, Batch1, #{sync => true})),
%% Inject unrecoverable error:
meck:expect(emqx_ds_storage_layer, store_batch, fun(_DB, _Shard, _Messages) ->
ok = meck:expect(emqx_ds_storage_layer, store_batch, fun(_DB, _Shard, _Messages) ->
{error, unrecoverable, mock}
end),
Batch2 = [
@ -747,10 +749,10 @@ t_store_batch_fail(_Config) ->
?assertMatch(
{error, unrecoverable, mock}, emqx_ds:store_batch(DB, Batch2, #{sync => true})
),
meck:unload(emqx_ds_storage_layer),
ok = meck:unload(emqx_ds_storage_layer),
%% Inject a recoverable error:
meck:new(ra, [passthrough, no_history]),
meck:expect(ra, process_command, fun(Servers, Shard, Command) ->
ok = meck:new(ra, [passthrough, no_history]),
ok = meck:expect(ra, process_command, fun(Servers, Shard, Command) ->
?tp(ra_command, #{servers => Servers, shard => Shard, command => Command}),
{timeout, mock}
end),
@ -766,9 +768,9 @@ t_store_batch_fail(_Config) ->
{error, recoverable, {timeout, mock}},
emqx_ds:store_batch(DB, Batch3, #{sync => true})
),
meck:unload(ra),
ok = meck:unload(ra),
?assertMatch(ok, emqx_ds:store_batch(DB, Batch3, #{sync => true})),
lists:sort(emqx_ds_test_helpers:consume_per_stream(DB, ['#'], 1))
lists:sort(emqx_ds_test_helpers:consume_per_stream(DB, ['#'], 0))
after
meck:unload()
end,
@ -803,7 +805,7 @@ t_crash_restart_recover(init, Config) ->
{t_crash_stop_recover2, #{apps => Apps}},
{t_crash_stop_recover3, #{apps => Apps}}
],
#{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)}
#{work_dir => ?config(work_dir, Config)}
),
Nodes = emqx_cth_cluster:start(Specs),
[{nodes, Nodes}, {nodespecs, Specs} | Config];
@ -815,7 +817,7 @@ t_crash_restart_recover(Config) ->
%% correctly preserved.
Nodes = [N1, N2, N3] = ?config(nodes, Config),
_Specs = [_, NS2, NS3] = ?config(nodespecs, Config),
DBOpts = opts(#{n_shards => 16, n_sites => 3, replication_factor => 3}),
DBOpts = opts(Config, #{n_shards => 16, n_sites => 3, replication_factor => 3}),
%% Prepare test event stream.
NMsgs = 400,
@ -856,7 +858,10 @@ t_crash_restart_recover(Config) ->
MatchFlushFailed = ?match_event(#{?snk_kind := emqx_ds_buffer_flush_failed}),
{ok, SubRef} = snabbkaffe:subscribe(MatchFlushFailed, NMsgs, _Timeout = 5000, infinity),
{timeout, Events} = snabbkaffe:receive_events(SubRef),
LostMessages = [M || #{batch := Messages} <- Events, M <- Messages],
LostMessages = [
emqx_ds_test_helpers:message_canonical_form(M)
|| #{batch := Messages} <- Events, M <- Messages
],
ct:pal("Some messages were lost: ~p", [LostMessages]),
?assert(length(LostMessages) < NMsgs div 20),
@ -876,8 +881,16 @@ t_crash_restart_recover(Config) ->
%% Were any messages lost unexpectedly?
{_, DSMessages} = lists:unzip(emqx_utils_stream:consume(DSStream1)),
ExpectedMessages = emqx_utils_stream:consume(ExpectedStream),
MissingMessages = ExpectedMessages -- DSMessages,
?defer_assert(?assertEqual([], MissingMessages -- LostMessages, DSMessages))
MissingMessages = emqx_ds_test_helpers:message_set_subtract(
ExpectedMessages, DSMessages
),
?defer_assert(
?assertEqual(
[],
emqx_ds_test_helpers:sublist(MissingMessages -- LostMessages),
emqx_ds_test_helpers:sublist(DSMessages)
)
)
end,
lists:foreach(VerifyClient, TopicStreams)
end,
@ -984,12 +997,36 @@ sample(N, List) ->
suite() -> [{timetrap, {seconds, 60}}].
all() -> emqx_common_test_helpers:all(?MODULE).
all() ->
[{group, Grp} || {Grp, _} <- groups()].
groups() ->
TCs = emqx_common_test_helpers:all(?MODULE),
[
{bitfield_lts, TCs},
{skipstream_lts, TCs}
].
init_per_group(Group, Config) ->
LayoutConf =
case Group of
skipstream_lts ->
{emqx_ds_storage_skipstream_lts, #{with_guid => true}};
bitfield_lts ->
{emqx_ds_storage_bitfield_lts, #{}}
end,
[{layout, LayoutConf} | Config].
end_per_group(_Group, Config) ->
Config.
init_per_testcase(TCName, Config0) ->
Config = emqx_common_test_helpers:init_per_testcase(?MODULE, TCName, Config0),
Config.
Config1 = [{work_dir, emqx_cth_suite:work_dir(TCName, Config0)} | Config0],
emqx_common_test_helpers:init_per_testcase(?MODULE, TCName, Config1).
end_per_testcase(TCName, Config) ->
ok = snabbkaffe:stop(),
emqx_common_test_helpers:end_per_testcase(?MODULE, TCName, Config).
Result = emqx_common_test_helpers:end_per_testcase(?MODULE, TCName, Config),
catch emqx_ds:drop_db(TCName),
emqx_cth_suite:clean_work_dir(?config(work_dir, Config)),
Result.

View File

@ -13,11 +13,10 @@ This makes the storage disk requirements very predictable: only the number of _p
DS _backend_ is a callback module that implements `emqx_ds` behavior.
EMQX repository contains the "builtin" backend, implemented in `emqx_ds_replication_layer` module, that uses Raft algorithm for data replication, and RocksDB as the main storage.
The EMQX repository contains two builtin backends based on RocksDB:
Note that builtin backend introduces the concept of **site** to alleviate the problem of changing node names.
Site IDs are persistent, and they are randomly generated at the first startup of the node.
Each node in the cluster has a unique site ID, that is independent from the Erlang node name (`emqx@...`).
- `emqx_ds_builtin_local`
- `emqx_ds_builtin_raft`
### Layout
@ -113,8 +112,8 @@ In the future it can serve as a storage for retained messages or as a generic me
# Configurations
Global options for `emqx_durable_storage` application are configured via OTP application environment.
Database-specific settings are stored in the schema table.
Common global options for builtin backends are configured via OTP application environment.
Database-specific settings are stored in EMQX config.
The following application environment variables are available:
@ -124,26 +123,9 @@ The following application environment variables are available:
- `emqx_durable_storage.egress_flush_interval`: period at which the batches of messages are committed to the durable storage.
- `emqx_durable_storage.reads`: `leader_preferred` | `local_preferred`.
Runtime settings for the durable storages can be modified via CLI as well as the REST API.
The following CLI commands are available:
- `emqx ctl ds info` — get a quick overview of the durable storage state
- `emqx ctl ds set_replicas <DS> <Site1> <Site2> ...` — update the list of replicas for a durable storage.
- `emqx ctl ds join <DS> <Site>` — add a replica of durable storage on the site
- `emqx ctl ds leave <DS> <Site>` — remove a replica of a durable storage from the site
# HTTP APIs
The following REST APIs are available for managing the builtin durable storages:
- `/ds/sites` — list known sites.
- `/ds/sites/:site` — get information about the site (its status, current EMQX node name managing the site, etc.)
- `/ds/storages` — list durable storages
- `/ds/storages/:ds` — get information about the durable storage and its shards
- `/ds/storages/:ds/replicas` — list or update sites that contain replicas of a durable storage
- `/ds/storages/:ds/replicas/:site` — add or remove replica of the durable storage on the site
None
# Other

View File

@ -0,0 +1,90 @@
-- This schema specifies binary encoding of EMQX's internal
-- representation of a message.
--
-- Note: the MQTT standard specifies that certain properties, like topic,
-- should be UTF8 strings. Here we represent them as OCTET STRING to
-- avoid extra checks.
DurableMessage DEFINITIONS AUTOMATIC TAGS ::=
BEGIN
-- Non-standard flag:
MiscFlag ::= SEQUENCE {
key UTF8String,
value BOOLEAN
}
-- Non-standard header or property.
-- Both key and value are interpreted as erlang terms:
MiscProperty ::= SEQUENCE {
key OCTET STRING,
value OCTET STRING
}
ClientAttr ::= SEQUENCE {
key OCTET STRING,
value OCTET STRING
}
-- Wrapper for any data that doesn't comply with the strict schema:
Misc ::= CHOICE {
flag MiscFlag,
header MiscProperty,
property MiscProperty,
-- Currently these are unused:
clientAttr ClientAttr,
extra MiscProperty
}
-- Both key and value are interpreted as binaries:
UserProperty ::= SEQUENCE {
key OCTET STRING,
value OCTET STRING
}
-- Common properties that are present in almost any message:
StdProperties ::= SEQUENCE {
payloadFormatIndicator INTEGER (0..255) OPTIONAL,
messageExpiryInterval INTEGER (0..4294967295) OPTIONAL,
responseTopic OCTET STRING OPTIONAL,
correlationData OCTET STRING OPTIONAL,
contentType OCTET STRING OPTIONAL,
userProperty SEQUENCE OF UserProperty
}
ProtoVer ::= CHOICE {
mqtt INTEGER(0..255),
mqtt-sn INTEGER(0..255),
coap INTEGER(0..255)
}
-- Common headers that are present in almost any message:
StdHeaders ::= SEQUENCE {
protoVer ProtoVer OPTIONAL,
peerhost OCTET STRING (SIZE(4..16)) OPTIONAL, -- IPv4 (4 octets) .. IPv6 (16 octets)
peername OCTET STRING (SIZE(6..18)) OPTIONAL, -- IPv4 (4 octets) .. IPv6 (16 octets) + 2 octets for (TCP/UDP) port
username OCTET STRING OPTIONAL
}
From ::= CHOICE {
atom UTF8String,
binary OCTET STRING
}
DurableMessage ::= SEQUENCE {
id OCTET STRING,
from From,
topic OCTET STRING,
payload OCTET STRING,
timestamp INTEGER,
qos INTEGER (0..2),
-- MQTT PUBLISH flags:
sys BOOLEAN,
dup BOOLEAN,
retain BOOLEAN,
-- Headers:
headers StdHeaders,
properties StdProperties,
-- Miscellaneous, highly EMQX-specific internal data:
misc SEQUENCE OF Misc OPTIONAL
}
END

View File

@ -0,0 +1,223 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
%% @doc This script can be loaded into a running EMQX EE node. It will
%% create a number of DS databases with different options and fill
%% them with data of a given size.
%%
%% Then it will measure the size of the database directories and create
%% a "storage (in)efficiency" report.
-module(storage_efficiency).
-include_lib("emqx_utils/include/emqx_message.hrl").
%% API:
-export([run/0, run/1]).
%%================================================================================
%% API functions
%%================================================================================
run() ->
run(#{}).
run(Custom) ->
RunConf = maps:merge(
#{
%% Sleep between batches:
sleep => 1_000,
%% Don't run test, only plot data:
dry_run => false,
%% Payload size multiplier:
size => 10,
%% Number of batches:
batches => 100,
%% Add generation every N batches:
add_generation => 10
},
Custom
),
lists:foreach(
fun(DBConf) ->
run(DBConf, RunConf)
end,
configs()
).
%% erlfmt-ignore
gnuplot_script(Filename) ->
"set terminal qt\n"
%% "set logscale y 10\n"
"set title \"" ++ filename:basename(Filename, ".dat") ++ "\"\n"
"set key autotitle columnheader\n"
"plot for [n=2:*] \"" ++ Filename ++ "\" using 1:n with linespoints".
%%================================================================================
%% Internal functions
%%================================================================================
configs() ->
[
{'benchmark-skipstream-asn1',
db_conf({emqx_ds_storage_skipstream_lts, #{serialization_schema => asn1}})},
{'benchmark-skipstream-v1',
db_conf({emqx_ds_storage_skipstream_lts, #{serialization_schema => v1}})},
{'benchmark-bitfield', db_conf({emqx_ds_storage_bitfield_lts, #{}})}
].
db_conf(Storage) ->
#{
backend => builtin_local,
%% n_sites => 1,
n_shards => 1,
%% replication_factor => 1,
%% replication_options => #{},
storage => Storage
}.
-record(s, {
data_size = 0,
payload_size = 0,
n_messages = 0,
datapoints = #{},
x_axis = []
}).
run({DB, Config}, RunConf) ->
#{
batches := NBatches,
size := PSMultiplier,
add_generation := AddGeneration,
sleep := Sleep,
dry_run := DryRun
} = RunConf,
{ok, _} = application:ensure_all_started(emqx_ds_backends),
Dir = dir(DB),
Filename = atom_to_list(DB) ++ ".dat",
DryRun orelse
begin
io:format(user, "Running benchmark for ~p in ~p~n", [DB, Dir]),
%% Ensure safe directory:
{match, _} = re:run(Dir, filename:join("data", DB)),
%% Ensure clean state:
ok = emqx_ds:open_db(DB, Config),
ok = emqx_ds:drop_db(DB),
ok = file:del_dir_r(Dir),
%% Open a fresh DB:
ok = emqx_ds:open_db(DB, Config),
S = lists:foldl(
fun(Batch, Acc0) ->
Size = PSMultiplier * Batch,
io:format(user, "Storing batch with payload size ~p~n", [Size]),
Acc1 = store_batch(DB, Size, Acc0),
%% Sleep so all data is hopefully flushed:
timer:sleep(Sleep),
(Batch rem AddGeneration) =:= 0 andalso
emqx_ds:add_generation(DB),
collect_datapoint(DB, Acc1)
end,
collect_datapoint(DB, #s{}),
lists:seq(1, NBatches)
),
{ok, FD} = file:open(Filename, [write]),
io:put_chars(FD, print(S)),
file:close(FD)
end,
os:cmd("echo '" ++ gnuplot_script(Filename) ++ "' | gnuplot --persist -"),
ok.
collect_datapoint(
DB, S0 = #s{n_messages = N, data_size = DS, payload_size = PS, datapoints = DP0, x_axis = X}
) ->
NewData = [{"$_n", N}, {"$data", DS}, {"$payloads", PS} | dirsize(DB)],
DP = lists:foldl(
fun({Key, Val}, Acc) ->
maps:update_with(
Key,
fun(M) -> M#{N => Val} end,
#{N => Val},
Acc
)
end,
DP0,
NewData
),
S0#s{
datapoints = DP,
x_axis = [N | X]
}.
print(#s{x_axis = XX, datapoints = DP}) ->
Cols = lists:sort(maps:keys(DP)),
Lines = [
%% Print header:
Cols
%% Scan through rows:
| [
%% Scan through columns:
[integer_to_binary(maps:get(X, maps:get(Col, DP), 0)) || Col <- Cols]
|| X <- lists:reverse(XX)
]
],
lists:join(
"\n",
[lists:join(" ", Line) || Line <- Lines]
).
dirsize(DB) ->
RawOutput = os:cmd("cd " ++ dir(DB) ++ "; du -b --max-depth 1 ."),
[
begin
[Sz, Dir] = string:lexemes(L, "\t"),
{Dir, list_to_integer(Sz)}
end
|| L <- string:lexemes(RawOutput, "\n")
].
dir(DB) ->
filename:join(emqx_ds_storage_layer:base_dir(), DB).
store_batch(DB, PayloadSize, S0 = #s{n_messages = N, data_size = DS, payload_size = PS}) ->
From = rand:bytes(16),
BatchSize = 50,
Batch = [
#message{
id = emqx_guid:gen(),
timestamp = emqx_message:timestamp_now(),
payload = rand:bytes(PayloadSize),
from = From,
topic = emqx_topic:join([
<<"blah">>,
<<"blah">>,
'',
<<"blah">>,
From,
<<"bazzzzzzzzzzzzzzzzzzzzzzz">>,
integer_to_binary(I)
])
}
|| I <- lists:seq(1, BatchSize)
],
ok = emqx_ds:store_batch(DB, Batch, #{sync => true}),
S0#s{
n_messages = N + length(Batch),
data_size = DS + lists:sum(lists:map(fun msg_size/1, Batch)),
payload_size = PS + length(Batch) * PayloadSize
}.
%% We consider MQTT wire encoding to be "close to the ideal".
msg_size(Msg = #message{}) ->
iolist_size(emqx_frame:serialize(emqx_message:to_packet(undefined, Msg))).

View File

@ -0,0 +1,4 @@
*.hrl
*.erl
*.beam
*.asn1db

View File

@ -35,15 +35,24 @@
-define(DS_STORE_BATCH_TIME, emqx_ds_store_batch_time).
-define(DS_BUILTIN_NEXT_TIME, emqx_ds_builtin_next_time).
%%% LTS Storage counters:
%%% Bitfield LTS Storage counters:
%% This counter is incremented when the iterator seeks to the next interval:
-define(DS_LTS_SEEK_COUNTER, emqx_ds_storage_bitfield_lts_counter_seek).
-define(DS_BITFIELD_LTS_SEEK_COUNTER, emqx_ds_storage_bitfield_lts_counter_seek).
%% This counter is incremented when the iterator proceeds to the next
%% key within the interval (this is the best-case scenario):
-define(DS_LTS_NEXT_COUNTER, emqx_ds_storage_bitfield_lts_counter_next).
-define(DS_BITFIELD_LTS_NEXT_COUNTER, emqx_ds_storage_bitfield_lts_counter_next).
%% This counter is incremented when the key passes bitmask check, but
%% the value is rejected by the subsequent post-processing:
-define(DS_LTS_COLLISION_COUNTER, emqx_ds_storage_bitfield_lts_counter_collision).
-define(DS_BITFIELD_LTS_COLLISION_COUNTER, emqx_ds_storage_bitfield_lts_counter_collision).
%%% Skipstream LTS Storage counters:
-define(DS_SKIPSTREAM_LTS_SEEK, emqx_ds_storage_skipstream_lts_seek).
-define(DS_SKIPSTREAM_LTS_NEXT, emqx_ds_storage_skipstream_lts_next).
-define(DS_SKIPSTREAM_LTS_HASH_COLLISION, emqx_ds_storage_skipstream_lts_hash_collision).
-define(DS_SKIPSTREAM_LTS_HIT, emqx_ds_storage_skipstream_lts_hit).
-define(DS_SKIPSTREAM_LTS_MISS, emqx_ds_storage_skipstream_lts_miss).
-define(DS_SKIPSTREAM_LTS_FUTURE, emqx_ds_storage_skipstream_lts_future).
-define(DS_SKIPSTREAM_LTS_EOS, emqx_ds_storage_skipstream_lts_end_of_stream).
-endif.

View File

@ -1,3 +1,8 @@
%% -*- mode:erlang -*-
{deps, [{emqx_utils, {path, "../emqx_utils"}}]}.
{erl_opts, [{src_dirs, ["src", "gen_src"]}]}.
{pre_hooks, [
{"(linux|darwin|solaris)", compile, "erlc -bper +noobj -o gen_src asn.1/DurableMessage.asn"}
]}.

View File

@ -36,7 +36,9 @@
inc_lts_seek_counter/2,
inc_lts_next_counter/2,
inc_lts_collision_counter/2
inc_lts_collision_counter/2,
collect_shard_counter/3
]).
%% behavior callbacks:
@ -57,9 +59,16 @@
-define(STORAGE_LAYER_METRICS, [
{slide, ?DS_STORE_BATCH_TIME},
{counter, ?DS_LTS_SEEK_COUNTER},
{counter, ?DS_LTS_NEXT_COUNTER},
{counter, ?DS_LTS_COLLISION_COUNTER}
{counter, ?DS_BITFIELD_LTS_SEEK_COUNTER},
{counter, ?DS_BITFIELD_LTS_NEXT_COUNTER},
{counter, ?DS_BITFIELD_LTS_COLLISION_COUNTER},
{counter, ?DS_SKIPSTREAM_LTS_SEEK},
{counter, ?DS_SKIPSTREAM_LTS_NEXT},
{counter, ?DS_SKIPSTREAM_LTS_HASH_COLLISION},
{counter, ?DS_SKIPSTREAM_LTS_HIT},
{counter, ?DS_SKIPSTREAM_LTS_MISS},
{counter, ?DS_SKIPSTREAM_LTS_FUTURE},
{counter, ?DS_SKIPSTREAM_LTS_EOS}
]).
-define(FETCH_METRICS, [
@ -150,15 +159,19 @@ observe_next_time(DB, NextTime) ->
-spec inc_lts_seek_counter(emqx_ds_storage_layer:shard_id(), non_neg_integer()) -> ok.
inc_lts_seek_counter({DB, _}, Inc) ->
catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_LTS_SEEK_COUNTER, Inc).
catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_BITFIELD_LTS_SEEK_COUNTER, Inc).
-spec inc_lts_next_counter(emqx_ds_storage_layer:shard_id(), non_neg_integer()) -> ok.
inc_lts_next_counter({DB, _}, Inc) ->
catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_LTS_NEXT_COUNTER, Inc).
catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_BITFIELD_LTS_NEXT_COUNTER, Inc).
-spec inc_lts_collision_counter(emqx_ds_storage_layer:shard_id(), non_neg_integer()) -> ok.
inc_lts_collision_counter({DB, _}, Inc) ->
catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_LTS_COLLISION_COUNTER, Inc).
catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_BITFIELD_LTS_COLLISION_COUNTER, Inc).
-spec collect_shard_counter(emqx_ds_storage_layer:shard_id(), atom(), non_neg_integer()) -> ok.
collect_shard_counter({DB, _}, Key, Inc) ->
catch emqx_metrics_worker:inc(?WORKER, DB, Key, Inc).
prometheus_meta() ->
lists:map(

View File

@ -26,7 +26,13 @@
trie_copy_learned_paths/2,
topic_key/3,
match_topics/2,
lookup_topic_key/2
lookup_topic_key/2,
reverse_lookup/2,
info/2,
info/1,
compress_topic/3,
decompress_topic/2
]).
%% Debug:
@ -34,18 +40,21 @@
-export_type([
options/0,
level/0,
static_key/0,
trie/0,
msg_storage_key/0
msg_storage_key/0,
learned_structure/0
]).
-include_lib("stdlib/include/ms_transform.hrl").
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-endif.
-elvis([{elvis_style, variable_naming_convention, disable}]).
-elvis([{elvis_style, dont_repeat_yourself, disable}]).
-endif.
%%================================================================================
%% Type declarations
@ -55,15 +64,22 @@
-define(EOT, []).
-define(PLUS, '+').
-type edge() :: binary() | ?EOT | ?PLUS.
-type level() :: binary() | ''.
%% Fixed size binary
-type static_key() :: non_neg_integer().
-type edge() :: level() | ?EOT | ?PLUS.
%% Fixed size binary or integer, depending on the options:
-type static_key() :: non_neg_integer() | binary().
%% Trie root:
-define(PREFIX, prefix).
%% Special prefix root for reverse lookups:
-define(rlookup, rlookup).
-define(rlookup(STATIC), {?rlookup, STATIC}).
-type state() :: static_key() | ?PREFIX.
-type varying() :: [binary() | ?PLUS].
-type varying() :: [level() | ?PLUS].
-type msg_storage_key() :: {static_key(), varying()}.
@ -71,27 +87,42 @@
-type persist_callback() :: fun((_Key, _Val) -> ok).
-type learned_structure() :: [level() | ?PLUS, ...].
-type options() ::
#{
persist_callback => persist_callback(),
static_key_size => pos_integer()
%% If set, the static key is an integer that fits in a given number of bits:
static_key_bits => pos_integer(),
%% If set, the static key is a _binary_ of a given length:
static_key_bytes => pos_integer(),
reverse_lookups => boolean()
}.
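%% Example (illustrative): create a trie with 16-byte binary static keys
%% and reverse lookups enabled:
%%   emqx_ds_lts:trie_create(#{static_key_bytes => 16, reverse_lookups => true}).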
-type dump() :: [{_Key, _Val}].
-record(trie, {
persist :: persist_callback(),
is_binary_key :: boolean(),
static_key_size :: pos_integer(),
trie :: ets:tid(),
stats :: ets:tid()
stats :: ets:tid(),
rlookups = false :: boolean()
}).
-opaque trie() :: #trie{}.
-record(trans, {
key :: {state(), edge()},
next :: state()
}).
-record(trans, {key, next}).
-type trans() ::
#trans{
key :: {state(), edge()},
next :: state()
}
| #trans{
key :: {?rlookup, static_key()},
next :: [level() | ?PLUS]
}.
%%================================================================================
%% API functions
@ -100,21 +131,31 @@
%% @doc Create an empty trie
-spec trie_create(options()) -> trie().
trie_create(UserOpts) ->
Defaults = #{
persist_callback => fun(_, _) -> ok end,
static_key_size => 8
},
#{
persist_callback := Persist,
static_key_size := StaticKeySize
} = maps:merge(Defaults, UserOpts),
Persist = maps:get(
persist_callback,
UserOpts,
fun(_, _) -> ok end
),
Rlookups = maps:get(reverse_lookups, UserOpts, false),
IsBinaryKey =
case UserOpts of
#{static_key_bits := StaticKeySize} ->
false;
#{static_key_bytes := StaticKeySize} ->
true;
_ ->
StaticKeySize = 16,
true
end,
Trie = ets:new(trie, [{keypos, #trans.key}, set, public]),
Stats = ets:new(stats, [{keypos, 1}, set, public]),
#trie{
persist = Persist,
is_binary_key = IsBinaryKey,
static_key_size = StaticKeySize,
trie = Trie,
stats = Stats
stats = Stats,
rlookups = Rlookups
}.
-spec trie_create() -> trie().
@ -149,9 +190,21 @@ trie_dump(Trie, Filter) ->
all ->
Fun = fun(_) -> true end;
wildcard ->
Fun = fun contains_wildcard/1
Fun = fun(L) -> lists:member(?PLUS, L) end
end,
lists:append([P || P <- paths(Trie), Fun(P)]).
Paths = lists:filter(
fun(Path) ->
Fun(tokens_of_path(Path))
end,
paths(Trie)
),
RlookupIdx = lists:filter(
fun({_, Tokens}) ->
Fun(Tokens)
end,
all_emanating(Trie, ?rlookup)
),
lists:flatten([Paths, RlookupIdx]).
-spec trie_copy_learned_paths(trie(), trie()) -> trie().
trie_copy_learned_paths(OldTrie, NewTrie) ->
@ -164,17 +217,17 @@ trie_copy_learned_paths(OldTrie, NewTrie) ->
NewTrie.
%% @doc Lookup the topic key. Create a new one, if not found.
-spec topic_key(trie(), threshold_fun(), [binary() | '']) -> msg_storage_key().
-spec topic_key(trie(), threshold_fun(), [level()]) -> msg_storage_key().
topic_key(Trie, ThresholdFun, Tokens) ->
do_topic_key(Trie, ThresholdFun, 0, ?PREFIX, Tokens, []).
do_topic_key(Trie, ThresholdFun, 0, ?PREFIX, Tokens, [], []).
%% @doc Return an existing topic key, if one exists.
-spec lookup_topic_key(trie(), [binary()]) -> {ok, msg_storage_key()} | undefined.
-spec lookup_topic_key(trie(), [level()]) -> {ok, msg_storage_key()} | undefined.
lookup_topic_key(Trie, Tokens) ->
do_lookup_topic_key(Trie, ?PREFIX, Tokens, []).
%% @doc Return list of keys of topics that match a given topic filter
-spec match_topics(trie(), [binary() | '+' | '#']) ->
-spec match_topics(trie(), [level() | '+' | '#']) ->
[msg_storage_key()].
match_topics(Trie, TopicFilter) ->
do_match_topics(Trie, ?PREFIX, [], TopicFilter).
@ -206,7 +259,8 @@ dump_to_dot(#trie{trie = Trie, stats = Stats}, Filename) ->
{ok, FD} = file:open(Filename, [write]),
Print = fun
(?PREFIX) -> "prefix";
(NodeId) -> integer_to_binary(NodeId, 16)
(Bin) when is_binary(Bin) -> Bin;
(NodeId) when is_integer(NodeId) -> integer_to_binary(NodeId, 16)
end,
io:format(FD, "digraph {~n", []),
lists:foreach(
@ -225,11 +279,64 @@ dump_to_dot(#trie{trie = Trie, stats = Stats}, Filename) ->
io:format(FD, "}~n", []),
file:close(FD).
-spec reverse_lookup(trie(), static_key()) -> {ok, learned_structure()} | undefined.
reverse_lookup(#trie{rlookups = false}, _) ->
error({badarg, reverse_lookups_disabled});
reverse_lookup(#trie{trie = Trie}, StaticKey) ->
case ets:lookup(Trie, ?rlookup(StaticKey)) of
[#trans{next = Next}] ->
{ok, Next};
[] ->
undefined
end.
%% @doc Get information about the trie.
%%
%% Note: `reverse_lookups' must be enabled to get the number of
%% topics.
-spec info(trie(), size | topics) -> _.
info(#trie{rlookups = true, stats = Stats}, topics) ->
case ets:lookup(Stats, ?rlookup) of
[{_, N}] -> N;
[] -> 0
end;
info(#trie{}, topics) ->
undefined;
info(#trie{trie = T}, size) ->
ets:info(T, size).
%% @doc Return a proplist with information about the trie
-spec info(trie()) -> proplists:proplist().
info(Trie) ->
[
{size, info(Trie, size)},
{topics, info(Trie, topics)}
].
%%%%%%%% Topic compression %%%%%%%%%%
%% @doc Given topic structure for the static LTS index (as returned by
%% `reverse_lookup'), compress a topic filter to exclude static
%% levels:
-spec compress_topic(static_key(), learned_structure(), emqx_ds:topic_filter()) ->
[emqx_ds_lts:level() | '+'].
compress_topic(StaticKey, TopicStructure, TopicFilter) ->
compress_topic(StaticKey, TopicStructure, TopicFilter, []).
%% @doc Given topic structure and a compressed topic filter, return
%% the original* topic filter.
%%
%% * '#' will be replaced with '+'s
-spec decompress_topic(learned_structure(), [level() | '+']) ->
emqx_ds:topic_filter().
decompress_topic(TopicStructure, Topic) ->
decompress_topic(TopicStructure, Topic, []).
%%================================================================================
%% Internal exports
%%================================================================================
-spec trie_next(trie(), state(), binary() | ?EOT) -> {Wildcard, state()} | undefined when
-spec trie_next(trie(), state(), level() | ?EOT) -> {Wildcard, state()} | undefined when
Wildcard :: boolean().
trie_next(#trie{trie = Trie}, State, ?EOT) ->
case ets:lookup(Trie, {State, ?EOT}) of
@ -261,16 +368,19 @@ trie_insert(Trie, State, Token) ->
%% Internal functions
%%================================================================================
-spec trie_insert(trie(), state(), edge(), state()) -> {Updated, state()} when
NChildren :: non_neg_integer(),
Updated :: false | NChildren.
-spec trie_insert
(trie(), state(), edge(), state()) -> {Updated, state()} when
NChildren :: non_neg_integer(),
Updated :: false | NChildren;
(trie(), ?rlookup, static_key(), [level() | '+']) ->
{false | non_neg_integer(), state()}.
trie_insert(#trie{trie = Trie, stats = Stats, persist = Persist}, State, Token, NewState) ->
Key = {State, Token},
Rec = #trans{
key = Key,
next = NewState
},
case ets:insert_new(Trie, Rec) of
case ets_insert_new(Trie, Rec) of
true ->
ok = Persist(Key, NewState),
Inc =
@ -287,7 +397,7 @@ trie_insert(#trie{trie = Trie, stats = Stats, persist = Persist}, State, Token,
end.
-spec get_id_for_key(trie(), state(), edge()) -> static_key().
get_id_for_key(#trie{static_key_size = Size}, State, Token) when Size =< 32 ->
get_id_for_key(#trie{is_binary_key = IsBin, static_key_size = Size}, State, Token) ->
%% Requirements for the return value:
%%
%% It should be globally unique for the `{State, Token}` pair. Other
@ -303,11 +413,21 @@ get_id_for_key(#trie{static_key_size = Size}, State, Token) when Size =< 32 ->
%% If we want to impress computer science crowd, sorry, I mean to
%% minimize storage requirements, we can even employ Huffman coding
%% based on the frequency of messages.
<<Int:(Size * 8), _/bytes>> = crypto:hash(sha256, term_to_binary([State | Token])),
Int.
Hash = crypto:hash(sha256, term_to_binary([State | Token])),
case IsBin of
false ->
%% Note: for backward compatibility with the bitfield_lts
%% layout we allow the key to be an integer. But this also
%% changes the semantics of `static_key_size` from number
%% of bytes to bits:
<<Int:Size, _/bytes>> = Hash,
Int;
true ->
element(1, erlang:split_binary(Hash, Size))
end.
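%% For example (illustrative): with `static_key_bytes => 8' the static key is
%% the first 8 bytes of the SHA-256 hash of `[State | Token]', while with
%% `static_key_bits => 64' the same 64-bit prefix is returned as an unsigned integer.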
%% erlfmt-ignore
-spec do_match_topics(trie(), state(), [binary() | '+'], [binary() | '+' | '#']) ->
-spec do_match_topics(trie(), state(), [level() | '+'], [level() | '+' | '#']) ->
list().
do_match_topics(Trie, State, Varying, []) ->
case trie_next(Trie, State, ?EOT) of
@ -341,7 +461,7 @@ do_match_topics(Trie, State, Varying, [Level | Rest]) ->
Emanating
).
-spec do_lookup_topic_key(trie(), state(), [binary()], [binary()]) ->
-spec do_lookup_topic_key(trie(), state(), [level()], [level()]) ->
{ok, msg_storage_key()} | undefined.
do_lookup_topic_key(Trie, State, [], Varying) ->
case trie_next(Trie, State, ?EOT) of
@ -360,29 +480,42 @@ do_lookup_topic_key(Trie, State, [Tok | Rest], Varying) ->
undefined
end.
do_topic_key(Trie, _, _, State, [], Varying) ->
do_topic_key(Trie, _, _, State, [], Tokens, Varying) ->
%% We reached the end of topic. Assert: Trie node that corresponds
%% to EOT cannot be a wildcard.
{_, false, Static} = trie_next_(Trie, State, ?EOT),
{Updated, false, Static} = trie_next_(Trie, State, ?EOT),
_ =
case Trie#trie.rlookups andalso Updated of
false ->
ok;
_ ->
trie_insert(Trie, rlookup, Static, lists:reverse(Tokens))
end,
{Static, lists:reverse(Varying)};
do_topic_key(Trie, ThresholdFun, Depth, State, [Tok | Rest], Varying0) ->
do_topic_key(Trie, ThresholdFun, Depth, State, [Tok | Rest], Tokens, Varying0) ->
% TODO: it's not necessary to call it every time.
Threshold = ThresholdFun(Depth),
{NChildren, IsWildcard, NextState} = trie_next_(Trie, State, Tok),
Varying =
case trie_next_(Trie, State, Tok) of
{NChildren, _, NextState} when is_integer(NChildren), NChildren >= Threshold ->
case IsWildcard of
_ when is_integer(NChildren), NChildren >= Threshold ->
%% Number of children for the trie node reached the
%% threshold, we need to insert wildcard here.
{_, _WildcardState} = trie_insert(Trie, State, ?PLUS),
Varying0;
{_, false, NextState} ->
false ->
Varying0;
{_, true, NextState} ->
true ->
%% This topic level is marked as wildcard in the trie,
%% we need to add it to the varying part of the key:
[Tok | Varying0]
end,
do_topic_key(Trie, ThresholdFun, Depth + 1, NextState, Rest, Varying).
TokOrWildcard =
case IsWildcard of
true -> ?PLUS;
false -> Tok
end,
do_topic_key(Trie, ThresholdFun, Depth + 1, NextState, Rest, [TokOrWildcard | Tokens], Varying).
%% @doc Has side effects! Inserts missing elements
-spec trie_next_(trie(), state(), binary() | ?EOT) -> {New, Wildcard, state()} when
@ -450,12 +583,51 @@ follow_path(#trie{} = T, State, Path) ->
all_emanating(T, State)
).
contains_wildcard([{{_State, ?PLUS}, _Next} | _Rest]) ->
true;
contains_wildcard([_ | Rest]) ->
contains_wildcard(Rest);
contains_wildcard([]) ->
false.
tokens_of_path([{{_State, Token}, _Next} | Rest]) ->
[Token | tokens_of_path(Rest)];
tokens_of_path([]) ->
[].
%% Wrapper for type checking only:
-compile({inline, ets_insert_new/2}).
-spec ets_insert_new(ets:tid(), trans()) -> boolean().
ets_insert_new(Tid, Trans) ->
ets:insert_new(Tid, Trans).
compress_topic(_StaticKey, [], [], Acc) ->
lists:reverse(Acc);
compress_topic(StaticKey, TStructL0, ['#'], Acc) ->
case TStructL0 of
[] ->
lists:reverse(Acc);
['+' | TStructL] ->
compress_topic(StaticKey, TStructL, ['#'], ['+' | Acc]);
[_ | TStructL] ->
compress_topic(StaticKey, TStructL, ['#'], Acc)
end;
compress_topic(StaticKey, ['+' | TStructL], [Level | TopicL], Acc) ->
compress_topic(StaticKey, TStructL, TopicL, [Level | Acc]);
compress_topic(StaticKey, [Struct | TStructL], [Level | TopicL], Acc) when
Level =:= '+'; Level =:= Struct
->
compress_topic(StaticKey, TStructL, TopicL, Acc);
compress_topic(StaticKey, TStructL, TopicL, _Acc) ->
%% Topic is mismatched with the structure. This should never
%% happen. LTS got corrupted?
Err = #{
msg => 'Topic structure mismatch',
static_key => StaticKey,
input => TopicL,
structure => TStructL
},
throw({unrecoverable, Err}).
decompress_topic(['+' | TStructL], [Level | TopicL], Acc) ->
decompress_topic(TStructL, TopicL, [Level | Acc]);
decompress_topic([StaticLevel | TStructL], TopicL, Acc) ->
decompress_topic(TStructL, TopicL, [StaticLevel | Acc]);
decompress_topic([], [], Acc) ->
lists:reverse(Acc).
%%================================================================================
%% Tests
@ -658,6 +830,76 @@ topic_match_test() ->
dump_to_dot(T, filename:join("_build", atom_to_list(?FUNCTION_NAME) ++ ".dot"))
end.
%% erlfmt-ignore
rlookup_test() ->
T = trie_create(#{reverse_lookups => true}),
Threshold = 2,
ThresholdFun = fun(0) -> 1000;
(_) -> Threshold
end,
{S1, []} = test_key(T, ThresholdFun, [1]),
{S11, []} = test_key(T, ThresholdFun, [1, 1]),
{S12, []} = test_key(T, ThresholdFun, [1, 2]),
{S111, []} = test_key(T, ThresholdFun, [1, 1, 1]),
{S11e, []} = test_key(T, ThresholdFun, [1, 1, '']),
%% Now add learned wildcards:
{S21, []} = test_key(T, ThresholdFun, [2, 1]),
{S22, []} = test_key(T, ThresholdFun, [2, 2]),
{S2_, [<<"3">>]} = test_key(T, ThresholdFun, [2, 3]),
{S2_11, [<<"3">>]} = test_key(T, ThresholdFun, [2, 3, 1, 1]),
{S2_12, [<<"4">>]} = test_key(T, ThresholdFun, [2, 4, 1, 2]),
{S2_1_, [<<"3">>, <<"3">>]} = test_key(T, ThresholdFun, [2, 3, 1, 3]),
%% Check reverse matching:
?assertEqual({ok, [<<"1">>]}, reverse_lookup(T, S1)),
?assertEqual({ok, [<<"1">>, <<"1">>]}, reverse_lookup(T, S11)),
?assertEqual({ok, [<<"1">>, <<"2">>]}, reverse_lookup(T, S12)),
?assertEqual({ok, [<<"1">>, <<"1">>, <<"1">>]}, reverse_lookup(T, S111)),
?assertEqual({ok, [<<"1">>, <<"1">>, '']}, reverse_lookup(T, S11e)),
?assertEqual({ok, [<<"2">>, <<"1">>]}, reverse_lookup(T, S21)),
?assertEqual({ok, [<<"2">>, <<"2">>]}, reverse_lookup(T, S22)),
?assertEqual({ok, [<<"2">>, '+']}, reverse_lookup(T, S2_)),
?assertEqual({ok, [<<"2">>, '+', <<"1">>, <<"1">>]}, reverse_lookup(T, S2_11)),
?assertEqual({ok, [<<"2">>, '+', <<"1">>, <<"2">>]}, reverse_lookup(T, S2_12)),
?assertEqual({ok, [<<"2">>, '+', <<"1">>, '+']}, reverse_lookup(T, S2_1_)),
%% Dump and restore trie to make sure rlookup still works:
T1 = trie_restore(#{reverse_lookups => true}, trie_dump(T, all)),
destroy(T),
?assertEqual({ok, [<<"2">>, <<"1">>]}, reverse_lookup(T1, S21)),
?assertEqual({ok, [<<"2">>, '+', <<"1">>, '+']}, reverse_lookup(T1, S2_1_)).
n_topics_test() ->
Threshold = 3,
ThresholdFun = fun
(0) -> 1000;
(_) -> Threshold
end,
T = trie_create(#{reverse_lookups => true}),
?assertEqual(0, info(T, topics)),
{S11, []} = test_key(T, ThresholdFun, [1, 1]),
{S11, []} = test_key(T, ThresholdFun, [1, 1]),
?assertEqual(1, info(T, topics)),
{S12, []} = test_key(T, ThresholdFun, [1, 2]),
{S12, []} = test_key(T, ThresholdFun, [1, 2]),
?assertEqual(2, info(T, topics)),
{_S13, []} = test_key(T, ThresholdFun, [1, 3]),
?assertEqual(3, info(T, topics)),
{S1_, [_]} = test_key(T, ThresholdFun, [1, 4]),
?assertEqual(4, info(T, topics)),
{S1_, [_]} = test_key(T, ThresholdFun, [1, 5]),
{S1_, [_]} = test_key(T, ThresholdFun, [1, 6]),
{S1_, [_]} = test_key(T, ThresholdFun, [1, 7]),
?assertEqual(4, info(T, topics)),
?assertMatch(
[{size, N}, {topics, 4}] when is_integer(N),
info(T)
).
-define(keys_history, topic_key_history).
%% erlfmt-ignore
@ -773,11 +1015,16 @@ paths_test() ->
),
%% Test filter function for paths containing wildcards
WildcardPaths = lists:filter(fun contains_wildcard/1, Paths),
WildcardPaths = lists:filter(
fun(Path) ->
lists:member(?PLUS, tokens_of_path(Path))
end,
Paths
),
FormattedWildcardPaths = lists:map(fun format_path/1, WildcardPaths),
?assertEqual(
sets:from_list(FormattedWildcardPaths, [{version, 2}]),
sets:from_list(lists:map(FormatPathSpec, ExpectedWildcardPaths), [{version, 2}]),
sets:from_list(FormattedWildcardPaths, [{version, 2}]),
#{
expected => ExpectedWildcardPaths,
wildcards => FormattedWildcardPaths
@ -795,13 +1042,97 @@ paths_test() ->
#trie{trie = Tab2} = T2,
Dump1 = sets:from_list(ets:tab2list(Tab1), [{version, 2}]),
Dump2 = sets:from_list(ets:tab2list(Tab2), [{version, 2}]),
?assertEqual(Dump1, Dump2),
ok.
?assertEqual(Dump1, Dump2).
format_path([{{_State, Edge}, _Next} | Rest]) ->
[Edge | format_path(Rest)];
format_path([]) ->
[].
compress_topic_test() ->
%% Structure without wildcards:
?assertEqual([], compress_topic(42, [], [])),
?assertEqual([], compress_topic(42, [<<"foo">>, <<"bar">>], [<<"foo">>, <<"bar">>])),
?assertEqual([], compress_topic(42, [<<"foo">>, ''], [<<"foo">>, ''])),
?assertEqual([], compress_topic(42, [<<"foo">>, ''], [<<"foo">>, '+'])),
?assertEqual([], compress_topic(42, [<<"foo">>, ''], ['+', '+'])),
?assertEqual([], compress_topic(42, [<<"foo">>, <<"bar">>, ''], ['#'])),
?assertEqual([], compress_topic(42, [<<"foo">>, <<"bar">>, ''], [<<"foo">>, <<"bar">>, '#'])),
?assertEqual([], compress_topic(42, [<<"foo">>, <<"bar">>, ''], ['+', '#'])),
?assertEqual(
[], compress_topic(42, [<<"foo">>, <<"bar">>, ''], [<<"foo">>, <<"bar">>, '', '#'])
),
%% With wildcards:
?assertEqual(
[<<"1">>], compress_topic(42, [<<"foo">>, '+', <<"bar">>], [<<"foo">>, <<"1">>, <<"bar">>])
),
?assertEqual(
[<<"1">>, <<"2">>],
compress_topic(
42,
[<<"foo">>, '+', <<"bar">>, '+', <<"baz">>],
[<<"foo">>, <<"1">>, <<"bar">>, <<"2">>, <<"baz">>]
)
),
?assertEqual(
['+', <<"2">>],
compress_topic(
42,
[<<"foo">>, '+', <<"bar">>, '+', <<"baz">>],
[<<"foo">>, '+', <<"bar">>, <<"2">>, <<"baz">>]
)
),
?assertEqual(
['+', '+'],
compress_topic(
42,
[<<"foo">>, '+', <<"bar">>, '+', <<"baz">>],
[<<"foo">>, '+', <<"bar">>, '+', <<"baz">>]
)
),
?assertEqual(
['+', '+'],
compress_topic(
42,
[<<"foo">>, '+', <<"bar">>, '+', <<"baz">>],
['#']
)
),
?assertEqual(
['+', '+'],
compress_topic(
42,
[<<"foo">>, '+', <<"bar">>, '+', <<"baz">>],
[<<"foo">>, '+', '+', '#']
)
),
%% Mismatch:
?assertException(_, {unrecoverable, _}, compress_topic(42, [<<"foo">>], [<<"bar">>])),
?assertException(_, {unrecoverable, _}, compress_topic(42, [], [<<"bar">>])),
?assertException(_, {unrecoverable, _}, compress_topic(42, [<<"foo">>], [])),
?assertException(_, {unrecoverable, _}, compress_topic(42, ['', ''], ['', '', ''])),
?assertException(_, {unrecoverable, _}, compress_topic(42, ['', ''], [<<"foo">>, '#'])),
?assertException(_, {unrecoverable, _}, compress_topic(42, ['', ''], ['+', '+', '+', '#'])),
?assertException(_, {unrecoverable, _}, compress_topic(42, ['+'], [<<"bar">>, '+'])),
?assertException(
_, {unrecoverable, _}, compress_topic(42, [<<"foo">>, '+'], [<<"bar">>, <<"baz">>])
).
decompress_topic_test() ->
%% Structure without wildcards:
?assertEqual([], decompress_topic([], [])),
?assertEqual(
[<<"foo">>, '', <<"bar">>],
decompress_topic([<<"foo">>, '', <<"bar">>], [])
),
%% With wildcards:
?assertEqual(
[<<"foo">>, '', <<"bar">>, <<"baz">>],
decompress_topic([<<"foo">>, '+', <<"bar">>, '+'], ['', <<"baz">>])
),
?assertEqual(
[<<"foo">>, '+', <<"bar">>, '+', ''],
decompress_topic([<<"foo">>, '+', <<"bar">>, '+', ''], ['+', '+'])
).
-endif.

View File

@ -0,0 +1,515 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
%% @doc This utility module provides a generic method for encoding
%% (and decoding) MQTT messages at rest.
%%
%% Note to developer: backward compatibility has to be maintained at
%% all times, for all releases.
-module(emqx_ds_msg_serializer).
%% API:
-export([serialize/2, deserialize/2, check_schema/1]).
%% internal exports:
-export([]).
-include_lib("emqx_utils/include/emqx_message.hrl").
-include_lib("typerefl/include/types.hrl").
-include("../gen_src/DurableMessage.hrl").
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-endif.
-elvis([{elvis_style, atom_naming_convention, disable}]).
-dialyzer({nowarn_function, [serialize_asn1/1, deserialize_asn1/1]}).
%%================================================================================
%% Type declarations
%%================================================================================
%% FIXME: Proper reflection fails the dialyzer check due to a wrong
%% spec in typerefl
-type schema() :: term().
-reflect_type([schema/0]).
%%================================================================================
%% API functions
%%================================================================================
-spec check_schema(schema()) -> ok | {error, _}.
check_schema(v1) ->
ok;
check_schema(asn1) ->
ok;
check_schema(_) ->
{error, "Unknown schema type"}.
-spec serialize(schema(), emqx_types:message()) -> binary().
serialize(v1, Msg) ->
serialize_v1(Msg);
serialize(asn1, Msg) ->
serialize_asn1(Msg).
-spec deserialize(schema(), binary()) -> emqx_types:message().
deserialize(v1, Blob) ->
deserialize_v1(Blob);
deserialize(asn1, Blob) ->
deserialize_asn1(Blob).
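%% Illustrative usage sketch (not part of the original module): both
%% schemas are meant to be round-trip safe, so, assuming `Msg' is a
%% #message{} record,
%%   Blob = emqx_ds_msg_serializer:serialize(asn1, Msg),
%%   emqx_ds_msg_serializer:deserialize(asn1, Blob)
%% should return a message equivalent to `Msg' (modulo the canonical
%% form used by the eunit tests at the end of this module).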
%%================================================================================
%% Internal functions
%%================================================================================
%%--------------------------------------------------------------------------------
%% V1 (erlang:term_to_binary/binary_to_term). Simple, but not the most
%% space- and CPU-efficient encoding.
%% --------------------------------------------------------------------------------
serialize_v1(Msg) ->
term_to_binary(message_to_value_v1(Msg)).
message_to_value_v1(#message{
id = Id,
qos = Qos,
from = From,
flags = Flags,
headers = Headers,
topic = Topic,
payload = Payload,
timestamp = Timestamp,
extra = Extra
}) ->
{Id, Qos, From, Flags, Headers, Topic, Payload, Timestamp, Extra}.
deserialize_v1(Blob) ->
value_v1_to_message(binary_to_term(Blob)).
value_v1_to_message({Id, Qos, From, Flags, Headers, Topic, Payload, Timestamp, Extra}) ->
#message{
id = Id,
qos = Qos,
from = From,
flags = Flags,
headers = Headers,
topic = Topic,
payload = Payload,
timestamp = Timestamp,
extra = Extra
}.
%%--------------------------------------------------------------------------------
%% Encoding based on ASN1.
%%--------------------------------------------------------------------------------
serialize_asn1(#message{
id = Id,
qos = Qos,
from = From0,
flags = Flags,
topic = Topic,
payload = Payload,
timestamp = Timestamp,
headers = Headers
}) ->
MiscFlags = maps:fold(
fun
(Key, Val, Acc) when Key =/= sys, Key =/= dup, Key =/= retain ->
[asn1_encode_misc(flag, Key, Val) | Acc];
(_, _, Acc) ->
Acc
end,
[],
Flags
),
{StdHeaders, StdProps, MiscHeaders} = asn1_encode_headers(Headers),
{ok, Bin} = 'DurableMessage':encode('DurableMessage', #'DurableMessage'{
id = Id,
from =
case is_atom(From0) of
true -> {atom, erlang:atom_to_binary(From0, utf8)};
false -> {binary, From0}
end,
topic = Topic,
payload = iolist_to_binary(Payload),
timestamp = Timestamp,
qos = Qos,
sys = maps:get(sys, Flags, false),
dup = maps:get(dup, Flags, false),
retain = maps:get(retain, Flags, false),
properties = StdProps,
headers = StdHeaders,
%% TODO: store client attrs?
misc = MiscFlags ++ MiscHeaders
}),
Bin.
deserialize_asn1(Blob) ->
{ok, #'DurableMessage'{
id = Id,
from = From0,
topic = Topic,
payload = Payload,
timestamp = Timestamp,
qos = Qos,
sys = Sys,
dup = Dup,
retain = Retain,
headers = StdHeaders,
properties = StdProperties,
misc = Misc
}} = 'DurableMessage':decode('DurableMessage', Blob),
From =
case From0 of
{atom, Bin} -> erlang:binary_to_atom(Bin, utf8);
{binary, Bin} -> Bin
end,
%% Decode flags:
Flags = #{sys => Sys, dup => Dup, retain => Retain},
asn1_deserialize_misc(Misc, #message{
id = Id,
qos = Qos,
from = From,
topic = Topic,
payload = Payload,
timestamp = Timestamp,
flags = Flags,
headers = asn1_decode_headers(StdHeaders, StdProperties)
}).
asn1_encode_headers(Headers) ->
PeerName =
case Headers of
#{peername := {IP1, Port}} -> encode_ip_port(16, IP1, Port);
_ -> asn1_NOVALUE
end,
PeerHost =
case Headers of
#{peerhost := IP2} -> encode_ip_port(0, IP2, 0);
_ -> asn1_NOVALUE
end,
ProtoVer = asn1_encode_proto_ver(Headers),
StdHeaders = #'StdHeaders'{
protoVer = ProtoVer,
peername = PeerName,
peerhost = PeerHost,
username =
case Headers of
#{username := U} when is_binary(U) -> U;
_ -> asn1_NOVALUE
end
},
{StdProps, MiscProps} = asn1_encode_properties(maps:get(properties, Headers, #{})),
MiscHeaders = maps:fold(
fun
(Header, _V, Acc) when
Header =:= properties; Header =:= username; Header =:= client_attrs
->
Acc;
(protocol, _V, Acc) when ProtoVer =/= asn1_NOVALUE ->
Acc;
(proto_ver, _V, Acc) when ProtoVer =/= asn1_NOVALUE ->
Acc;
(peername, _V, Acc) when PeerName =/= asn1_NOVALUE ->
Acc;
(peerhost, _V, Acc) when PeerHost =/= asn1_NOVALUE ->
Acc;
%% Add headers that could not be encoded using fixed schema:
(Key, Val, Acc) ->
[asn1_encode_misc(header, Key, Val) | Acc]
end,
[],
Headers
),
{StdHeaders, StdProps, MiscHeaders ++ MiscProps}.
asn1_encode_properties(Props) ->
UserProps = maps:get('User-Property', Props, []),
StdProperties = #'StdProperties'{
payloadFormatIndicator = asn1_std_prop('Payload-Format-Indicator', Props),
messageExpiryInterval = asn1_std_prop('Message-Expiry-Interval', Props),
responseTopic = asn1_std_prop('Response-Topic', Props),
correlationData = asn1_std_prop('Correlation-Data', Props),
contentType = asn1_std_prop('Content-Type', Props),
userProperty = [#'UserProperty'{key = K, value = V} || {K, V} <- UserProps]
},
MiscProperties = maps:fold(
fun
(K, V, Acc) when
K =/= 'Payload-Format-Indicator',
K =/= 'Message-Expiry-Interval',
K =/= 'Response-Topic',
K =/= 'Correlation-Data',
K =/= 'Content-Type',
K =/= 'User-Property'
->
[asn1_encode_misc(property, K, V) | Acc];
(_, _, Acc) ->
Acc
end,
[],
Props
),
{StdProperties, MiscProperties}.
asn1_encode_misc(header, Key, Val) ->
{header, #'MiscProperty'{
key = term_to_binary(Key), value = term_to_binary(Val)
}};
asn1_encode_misc(property, Key, Val) ->
{property, #'MiscProperty'{
key = term_to_binary(Key), value = term_to_binary(Val)
}};
asn1_encode_misc(flag, Key, Val) ->
{flag, #'MiscFlag'{
key = atom_to_binary(Key, utf8), value = Val
}}.
asn1_std_prop(Key, Map) ->
case Map of
#{Key := Val} -> Val;
_ -> asn1_NOVALUE
end.
asn1_decode_headers(
#'StdHeaders'{
protoVer = ProtoVer, peerhost = Peerhost, peername = Peername, username = Username
},
StdProperties
) ->
M0 = asn1_decode_properties(StdProperties),
M1 =
case ProtoVer of
asn1_NOVALUE -> M0;
{Protocol, Ver} -> M0#{protocol => Protocol, proto_ver => Ver}
end,
M2 = asn1_add_optional(peername, decode_ip_port(16, Peername), M1),
M3 =
case decode_ip_port(0, Peerhost) of
asn1_NOVALUE -> M2;
{PeerIP, _} -> M2#{peerhost => PeerIP}
end,
asn1_add_optional(username, Username, M3).
asn1_decode_properties(#'StdProperties'{
payloadFormatIndicator = PFI,
userProperty = UP,
messageExpiryInterval = MEI,
responseTopic = RT,
correlationData = CD,
contentType = CT
}) ->
M0 =
case [{K, V} || #'UserProperty'{key = K, value = V} <- UP] of
[] -> #{};
UserProps -> #{'User-Property' => UserProps}
end,
M1 = asn1_add_optional('Payload-Format-Indicator', PFI, M0),
M2 = asn1_add_optional('Message-Expiry-Interval', MEI, M1),
M3 = asn1_add_optional('Response-Topic', RT, M2),
M4 = asn1_add_optional('Correlation-Data', CD, M3),
M5 = asn1_add_optional('Content-Type', CT, M4),
case maps:size(M5) of
0 -> #{};
_ -> #{properties => M5}
end.
asn1_add_optional(_Key, asn1_NOVALUE, Acc) -> Acc;
asn1_add_optional(Key, Val, Acc) -> maps:put(Key, Val, Acc).
-define(IS_VER(V), is_integer(V), V >= 0, V =< 255).
asn1_encode_proto_ver(#{protocol := mqtt, proto_ver := V}) when ?IS_VER(V) ->
{mqtt, V};
asn1_encode_proto_ver(#{protocol := 'mqtt-sn', proto_ver := V}) when ?IS_VER(V) ->
{'mqtt-sn', V};
asn1_encode_proto_ver(#{protocol := coap, proto_ver := V}) when ?IS_VER(V) ->
{coap, V};
asn1_encode_proto_ver(_) ->
asn1_NOVALUE.
-undef(IS_VER).
asn1_deserialize_misc(asn1_NOVALUE, Message) ->
Message;
asn1_deserialize_misc(MiscData, Message0) ->
lists:foldl(
fun
({flag, #'MiscFlag'{key = Key, value = Val}}, Acc) ->
Flags = maps:put(binary_to_atom(Key, utf8), Val, Acc#message.flags),
Acc#message{flags = Flags};
({header, #'MiscProperty'{key = Key, value = Val}}, Acc) ->
Headers = maps:put(binary_to_term(Key), binary_to_term(Val), Acc#message.headers),
Acc#message{headers = Headers};
({property, #'MiscProperty'{key = Key, value = Val}}, Acc) ->
#message{headers = Headers0} = Acc,
Headers = maps:update_with(
properties,
fun(Props) ->
maps:put(binary_to_term(Key), binary_to_term(Val), Props)
end,
Headers0
),
Acc#message{headers = Headers};
({clientAttr, #'ClientAttr'{key = Key, value = Val}}, Acc) ->
#message{headers = Headers0} = Acc,
Headers = maps:update_with(
client_attrs,
fun(Props) ->
maps:put(Key, Val, Props)
end,
Headers0
),
Acc#message{headers = Headers};
({extra, #'MiscProperty'{key = Key, value = Val}}, Acc) ->
Extra = maps:put(binary_to_term(Key), binary_to_term(Val), Acc#message.extra),
Acc#message{extra = Extra}
end,
Message0,
MiscData
).
encode_ip_port(PortSize, {A0, A1, A2, A3}, Port) ->
<<A0:8, A1:8, A2:8, A3:8, Port:PortSize>>;
encode_ip_port(PortSize, {A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, AA, AB, AC, AD, AE, AF}, Port) ->
<<A0:8, A1:8, A2:8, A3:8, A4:8, A5:8, A6:8, A7:8, A8:8, A9:8, AA:8, AB:8, AC:8, AD:8, AE:8,
AF:8, Port:PortSize>>;
encode_ip_port(_, _, _) ->
asn1_NOVALUE.
decode_ip_port(PortSize, Blob) ->
case Blob of
<<A0:8, A1:8, A2:8, A3:8, Port:PortSize>> ->
{{A0, A1, A2, A3}, Port};
<<A0:8, A1:8, A2:8, A3:8, A4:8, A5:8, A6:8, A7:8, A8:8, A9:8, AA:8, AB:8, AC:8, AD:8, AE:8,
AF:8, Port:PortSize>> ->
{{A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, AA, AB, AC, AD, AE, AF}, Port};
_ ->
asn1_NOVALUE
end.
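%% Worked example (illustrative, not part of the original module):
%%   encode_ip_port(16, {127, 0, 0, 1}, 34560) =:= <<127, 0, 0, 1, 135, 0>>
%% since 34560 = 135 * 256, and
%%   decode_ip_port(16, <<127, 0, 0, 1, 135, 0>>) =:= {{127, 0, 0, 1}, 34560}.
%% IPv6 addresses take the 16-byte clause; anything else encodes to
%% asn1_NOVALUE.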
-ifdef(TEST).
test_messages() ->
[
#message{
id = <<"message_id_val">>,
qos = 2,
from = <<"from_val">>,
flags = #{sys => true, dup => true},
topic = <<"topic/value">>,
payload = [<<"foo">>, <<"bar">>],
timestamp = 42424242,
extra = #{}
},
#message{
id = <<0, 6, 28, 54, 12, 158, 221, 191, 244, 69, 0, 0, 13, 214, 0, 3>>,
qos = 0,
from = <<"MzE3MjU5NzA4NDY3MzcwNzg0NDYxNzI5NDg0NDk4NTM0NDA">>,
flags = #{dup => true, retain => true, sys => true},
headers = #{
peername => {{127, 0, 0, 1}, 34560},
protocol => mqtt,
username => <<"foobar">>,
proto_ver => 5,
peerhost => {1, 1, 1, 1},
properties =>
#{
'Content-Type' => <<"text/json">>,
'User-Property' => [{<<"foo">>, <<"bar">>}, {<<"baz">>, <<"quux">>}],
'Message-Expiry-Interval' => 10001,
'Payload-Format-Indicator' => 1
}
},
topic = <<"foo/bar">>,
payload = <<"foo">>,
timestamp = 1719868325813,
extra = #{}
},
#message{
id = <<>>,
from = undefined,
flags = #{other_flag => true},
headers = #{
properties =>
#{
'Payload-Format-Indicator' => 1,
'Message-Expiry-Interval' => 1 bsl 32 - 1,
'Response-Topic' => <<"foo/bar/baz">>,
'Correlation-Data' => <<"correlation data">>,
'Content-Type' => <<"text/json">>,
'User-Property' => [{<<"foo">>, <<"bar">>}, {<<"baz">>, <<"quux">>}],
junk => garbage,
{34, 33, 2} => more_garbage
},
junk => garbage
},
topic = <<"foo/bar">>,
payload = <<"foo">>,
timestamp = 171986,
extra = #{}
},
#message{
id = <<>>,
from = undefined,
headers = #{
protocol => "some_protocol",
proto_ver => 42,
peername => "some.fancy.peername:222",
peerhost => "some.fancy.peerhost"
},
topic = <<"foo/bar">>,
payload = <<"foo">>,
timestamp = 171986,
extra = #{}
}
].
v1_serialize_deserialize_test_() ->
[
assert_transcode(v1, Msg)
|| Msg <- test_messages()
].
asn1_serialize_deserialize_test_() ->
[
assert_transcode(asn1, Msg)
|| Msg <- test_messages()
].
assert_transcode(Schema, Msg) ->
fun() ->
Blob = serialize(Schema, Msg),
?debugFmt("encoded size (~p) = ~p~n", [Schema, size(Blob)]),
assert_eq(Msg, deserialize(Schema, Blob))
end.
assert_eq(Expect, Got) ->
?assertEqual(
emqx_ds_test_helpers:message_canonical_form(Expect),
emqx_ds_test_helpers:message_canonical_form(Got),
{Expect, Got}
).
-endif.

View File

@ -36,7 +36,7 @@
make_delete_iterator/5,
update_iterator/4,
next/6,
delete_next/6,
delete_next/7,
handle_event/4
]).
@ -156,7 +156,9 @@
-define(DIM_TOPIC, 1).
-define(DIM_TS, 2).
-define(DS_LTS_COUNTERS, [?DS_LTS_SEEK_COUNTER, ?DS_LTS_NEXT_COUNTER, ?DS_LTS_COLLISION_COUNTER]).
-define(DS_LTS_COUNTERS, [
?DS_BITFIELD_LTS_SEEK_COUNTER, ?DS_BITFIELD_LTS_NEXT_COUNTER, ?DS_BITFIELD_LTS_COLLISION_COUNTER
]).
%% GVar used for idle detection:
-define(IDLE_DETECT, idle_detect).
@ -196,7 +198,7 @@ create(_ShardId, DBHandle, GenId, Options, SPrev) ->
case SPrev of
#s{trie = TriePrev} ->
ok = copy_previous_trie(DBHandle, TrieCFHandle, TriePrev),
?tp(bitfield_lts_inherited_trie, #{}),
?tp(layout_inherited_lts_trie, #{}),
ok;
undefined ->
ok
@ -495,14 +497,19 @@ next_until(#s{db = DB, data = CF, keymappers = Keymappers}, It, SafeCutoffTime,
rocksdb:iterator_close(ITHandle)
end.
delete_next(Shard, Schema = #s{ts_offset = TSOffset}, It, Selector, BatchSize, Now) ->
delete_next(Shard, Schema = #s{ts_offset = TSOffset}, It, Selector, BatchSize, Now, IsCurrent) ->
%% Compute safe cutoff time.
%% It's the point in time where the last complete epoch ends, so we need to know
%% the current time to compute it.
init_counters(),
SafeCutoffTime = (Now bsr TSOffset) bsl TSOffset,
SafeCutoffTime = ?EPOCH(Schema, Now) bsl TSOffset,
try
delete_next_until(Schema, It, SafeCutoffTime, Selector, BatchSize)
case delete_next_until(Schema, It, SafeCutoffTime, Selector, BatchSize) of
{ok, _It, 0, 0} when not IsCurrent ->
{ok, end_of_stream};
Result ->
Result
end
after
report_counters(Shard)
end.
@ -596,7 +603,7 @@ prepare_loop_context(DB, CF, TopicIndex, StartTime, SafeCutoffTime, Varying, Key
fun
('+') ->
any;
(TopicLevel) when is_binary(TopicLevel) ->
(TopicLevel) when is_binary(TopicLevel); TopicLevel =:= '' ->
{'=', hash_topic_level(TopicLevel)}
end,
Varying
@ -632,7 +639,7 @@ next_loop(ITHandle, KeyMapper, Filter, Cutoff, It0, Acc0, N0) ->
Key1 ->
%% assert
true = Key1 > Key0,
inc_counter(?DS_LTS_SEEK_COUNTER),
inc_counter(?DS_BITFIELD_LTS_SEEK_COUNTER),
case rocksdb:iterator_move(ITHandle, {seek, Key1}) of
{ok, Key, Val} ->
{N, It, Acc} = traverse_interval(
@ -658,7 +665,7 @@ traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, Key, Val, It0, Acc0, N) -
Acc = [{Key, Msg} | Acc0],
traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, It, Acc, N - 1);
false ->
inc_counter(?DS_LTS_COLLISION_COUNTER),
inc_counter(?DS_BITFIELD_LTS_COLLISION_COUNTER),
traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, It, Acc0, N)
end;
overflow ->
@ -670,7 +677,7 @@ traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, Key, Val, It0, Acc0, N) -
traverse_interval(_ITHandle, _KeyMapper, _Filter, _Cutoff, It, Acc, 0) ->
{0, It, Acc};
traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, It, Acc, N) ->
inc_counter(?DS_LTS_NEXT_COUNTER),
inc_counter(?DS_BITFIELD_LTS_NEXT_COUNTER),
case rocksdb:iterator_move(ITHandle, next) of
{ok, Key, Val} ->
traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, Key, Val, It, Acc, N);
@ -690,7 +697,7 @@ delete_next_loop(LoopContext0) ->
iterated_over := AccIter0,
it_handle := ITHandle
} = LoopContext0,
inc_counter(?DS_LTS_SEEK_COUNTER),
inc_counter(?DS_BITFIELD_LTS_SEEK_COUNTER),
#{?tag := ?DELETE_IT, ?last_seen_key := Key0} = It0,
case emqx_ds_bitmask_keymapper:bin_increment(Filter, Key0) of
overflow ->
@ -772,7 +779,7 @@ delete_traverse_interval1(LoopContext0) ->
iterated_over := AccIter,
storage_iter := It
} = LoopContext0,
inc_counter(?DS_LTS_NEXT_COUNTER),
inc_counter(?DS_BITFIELD_LTS_NEXT_COUNTER),
case rocksdb:iterator_move(ITHandle, next) of
{ok, Key, Val} ->
delete_traverse_interval(LoopContext0#{
@ -831,6 +838,8 @@ threshold_fun(0) ->
threshold_fun(_) ->
20.
hash_topic_level('') ->
hash_topic_level(<<>>);
hash_topic_level(TopicLevel) ->
<<Int:64, _/binary>> = erlang:md5(TopicLevel),
Int.
@ -896,7 +905,7 @@ restore_trie(TopicIndexBytes, DB, CF) ->
{ok, IT} = rocksdb:iterator(DB, CF, []),
try
Dump = read_persisted_trie(IT, rocksdb:iterator_move(IT, first)),
TrieOpts = #{persist_callback => PersistCallback, static_key_size => TopicIndexBytes},
TrieOpts = #{persist_callback => PersistCallback, static_key_bits => TopicIndexBytes * 8},
emqx_ds_lts:trie_restore(TrieOpts, Dump)
after
rocksdb:iterator_close(IT)
@ -933,9 +942,11 @@ init_counters() ->
ok.
report_counters(Shard) ->
emqx_ds_builtin_metrics:inc_lts_seek_counter(Shard, get(?DS_LTS_SEEK_COUNTER)),
emqx_ds_builtin_metrics:inc_lts_next_counter(Shard, get(?DS_LTS_NEXT_COUNTER)),
emqx_ds_builtin_metrics:inc_lts_collision_counter(Shard, get(?DS_LTS_COLLISION_COUNTER)),
emqx_ds_builtin_metrics:inc_lts_seek_counter(Shard, get(?DS_BITFIELD_LTS_SEEK_COUNTER)),
emqx_ds_builtin_metrics:inc_lts_next_counter(Shard, get(?DS_BITFIELD_LTS_NEXT_COUNTER)),
emqx_ds_builtin_metrics:inc_lts_collision_counter(
Shard, get(?DS_BITFIELD_LTS_COLLISION_COUNTER)
),
_ = [erase(I) || I <- ?DS_LTS_COUNTERS],
ok.

View File

@ -261,6 +261,11 @@
) ->
[_Stream].
-callback get_delete_streams(
shard_id(), generation_data(), emqx_ds:topic_filter(), emqx_ds:time()
) ->
[_Stream].
-callback make_iterator(
shard_id(), generation_data(), _Stream, emqx_ds:topic_filter(), emqx_ds:time()
) ->
@ -282,9 +287,12 @@
DeleteIterator,
emqx_ds:delete_selector(),
pos_integer(),
emqx_ds:time()
emqx_ds:time(),
_IsCurrentGeneration :: boolean()
) ->
{ok, DeleteIterator, _NDeleted :: non_neg_integer(), _IteratedOver :: non_neg_integer()}.
{ok, DeleteIterator, _NDeleted :: non_neg_integer(), _IteratedOver :: non_neg_integer()}
| {ok, end_of_stream}
| emqx_ds:error(_).
-callback handle_event(shard_id(), generation_data(), emqx_ds:time(), CustomEvent | tick) ->
[CustomEvent].
@ -307,6 +315,8 @@
drop_shard(Shard) ->
ok = rocksdb:destroy(db_dir(Shard), []).
%% @doc This is a convenience wrapper that combines `prepare' and
%% `commit' operations.
-spec store_batch(
shard_id(),
[{emqx_ds:time(), emqx_types:message()}],
@ -323,6 +333,15 @@ store_batch(Shard, Messages, Options) ->
Error
end.
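%% A sketch of the combined flow (illustrative; the real wrapper may
%% differ in error handling):
%%   case prepare_batch(Shard, Messages, Options) of
%%       {ok, CookedBatch} -> commit_batch(Shard, CookedBatch, Options);
%%       ignore -> ok;
%%       Error -> Error
%%   end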
%% @doc Transform a batch of messages into a "cooked batch" that can
%% be stored in the transaction log or transferred over the network.
%%
%% Important: the caller MUST ensure that timestamps within the shard
%% form a strictly increasing monotonic sequence throughout the whole
%% lifetime of the shard.
%%
%% The underlying storage layout MAY use timestamp as a unique message
%% ID.
-spec prepare_batch(
shard_id(),
[{emqx_ds:time(), emqx_types:message()}],
@ -355,6 +374,10 @@ prepare_batch(Shard, Messages = [{Time, _} | _], Options) ->
prepare_batch(_Shard, [], _Options) ->
ignore.
%% @doc Commit cooked batch to the storage.
%%
%% The underlying storage layout must guarantee that this operation is
%% idempotent.
-spec commit_batch(
shard_id(),
cooked_batch(),
@ -511,15 +534,12 @@ delete_next(
) ->
case generation_get(Shard, GenId) of
#{module := Mod, data := GenData} ->
Current = generation_current(Shard),
case Mod:delete_next(Shard, GenData, GenIter0, Selector, BatchSize, Now) of
{ok, _GenIter, _Deleted = 0, _IteratedOver = 0} when GenId < Current ->
%% This is a past generation. Storage layer won't write
%% any more messages here. The iterator reached the end:
%% the stream has been fully replayed.
{ok, end_of_stream};
IsCurrent = GenId =:= generation_current(Shard),
case Mod:delete_next(Shard, GenData, GenIter0, Selector, BatchSize, Now, IsCurrent) of
{ok, GenIter, NumDeleted, _IteratedOver} ->
{ok, Iter#{?enc := GenIter}, NumDeleted};
EOS = {ok, end_of_stream} ->
EOS;
Error = {error, _} ->
Error
end;

View File

@ -39,7 +39,7 @@
make_delete_iterator/5,
update_iterator/4,
next/6,
delete_next/6
delete_next/7
]).
%% internal exports:
@ -169,7 +169,7 @@ next(_Shard, #s{db = DB, cf = CF}, It0, BatchSize, _Now, IsCurrent) ->
{ok, It, lists:reverse(Messages)}
end.
delete_next(_Shard, #s{db = DB, cf = CF}, It0, Selector, BatchSize, _Now) ->
delete_next(_Shard, #s{db = DB, cf = CF}, It0, Selector, BatchSize, _Now, IsCurrent) ->
#delete_it{
topic_filter = TopicFilter,
start_time = StartTime,
@ -198,7 +198,12 @@ delete_next(_Shard, #s{db = DB, cf = CF}, It0, Selector, BatchSize, _Now) ->
),
rocksdb:iterator_close(ITHandle),
It = It0#delete_it{last_seen_message_key = Key},
{ok, It, NumDeleted, NumIterated}.
case IsCurrent of
false when NumDeleted =:= 0, NumIterated =:= 0 ->
{ok, end_of_stream};
_ ->
{ok, It, NumDeleted, NumIterated}
end.
%%================================================================================
%% Internal functions

View File

@ -0,0 +1,749 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
-module(emqx_ds_storage_skipstream_lts).
-behaviour(emqx_ds_storage_layer).
%% API:
-export([]).
%% behavior callbacks:
-export([
create/5,
open/5,
drop/5,
prepare_batch/4,
commit_batch/4,
get_streams/4,
get_delete_streams/4,
make_iterator/5,
make_delete_iterator/5,
update_iterator/4,
next/6,
delete_next/7
]).
%% internal exports:
-export([]).
-export_type([schema/0, s/0]).
-include_lib("emqx_utils/include/emqx_message.hrl").
-include_lib("snabbkaffe/include/trace.hrl").
-include("emqx_ds_metrics.hrl").
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-endif.
-elvis([{elvis_style, nesting_level, disable}]).
%%================================================================================
%% Type declarations
%%================================================================================
%% keys:
-define(cooked_payloads, 6).
-define(cooked_lts_ops, 7).
-define(lts_persist_ops, emqx_ds_storage_skipstream_lts_ops).
%% Width of the wildcard layer, in bits:
-define(wcb, 16).
-type wildcard_idx() :: 0..16#ffff.
%% Width of the timestamp, in bits:
-define(tsb, 64).
-define(max_ts, 16#ffffffffffffffff).
-type ts() :: 0..?max_ts.
-type wildcard_hash() :: binary().
%% Permanent state:
-type schema() ::
#{
wildcard_hash_bytes := pos_integer(),
topic_index_bytes := pos_integer(),
keep_message_id := boolean(),
serialization_schema := emqx_ds_msg_serializer:schema(),
with_guid := boolean()
}.
%% Runtime state:
-record(s, {
db :: rocksdb:db_handle(),
data_cf :: rocksdb:cf_handle(),
trie :: emqx_ds_lts:trie(),
trie_cf :: rocksdb:cf_handle(),
serialization_schema :: emqx_ds_msg_serializer:schema(),
hash_bytes :: pos_integer(),
with_guid :: boolean()
}).
-type s() :: #s{}.
-record(stream, {
static_index :: emqx_ds_lts:static_key()
}).
-record(it, {
static_index :: emqx_ds_lts:static_key(),
%% Minimal timestamp of the next message:
ts :: ts(),
%% Compressed topic filter:
compressed_tf :: binary()
}).
%% Level iterator:
-record(l, {
n :: non_neg_integer(),
handle :: rocksdb:itr_handle(),
hash :: binary()
}).
%%================================================================================
%% API functions
%%================================================================================
%%================================================================================
%% behavior callbacks
%%================================================================================
create(_ShardId, DBHandle, GenId, Schema0, SPrev) ->
Defaults = #{
wildcard_hash_bytes => 8,
topic_index_bytes => 8,
serialization_schema => asn1,
with_guid => false
},
Schema = maps:merge(Defaults, Schema0),
ok = emqx_ds_msg_serializer:check_schema(maps:get(serialization_schema, Schema)),
DataCFName = data_cf(GenId),
TrieCFName = trie_cf(GenId),
{ok, DataCFHandle} = rocksdb:create_column_family(DBHandle, DataCFName, []),
{ok, TrieCFHandle} = rocksdb:create_column_family(DBHandle, TrieCFName, []),
case SPrev of
#s{trie = TriePrev} ->
ok = copy_previous_trie(DBHandle, TrieCFHandle, TriePrev),
?tp(layout_inherited_lts_trie, #{}),
ok;
undefined ->
ok
end,
{Schema, [{DataCFName, DataCFHandle}, {TrieCFName, TrieCFHandle}]}.
open(_Shard, DBHandle, GenId, CFRefs, #{
topic_index_bytes := TIBytes,
wildcard_hash_bytes := WCBytes,
serialization_schema := SSchema,
with_guid := WithGuid
}) ->
{_, DataCF} = lists:keyfind(data_cf(GenId), 1, CFRefs),
{_, TrieCF} = lists:keyfind(trie_cf(GenId), 1, CFRefs),
Trie = restore_trie(TIBytes, DBHandle, TrieCF),
#s{
db = DBHandle,
data_cf = DataCF,
trie_cf = TrieCF,
trie = Trie,
hash_bytes = WCBytes,
serialization_schema = SSchema,
with_guid = WithGuid
}.
drop(_ShardId, DBHandle, _GenId, _CFRefs, #s{data_cf = DataCF, trie_cf = TrieCF, trie = Trie}) ->
emqx_ds_lts:destroy(Trie),
ok = rocksdb:drop_column_family(DBHandle, DataCF),
ok = rocksdb:drop_column_family(DBHandle, TrieCF),
ok.
prepare_batch(
_ShardId,
S = #s{trie = Trie, hash_bytes = HashBytes},
Messages,
_Options
) ->
_ = erase(?lts_persist_ops),
Payloads =
lists:flatmap(
fun({Timestamp, Msg = #message{topic = Topic}}) ->
Tokens = words(Topic),
{Static, Varying} = emqx_ds_lts:topic_key(Trie, fun threshold_fun/1, Tokens),
%% TODO: is it possible to create the index during the
%% commit phase to avoid transferring indexes through
%% the translog?
[
{mk_key(Static, 0, <<>>, Timestamp), serialize(S, Varying, Msg)}
| mk_index(HashBytes, Static, Timestamp, Varying)
]
end,
Messages
),
{ok, #{
?cooked_payloads => Payloads,
?cooked_lts_ops => pop_lts_persist_ops()
}}.
commit_batch(
_ShardId,
#s{db = DB, trie_cf = TrieCF, data_cf = DataCF, trie = Trie},
#{?cooked_lts_ops := LtsOps, ?cooked_payloads := Payloads},
Options
) ->
{ok, Batch} = rocksdb:batch(),
try
%% Commit LTS trie to the storage:
lists:foreach(
fun({Key, Val}) ->
ok = rocksdb:batch_put(Batch, TrieCF, term_to_binary(Key), term_to_binary(Val))
end,
LtsOps
),
%% Apply LTS ops to the memory cache:
_ = emqx_ds_lts:trie_update(Trie, LtsOps),
%% Commit payloads:
lists:foreach(
fun({Key, Val}) ->
ok = rocksdb:batch_put(Batch, DataCF, Key, Val)
end,
Payloads
),
Result = rocksdb:write_batch(DB, Batch, [
{disable_wal, not maps:get(durable, Options, true)}
]),
%% NOTE
%% Strictly speaking, `{error, incomplete}` is a valid result but should be impossible to
%% observe unless `{no_slowdown, true}` is set in the write options.
case Result of
ok ->
ok;
{error, {error, Reason}} ->
{error, unrecoverable, {rocksdb, Reason}}
end
after
rocksdb:release_batch(Batch)
end.
get_streams(_Shard, #s{trie = Trie}, TopicFilter, _StartTime) ->
get_streams(Trie, TopicFilter).
get_delete_streams(_Shard, #s{trie = Trie}, TopicFilter, _StartTime) ->
get_streams(Trie, TopicFilter).
make_iterator(_Shard, #s{trie = Trie}, #stream{static_index = StaticIdx}, TopicFilter, StartTime) ->
{ok, TopicStructure} = emqx_ds_lts:reverse_lookup(Trie, StaticIdx),
CompressedTF = emqx_ds_lts:compress_topic(StaticIdx, TopicStructure, TopicFilter),
{ok, #it{
static_index = StaticIdx,
ts = StartTime,
compressed_tf = emqx_topic:join(CompressedTF)
}}.
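%% Illustrative example (mirroring compress_topic_test in emqx_ds_lts):
%% if the learned topic structure for StaticIdx is
%% [<<"foo">>, '+', <<"bar">>] and the filter is
%% [<<"foo">>, <<"1">>, <<"bar">>], the compressed topic filter stored
%% in the iterator is just <<"1">>; only the varying levels are kept.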
make_delete_iterator(Shard, Data, Stream, TopicFilter, StartTime) ->
make_iterator(Shard, Data, Stream, TopicFilter, StartTime).
update_iterator(_Shard, _Data, OldIter, DSKey) ->
case match_ds_key(OldIter#it.static_index, DSKey) of
false ->
{error, unrecoverable, "Invalid datastream key"};
TS ->
{ok, OldIter#it{ts = TS}}
end.
next(ShardId = {_DB, Shard}, S, It, BatchSize, TMax, IsCurrent) ->
init_counters(),
Iterators = init_iterators(S, It),
%% ?tp(notice, skipstream_init_iters, #{it => It, its => Iterators}),
try
case next_loop(Shard, S, It, Iterators, BatchSize, TMax) of
{ok, _, []} when not IsCurrent ->
{ok, end_of_stream};
Result ->
Result
end
after
free_iterators(Iterators),
collect_counters(ShardId)
end.
delete_next(Shard, S, It0, Selector, BatchSize, Now, IsCurrent) ->
case next(Shard, S, It0, BatchSize, Now, IsCurrent) of
{ok, It, KVs} ->
batch_delete(S, It, Selector, KVs);
Ret ->
Ret
end.
%%================================================================================
%% Internal exports
%%================================================================================
%%================================================================================
%% Internal functions
%%================================================================================
%% Loop context:
-record(ctx, {
shard,
%% Generation runtime state
s,
%% RocksDB iterators:
iters,
%% Cached topic structure for the static index:
topic_structure,
%% Maximum time:
tmax,
%% Compressed topic filter, split into words:
filter
}).
get_streams(Trie, TopicFilter) ->
lists:map(
fun({Static, _Varying}) ->
#stream{static_index = Static}
end,
emqx_ds_lts:match_topics(Trie, TopicFilter)
).
%%%%%%%% Value (de)serialization %%%%%%%%%%
serialize(#s{serialization_schema = SSchema, with_guid = WithGuid}, Varying, Msg0) ->
%% Replace original topic with the varying parts:
Msg = Msg0#message{
id =
case WithGuid of
true -> Msg0#message.id;
false -> <<>>
end,
topic = emqx_topic:join(Varying)
},
emqx_ds_msg_serializer:serialize(SSchema, Msg).
enrich(
#ctx{shard = Shard, topic_structure = Structure, s = #s{with_guid = WithGuid}},
DSKey,
Msg0
) ->
Topic = emqx_topic:join(emqx_ds_lts:decompress_topic(Structure, words(Msg0#message.topic))),
Msg0#message{
topic = Topic,
id =
case WithGuid of
true -> Msg0#message.id;
false -> fake_guid(Shard, DSKey)
end
}.
deserialize(
#s{serialization_schema = SSchema},
Blob
) ->
emqx_ds_msg_serializer:deserialize(SSchema, Blob).
fake_guid(_Shard, DSKey) ->
%% Both guid and MD5 are 16 bytes:
crypto:hash(md5, DSKey).
%%%%%%%% Deletion %%%%%%%%%%
batch_delete(#s{hash_bytes = HashBytes, db = DB, data_cf = CF}, It, Selector, KVs) ->
#it{static_index = Static, compressed_tf = CompressedTF} = It,
{Indices, _} = lists:foldl(
fun
('+', {Acc, WildcardIdx}) ->
{Acc, WildcardIdx + 1};
(LevelFilter, {Acc0, WildcardIdx}) ->
Acc = [{WildcardIdx, hash(HashBytes, LevelFilter)} | Acc0],
{Acc, WildcardIdx + 1}
end,
{[], 1},
words(CompressedTF)
),
KeyFamily = [{0, <<>>} | Indices],
{ok, Batch} = rocksdb:batch(),
try
Ndeleted = lists:foldl(
fun({MsgKey, Val}, Acc) ->
case Selector(Val) of
true ->
do_delete(CF, Batch, Static, KeyFamily, MsgKey),
Acc + 1;
false ->
Acc
end
end,
0,
KVs
),
case rocksdb:write_batch(DB, Batch, []) of
ok ->
{ok, It, Ndeleted, length(KVs)};
{error, {error, Reason}} ->
{error, unrecoverable, {rocksdb, Reason}}
end
after
rocksdb:release_batch(Batch)
end.
do_delete(CF, Batch, Static, KeyFamily, MsgKey) ->
TS = match_ds_key(Static, MsgKey),
lists:foreach(
fun({WildcardIdx, Hash}) ->
ok = rocksdb:batch_delete(Batch, CF, mk_key(Static, WildcardIdx, Hash, TS))
end,
KeyFamily
).
%%%%%%%% Iteration %%%%%%%%%%
init_iterators(S, #it{static_index = Static, compressed_tf = CompressedTF}) ->
do_init_iterators(S, Static, words(CompressedTF), 1).
do_init_iterators(S, Static, ['+' | TopicFilter], WildcardLevel) ->
%% Ignore wildcard levels in the topic filter:
do_init_iterators(S, Static, TopicFilter, WildcardLevel + 1);
do_init_iterators(S, Static, [Constraint | TopicFilter], WildcardLevel) ->
%% Create iterator for the index stream:
#s{hash_bytes = HashBytes, db = DB, data_cf = DataCF} = S,
Hash = hash(HashBytes, Constraint),
{ok, ItHandle} = rocksdb:iterator(DB, DataCF, get_key_range(Static, WildcardLevel, Hash)),
It = #l{
n = WildcardLevel,
handle = ItHandle,
hash = Hash
},
[It | do_init_iterators(S, Static, TopicFilter, WildcardLevel + 1)];
do_init_iterators(S, Static, [], _WildcardLevel) ->
%% Create an iterator for the data stream:
#s{db = DB, data_cf = DataCF} = S,
Hash = <<>>,
{ok, ItHandle} = rocksdb:iterator(DB, DataCF, get_key_range(Static, 0, Hash)),
[
#l{
n = 0,
handle = ItHandle,
hash = Hash
}
].
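%% Illustrative example: for a compressed topic filter `1/+' this
%% builds two iterators: an index iterator for wildcard level 1
%% bounded to hash(HashBytes, <<"1">>), and the level-0 data-stream
%% iterator; the `+' level does not need an index iterator of its own.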
next_loop(
Shard,
S = #s{trie = Trie},
It = #it{static_index = StaticIdx, ts = TS, compressed_tf = CompressedTF},
Iterators,
BatchSize,
TMax
) ->
TopicStructure =
case emqx_ds_lts:reverse_lookup(Trie, StaticIdx) of
{ok, Rev} ->
Rev;
undefined ->
throw(#{
msg => "LTS trie missing key",
key => StaticIdx
})
end,
Ctx = #ctx{
shard = Shard,
s = S,
iters = Iterators,
topic_structure = TopicStructure,
filter = words(CompressedTF),
tmax = TMax
},
next_loop(Ctx, It, BatchSize, {seek, TS}, []).
next_loop(_Ctx, It, 0, Op, Acc) ->
finalize_loop(It, Op, Acc);
next_loop(Ctx, It0, BatchSize, Op, Acc) ->
%% ?tp(notice, skipstream_loop, #{
%% ts => It0#it.ts, tf => It0#it.compressed_tf, bs => BatchSize, tmax => TMax, op => Op
%% }),
#ctx{s = S, tmax = TMax, iters = Iterators} = Ctx,
#it{static_index = StaticIdx, compressed_tf = CompressedTF} = It0,
case next_step(S, StaticIdx, CompressedTF, Iterators, undefined, Op) of
none ->
%% ?tp(notice, skipstream_loop_result, #{r => none}),
inc_counter(?DS_SKIPSTREAM_LTS_EOS),
finalize_loop(It0, Op, Acc);
{seek, TS} when TS > TMax ->
%% ?tp(notice, skipstream_loop_result, #{r => seek_future, ts => TS}),
inc_counter(?DS_SKIPSTREAM_LTS_FUTURE),
finalize_loop(It0, {seek, TS}, Acc);
{ok, TS, _Key, _Msg0} when TS > TMax ->
%% ?tp(notice, skipstream_loop_result, #{r => ok_future, ts => TS, key => _Key}),
inc_counter(?DS_SKIPSTREAM_LTS_FUTURE),
finalize_loop(It0, {seek, TS}, Acc);
{seek, TS} ->
%% ?tp(notice, skipstream_loop_result, #{r => seek, ts => TS}),
It = It0#it{ts = TS},
next_loop(Ctx, It, BatchSize, {seek, TS}, Acc);
{ok, TS, DSKey, Msg0} ->
%% ?tp(notice, skipstream_loop_result, #{r => ok, ts => TS, key => Key}),
Message = enrich(Ctx, DSKey, Msg0),
It = It0#it{ts = TS},
next_loop(Ctx, It, BatchSize - 1, next, [{DSKey, Message} | Acc])
end.
finalize_loop(It0, Op, Acc) ->
case Op of
next -> NextTS = It0#it.ts + 1;
{seek, NextTS} -> ok
end,
It = It0#it{ts = NextTS},
{ok, It, lists:reverse(Acc)}.
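%% Reading aid (a summary inferred from the code below, not part of
%% the original module): `next_step' walks the iterator list in order.
%% Each index iterator is bounded to a single {static index, wildcard
%% level, hash} prefix, so advancing it yields the next candidate
%% timestamp for that level. When a level agrees with the candidate
%% timestamp the search descends to the next iterator; the last one is
%% the data-stream iterator, whose value is deserialized and matched
%% against the compressed topic filter to weed out hash collisions.
%% When a level disagrees, `{seek, NextTS}' is returned and
%% `next_loop' restarts the scan from the first level at the larger
%% timestamp.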
next_step(
S, StaticIdx, CompressedTF, [#l{hash = Hash, handle = IH, n = N} | Iterators], ExpectedTS, Op
) ->
Result =
case Op of
next ->
inc_counter(?DS_SKIPSTREAM_LTS_NEXT),
rocksdb:iterator_move(IH, next);
{seek, TS} ->
inc_counter(?DS_SKIPSTREAM_LTS_SEEK),
rocksdb:iterator_move(IH, {seek, mk_key(StaticIdx, N, Hash, TS)})
end,
case Result of
{error, invalid_iterator} ->
none;
{ok, Key, Blob} ->
case match_key(StaticIdx, N, Hash, Key) of
false ->
%% This should not happen, since we set boundaries
%% to the iterators, and overflow to a different
%% key prefix should be caught by the previous
%% clause:
none;
NextTS when ExpectedTS =:= undefined; NextTS =:= ExpectedTS ->
%% We found a key that corresponds to the
%% timestamp we expect.
%% ?tp(notice, ?MODULE_STRING "_step_hit", #{
%% next_ts => NextTS, expected => ExpectedTS, n => N
%% }),
case Iterators of
[] ->
%% This is the data stream as well. Check the
%% message for hash collisions and return the
%% value:
Msg0 = deserialize(S, Blob),
case emqx_topic:match(Msg0#message.topic, CompressedTF) of
true ->
inc_counter(?DS_SKIPSTREAM_LTS_HIT),
{ok, NextTS, Key, Msg0};
false ->
%% Hash collision. Advance to the
%% next timestamp:
inc_counter(?DS_SKIPSTREAM_LTS_HASH_COLLISION),
{seek, NextTS + 1}
end;
_ ->
%% This is an index stream. Keep going:
next_step(S, StaticIdx, CompressedTF, Iterators, NextTS, {seek, NextTS})
end;
NextTS when NextTS > ExpectedTS, N > 0 ->
%% Next index level is not what we expect. Reset
%% search to the first wildcard index, but continue
%% from `NextTS'.
%%
%% Note: if `NextTS > ExpectedTS' and `N =:= 0',
%% it means the upper (replication) level is
%% broken and supplied us a NextTS that advanced
%% past the point in time that can be safely read.
%% We don't handle it here.
inc_counter(?DS_SKIPSTREAM_LTS_MISS),
{seek, NextTS}
end
end.
free_iterators(Its) ->
lists:foreach(
fun(#l{handle = IH}) ->
ok = rocksdb:iterator_close(IH)
end,
Its
).
%%%%%%%% Indexes %%%%%%%%%%
mk_index(HashBytes, Static, Timestamp, Varying) ->
mk_index(HashBytes, Static, Timestamp, 1, Varying, []).
mk_index(HashBytes, Static, Timestamp, N, [TopicLevel | Varying], Acc) ->
Op = {mk_key(Static, N, hash(HashBytes, TopicLevel), Timestamp), <<>>},
mk_index(HashBytes, Static, Timestamp, N + 1, Varying, [Op | Acc]);
mk_index(_HashBytes, _Static, _Timestamp, _N, [], Acc) ->
Acc.
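%% Illustrative example (hypothetical values): with HashBytes = 8,
%% Static = <<"SSSSSSSS">>, Timestamp = 1 and Varying = [<<"dev1">>, <<"temp">>]
%% the result is, in reverse level order,
%%   [{mk_key(Static, 2, hash(8, <<"temp">>), 1), <<>>},
%%    {mk_key(Static, 1, hash(8, <<"dev1">>), 1), <<>>}],
%% i.e. one empty index entry per varying topic level.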
%%%%%%%% Keys %%%%%%%%%%
get_key_range(StaticIdx, WildcardIdx, Hash) ->
[
{iterate_lower_bound, mk_key(StaticIdx, WildcardIdx, Hash, 0)},
{iterate_upper_bound, mk_key(StaticIdx, WildcardIdx, Hash, ?max_ts)}
].
-spec match_ds_key(emqx_ds_lts:static_key(), binary()) -> ts() | false.
match_ds_key(StaticIdx, Key) ->
match_key(StaticIdx, 0, <<>>, Key).
-spec match_key(emqx_ds_lts:static_key(), wildcard_idx(), wildcard_hash(), binary()) ->
ts() | false.
match_key(StaticIdx, 0, <<>>, Key) ->
TSz = size(StaticIdx),
case Key of
<<StaticIdx:TSz/binary, 0:?wcb, Timestamp:?tsb>> ->
Timestamp;
_ ->
false
end;
match_key(StaticIdx, Idx, Hash, Key) when Idx > 0 ->
Tsz = size(StaticIdx),
Hsz = size(Hash),
case Key of
<<StaticIdx:Tsz/binary, Idx:?wcb, Hash:Hsz/binary, Timestamp:?tsb>> ->
Timestamp;
_ ->
false
end.
-spec mk_key(emqx_ds_lts:static_key(), wildcard_idx(), wildcard_hash(), ts()) -> binary().
mk_key(StaticIdx, 0, <<>>, Timestamp) ->
%% Data stream is identified by wildcard level = 0
<<StaticIdx/binary, 0:?wcb, Timestamp:?tsb>>;
mk_key(StaticIdx, N, Hash, Timestamp) when N > 0 ->
%% Index stream:
<<StaticIdx/binary, N:?wcb, Hash/binary, Timestamp:?tsb>>.
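%% Key layout example (illustrative, assuming the default 8-byte
%% static index and 8-byte wildcard hashes): a data-stream key is
%% <<Static:8/binary, 0:16, Timestamp:64>>, 18 bytes in total, while an
%% index key for wildcard level N is
%% <<Static:8/binary, N:16, Hash:8/binary, Timestamp:64>>, 26 bytes.
%% match_key/4 recovers the timestamp by matching the same layout.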
hash(HashBytes, '') ->
hash(HashBytes, <<>>);
hash(HashBytes, TopicLevel) ->
{Hash, _} = split_binary(erlang:md5(TopicLevel), HashBytes),
Hash.
%%%%%%%% LTS %%%%%%%%%%
%% TODO: don't hardcode the thresholds
threshold_fun(0) ->
100;
threshold_fun(_) ->
10.
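%% Interpretation (an assumption based on how emqx_ds_lts uses the
%% threshold function): the value is the number of distinct children a
%% trie node at the given depth may accumulate before further children
%% are generalized into a '+' wildcard. With the values above, up to
%% 100 distinct first topic levels and up to 10 distinct values per
%% deeper level get their own streams.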
-spec restore_trie(pos_integer(), rocksdb:db_handle(), rocksdb:cf_handle()) -> emqx_ds_lts:trie().
restore_trie(StaticIdxBytes, DB, CF) ->
PersistCallback = fun(Key, Val) ->
push_lts_persist_op(Key, Val),
ok
end,
{ok, IT} = rocksdb:iterator(DB, CF, []),
try
Dump = read_persisted_trie(IT, rocksdb:iterator_move(IT, first)),
TrieOpts = #{
persist_callback => PersistCallback,
static_key_bytes => StaticIdxBytes,
reverse_lookups => true
},
emqx_ds_lts:trie_restore(TrieOpts, Dump)
after
rocksdb:iterator_close(IT)
end.
-spec copy_previous_trie(rocksdb:db_handle(), rocksdb:cf_handle(), emqx_ds_lts:trie()) ->
ok.
copy_previous_trie(DB, TrieCF, TriePrev) ->
{ok, Batch} = rocksdb:batch(),
lists:foreach(
fun({Key, Val}) ->
ok = rocksdb:batch_put(Batch, TrieCF, term_to_binary(Key), term_to_binary(Val))
end,
emqx_ds_lts:trie_dump(TriePrev, wildcard)
),
Result = rocksdb:write_batch(DB, Batch, []),
rocksdb:release_batch(Batch),
Result.
push_lts_persist_op(Key, Val) ->
case erlang:get(?lts_persist_ops) of
undefined ->
erlang:put(?lts_persist_ops, [{Key, Val}]);
L when is_list(L) ->
erlang:put(?lts_persist_ops, [{Key, Val} | L])
end.
pop_lts_persist_ops() ->
case erlang:erase(?lts_persist_ops) of
undefined ->
[];
L when is_list(L) ->
L
end.
read_persisted_trie(IT, {ok, KeyB, ValB}) ->
[
{binary_to_term(KeyB), binary_to_term(ValB)}
| read_persisted_trie(IT, rocksdb:iterator_move(IT, next))
];
read_persisted_trie(_IT, {error, invalid_iterator}) ->
[].
%%%%%%%% Column families %%%%%%%%%%
%% @doc Generate a column family ID for the MQTT messages
-spec data_cf(emqx_ds_storage_layer:gen_id()) -> [char()].
data_cf(GenId) ->
"emqx_ds_storage_skipstream_lts_data" ++ integer_to_list(GenId).
%% @doc Generate a column family ID for the trie
-spec trie_cf(emqx_ds_storage_layer:gen_id()) -> [char()].
trie_cf(GenId) ->
"emqx_ds_storage_skipstream_lts_trie" ++ integer_to_list(GenId).
%%%%%%%% Topic encoding %%%%%%%%%%
words(<<>>) ->
[];
words(Bin) ->
emqx_topic:words(Bin).
%%%%%%%% Counters %%%%%%%%%%
-define(COUNTERS, [
?DS_SKIPSTREAM_LTS_SEEK,
?DS_SKIPSTREAM_LTS_NEXT,
?DS_SKIPSTREAM_LTS_HASH_COLLISION,
?DS_SKIPSTREAM_LTS_HIT,
?DS_SKIPSTREAM_LTS_MISS,
?DS_SKIPSTREAM_LTS_FUTURE,
?DS_SKIPSTREAM_LTS_EOS
]).
inc_counter(Counter) ->
N = get(Counter),
put(Counter, N + 1).
init_counters() ->
_ = [put(I, 0) || I <- ?COUNTERS],
ok.
collect_counters(Shard) ->
lists:foreach(
fun(Key) ->
emqx_ds_builtin_metrics:collect_shard_counter(Shard, Key, get(Key))
end,
?COUNTERS
).

View File

@ -13,7 +13,7 @@
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
-module(emqx_ds_storage_bitfield_lts_SUITE).
-module(emqx_ds_storage_layout_SUITE).
-compile(export_all).
-compile(nowarn_export_all).
@ -23,23 +23,34 @@
-include_lib("snabbkaffe/include/snabbkaffe.hrl").
-include_lib("stdlib/include/assert.hrl").
-define(FUTURE, (1 bsl 64 - 1)).
-define(SHARD, shard(?FUNCTION_NAME)).
-define(DEFAULT_CONFIG, #{
-define(DB_CONFIG(CONFIG), #{
backend => builtin_local,
storage => {emqx_ds_storage_bitfield_lts, #{}},
storage => ?config(layout, CONFIG),
n_shards => 1
}).
-define(COMPACT_CONFIG, #{
backend => builtin_local,
storage =>
{emqx_ds_storage_bitfield_lts, #{
bits_per_wildcard_level => 8
}},
n_shards => 1,
replication_factor => 1
}).
all() ->
[
{group, bitfield_lts},
{group, skipstream_lts}
].
init_per_group(Group, Config) ->
LayoutConf =
case Group of
skipstream_lts ->
{emqx_ds_storage_skipstream_lts, #{with_guid => true}};
bitfield_lts ->
{emqx_ds_storage_bitfield_lts, #{}}
end,
[{layout, LayoutConf} | Config].
end_per_group(_Group, Config) ->
Config.
%% Smoke test of store function
t_store(_Config) ->
@ -53,7 +64,7 @@ t_store(_Config) ->
payload = Payload,
timestamp = PublishedAt
},
?assertMatch(ok, emqx_ds_storage_layer:store_batch(?SHARD, [{PublishedAt, Msg}], #{})).
?assertMatch(ok, emqx_ds:store_batch(?FUNCTION_NAME, [Msg])).
%% Smoke test for iteration through a concrete topic
t_iterate(_Config) ->
@ -61,15 +72,17 @@ t_iterate(_Config) ->
Topics = [<<"foo/bar">>, <<"foo/bar/baz">>, <<"a">>],
Timestamps = lists:seq(1, 10),
Batch = [
{PublishedAt, make_message(PublishedAt, Topic, integer_to_binary(PublishedAt))}
make_message(PublishedAt, Topic, integer_to_binary(PublishedAt))
|| Topic <- Topics, PublishedAt <- Timestamps
],
ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch, #{}),
ok = emqx_ds:store_batch(?FUNCTION_NAME, Batch),
%% Iterate through individual topics:
[
begin
[{_Rank, Stream}] = emqx_ds_storage_layer:get_streams(?SHARD, parse_topic(Topic), 0),
[{Rank, Stream}] = emqx_ds_storage_layer:get_streams(?SHARD, parse_topic(Topic), 0),
ct:pal("Streams for ~p: {~p, ~p}", [Topic, Rank, Stream]),
{ok, It} = emqx_ds_storage_layer:make_iterator(?SHARD, Stream, parse_topic(Topic), 0),
ct:pal("Iterator for ~p: ~p", [Topic, It]),
{ok, NextIt, MessagesAndKeys} = emqx_ds_storage_layer:next(
?SHARD, It, 100, emqx_ds:timestamp_us()
),
@ -91,10 +104,10 @@ t_delete(_Config) ->
Topics = [<<"foo/bar">>, TopicToDelete, <<"a">>],
Timestamps = lists:seq(1, 10),
Batch = [
{PublishedAt, make_message(PublishedAt, Topic, integer_to_binary(PublishedAt))}
make_message(PublishedAt, Topic, integer_to_binary(PublishedAt))
|| Topic <- Topics, PublishedAt <- Timestamps
],
ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch, #{}),
ok = emqx_ds:store_batch(?FUNCTION_NAME, Batch),
%% Iterate through topics:
StartTime = 0,
@ -109,23 +122,21 @@ t_delete(_Config) ->
Messages = [Msg || {_DSKey, Msg} <- replay(?SHARD, TopicFilter, StartTime)],
MessagesByTopic = maps:groups_from_list(fun emqx_message:topic/1, Messages),
?assertNot(is_map_key(TopicToDelete, MessagesByTopic), #{msgs => MessagesByTopic}),
?assertEqual(20, length(Messages)),
ok.
?assertEqual(20, length(Messages)).
-define(assertSameSet(A, B), ?assertEqual(lists:sort(A), lists:sort(B))).
%% Smoke test that verifies that concrete topics are mapped to
%% individual streams, unless there's too many of them.
t_get_streams(_Config) ->
t_get_streams(Config) ->
%% Prepare data (without wildcards):
Topics = [<<"foo/bar">>, <<"foo/bar/baz">>, <<"a">>],
Timestamps = lists:seq(1, 10),
Batch = [
{PublishedAt, make_message(PublishedAt, Topic, integer_to_binary(PublishedAt))}
make_message(PublishedAt, Topic, integer_to_binary(PublishedAt))
|| Topic <- Topics, PublishedAt <- Timestamps
],
ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch, #{}),
ok = emqx_ds:store_batch(?FUNCTION_NAME, Batch),
GetStream = fun(Topic) ->
StartTime = 0,
emqx_ds_storage_layer:get_streams(?SHARD, parse_topic(Topic), StartTime)
@ -136,7 +147,7 @@ t_get_streams(_Config) ->
[A] = GetStream(<<"a">>),
%% Restart shard to make sure trie is persisted and restored:
ok = emqx_ds:close_db(?FUNCTION_NAME),
ok = emqx_ds:open_db(?FUNCTION_NAME, ?DEFAULT_CONFIG),
ok = emqx_ds:open_db(?FUNCTION_NAME, ?DB_CONFIG(Config)),
%% Verify that there are no "ghost streams" for topics that don't
%% have any messages:
[] = GetStream(<<"bar/foo">>),
@ -148,11 +159,11 @@ t_get_streams(_Config) ->
NewBatch = [
begin
B = integer_to_binary(I),
{100, make_message(100, <<"foo/bar/", B/binary>>, <<"filler", B/binary>>)}
make_message(100, <<"foo/bar/", B/binary>>, <<"filler", B/binary>>)
end
|| I <- lists:seq(1, 200)
],
ok = emqx_ds_storage_layer:store_batch(?SHARD, NewBatch, #{}),
ok = emqx_ds:store_batch(?FUNCTION_NAME, NewBatch),
%% Check that "foo/bar/baz" topic now appears in two streams:
%% "foo/bar/baz" and "foo/bar/+":
NewStreams = lists:sort(GetStream("foo/bar/baz")),
@ -168,7 +179,7 @@ t_get_streams(_Config) ->
?assert(lists:member(A, AllStreams)),
ok.
t_new_generation_inherit_trie(_Config) ->
t_new_generation_inherit_trie(Config) ->
%% This test checks that we inherit the previous generation's LTS when creating a new
%% generation.
?check_trace(
@ -176,25 +187,25 @@ t_new_generation_inherit_trie(_Config) ->
%% Create a bunch of topics to be learned in the first generation
TS1 = 500,
Batch1 = [
{TS1, make_message(TS1, make_topic([wildcard, I, suffix, Suffix]), bin(I))}
make_message(TS1, make_topic([wildcard, I, suffix, Suffix]), bin(I))
|| I <- lists:seq(1, 200),
Suffix <- [<<"foo">>, <<"bar">>]
],
ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch1, #{}),
ok = emqx_ds:store_batch(?FUNCTION_NAME, Batch1),
%% Now we create a new generation with the same LTS module. It should inherit the
%% learned trie.
ok = emqx_ds_storage_layer:add_generation(?SHARD, _Since = 1_000),
%% Restart the shard, to verify that LTS is persisted.
ok = emqx_ds:close_db(?FUNCTION_NAME),
ok = emqx_ds:open_db(?FUNCTION_NAME, ?DEFAULT_CONFIG),
ok = emqx_ds:open_db(?FUNCTION_NAME, ?DB_CONFIG(Config)),
%% Store a batch of messages with the same set of topics.
TS2 = 1_500,
Batch2 = [
{TS2, make_message(TS2, make_topic([wildcard, I, suffix, Suffix]), bin(I))}
make_message(TS2, make_topic([wildcard, I, suffix, Suffix]), bin(I))
|| I <- lists:seq(1, 200),
Suffix <- [<<"foo">>, <<"bar">>]
],
ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch2, #{}),
ok = emqx_ds:store_batch(?FUNCTION_NAME, Batch2),
%% We should get only two streams for wildcard query, for "foo" and for "bar".
?assertMatch(
[_Foo, _Bar],
@ -203,29 +214,30 @@ t_new_generation_inherit_trie(_Config) ->
ok
end,
fun(Trace) ->
?assertMatch([_], ?of_kind(bitfield_lts_inherited_trie, Trace)),
?assertMatch([_], ?of_kind(layout_inherited_lts_trie, Trace)),
ok
end
),
ok.
t_replay(_Config) ->
t_replay(Config) ->
%% Create concrete topics:
Topics = [<<"foo/bar">>, <<"foo/bar/baz">>],
Timestamps = lists:seq(1, 10_000, 100),
Values = lists:seq(1, 1_000, 100),
Batch1 = [
{PublishedAt, make_message(PublishedAt, Topic, integer_to_binary(PublishedAt))}
|| Topic <- Topics, PublishedAt <- Timestamps
make_message(Val, Topic, bin(Val))
|| Topic <- Topics, Val <- Values
],
ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch1, #{}),
ok = emqx_ds:store_batch(?FUNCTION_NAME, Batch1),
%% Create wildcard topics `wildcard/+/suffix/foo' and `wildcard/+/suffix/bar':
Batch2 = [
{TS, make_message(TS, make_topic([wildcard, I, suffix, Suffix]), bin(TS))}
|| I <- lists:seq(1, 200), TS <- Timestamps, Suffix <- [<<"foo">>, <<"bar">>]
make_message(Val, make_topic([wildcard, Prefix, suffix, Suffix]), bin(Val))
|| Prefix <- lists:seq(1, 200), Val <- Values, Suffix <- [<<"foo">>, <<"bar">>]
],
ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch2, #{}),
ok = emqx_ds:store_batch(?FUNCTION_NAME, Batch2),
timer:sleep(5_000),
%% Check various topic filters:
Messages = [M || {_TS, M} <- Batch1 ++ Batch2],
Messages = Batch1 ++ Batch2,
%% Missing topics (no ghost messages):
?assertNot(check(?SHARD, <<"missing/foo/bar">>, 0, Messages)),
%% Regular topics:
@ -238,7 +250,7 @@ t_replay(_Config) ->
?assert(check(?SHARD, <<"+/+/baz">>, 0, Messages)),
%% Restart the DB to make sure trie is persisted and restored:
ok = emqx_ds:close_db(?FUNCTION_NAME),
ok = emqx_ds:open_db(?FUNCTION_NAME, ?DEFAULT_CONFIG),
ok = emqx_ds:open_db(?FUNCTION_NAME, ?DB_CONFIG(Config)),
%% Learned wildcard topics:
?assertNot(check(?SHARD, <<"wildcard/1000/suffix/foo">>, 0, [])),
?assert(check(?SHARD, <<"wildcard/1/suffix/foo">>, 0, Messages)),
@ -314,6 +326,9 @@ t_non_atomic_store_batch(_Config) ->
).
check(Shard, TopicFilter, StartTime, ExpectedMessages) ->
?tp(notice, ?MODULE_STRING "_check", #{
shard => Shard, tf => TopicFilter, start_time => StartTime
}),
ExpectedFiltered = lists:filter(
fun(#message{topic = Topic, timestamp = TS}) ->
emqx_topic:match(Topic, TopicFilter) andalso TS >= StartTime
@ -325,17 +340,9 @@ check(Shard, TopicFilter, StartTime, ExpectedMessages) ->
begin
Dump = dump_messages(Shard, TopicFilter, StartTime),
verify_dump(TopicFilter, StartTime, Dump),
Missing = ExpectedFiltered -- Dump,
Extras = Dump -- ExpectedFiltered,
?assertMatch(
#{missing := [], unexpected := []},
#{
missing => Missing,
unexpected => Extras,
topic_filter => TopicFilter,
start_time => StartTime
}
)
emqx_ds_test_helpers:assert_same_set(ExpectedFiltered, Dump, #{
topic_filter => TopicFilter, start_time => StartTime
})
end,
[]
),
@ -362,6 +369,7 @@ verify_dump(TopicFilter, StartTime, Dump) ->
dump_messages(Shard, TopicFilter, StartTime) ->
Streams = emqx_ds_storage_layer:get_streams(Shard, parse_topic(TopicFilter), StartTime),
ct:pal("Streams for ~p:~n ~p", [TopicFilter, Streams]),
lists:flatmap(
fun({_Rank, Stream}) ->
dump_stream(Shard, Stream, TopicFilter, StartTime)
@ -374,6 +382,7 @@ dump_stream(Shard, Stream, TopicFilter, StartTime) ->
{ok, Iterator} = emqx_ds_storage_layer:make_iterator(
Shard, Stream, parse_topic(TopicFilter), StartTime
),
ct:pal("Iterator for ~p at stream ~p:~n ~p", [TopicFilter, Stream, Iterator]),
Loop = fun
F(It, 0) ->
error({too_many_iterations, It});
@ -502,24 +511,31 @@ bin(X) ->
%% CT callbacks
all() -> emqx_common_test_helpers:all(?MODULE).
groups() ->
TCs = emqx_common_test_helpers:all(?MODULE),
[
{bitfield_lts, TCs},
{skipstream_lts, TCs}
].
suite() -> [{timetrap, {seconds, 20}}].
init_per_suite(Config) ->
emqx_common_test_helpers:clear_screen(),
WorkDir = emqx_cth_suite:work_dir(Config),
Apps = emqx_cth_suite:start(
[emqx_ds_builtin_local],
#{work_dir => emqx_cth_suite:work_dir(Config)}
#{work_dir => WorkDir}
),
[{apps, Apps} | Config].
[{apps, Apps}, {work_dir, WorkDir} | Config].
end_per_suite(Config) ->
Apps = ?config(apps, Config),
ok = emqx_cth_suite:stop(Apps),
emqx_cth_suite:clean_work_dir(?config(work_dir, Config)),
ok.
init_per_testcase(TC, Config) ->
ok = emqx_ds:open_db(TC, ?DEFAULT_CONFIG),
ok = emqx_ds:open_db(TC, ?DB_CONFIG(Config)),
Config.
end_per_testcase(TC, _Config) ->
@ -558,7 +574,7 @@ delete(Shard, Iterators, Selector) ->
fun(Iterator0, {AccIterators, NAcc}) ->
case
emqx_ds_storage_layer:delete_next(
Shard, Iterator0, Selector, 10, emqx_ds:timestamp_us()
Shard, Iterator0, Selector, 10, ?FUTURE
)
of
{ok, end_of_stream} ->
@ -591,7 +607,7 @@ replay(_Shard, []) ->
replay(Shard, Iterators) ->
{NewIterators0, Messages0} = lists:foldl(
fun(Iterator0, {AccIterators, AccMessages}) ->
case emqx_ds_storage_layer:next(Shard, Iterator0, 10, emqx_ds:timestamp_us()) of
case emqx_ds_storage_layer:next(Shard, Iterator0, 10, ?FUTURE) of
{ok, end_of_stream} ->
{AccIterators, AccMessages};
{ok, _Iterator1, []} ->

View File

@ -122,6 +122,7 @@ topic_messages(TestCase, ClientId, N) ->
fun() ->
NBin = integer_to_binary(N),
Msg = #message{
id = <<N:128>>,
from = ClientId,
topic = client_topic(TestCase, ClientId),
timestamp = N * 100,
@ -148,8 +149,7 @@ do_ds_topic_generation_stream(DB, Node, Shard, It0) ->
?ON(
Node,
begin
Now = emqx_ds_replication_layer:current_timestamp(DB, Shard),
emqx_ds_storage_layer:next(Shard, It0, 1, Now)
emqx_ds_storage_layer:next(Shard, It0, 1, _Now = 1 bsl 63)
end
)
of
@ -233,15 +233,60 @@ transitions(Node, DB) ->
end
).
%% Stream comparison
%% Message comparison
%% Try to eliminate any ambiguity in the message representation.
message_canonical_form(Msg0 = #message{}) ->
message_canonical_form(emqx_message:to_map(Msg0));
message_canonical_form(#{flags := Flags0, headers := Headers0, payload := Payload0} = Msg) ->
%% Remove flags that are false:
Flags = maps:filter(
fun(_Key, Val) -> Val end,
Flags0
),
Msg#{flags := Flags, payload := iolist_to_binary(Payload0)}.
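%% Illustrative example: #message{flags = #{dup => false}, payload = [<<"a">>, <<"b">>]}
%% and #message{flags = #{}, payload = <<"ab">>} have the same canonical
%% form, since false flags are dropped and the payload is flattened
%% into a binary.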
sublist(L) ->
PrintMax = 20,
case length(L) of
0 ->
[];
N when N > PrintMax ->
lists:sublist(L, 1, PrintMax) ++ ['...', N - PrintMax, 'more'];
_ ->
L
end.
message_set(L) ->
ordsets:from_list([message_canonical_form(I) || I <- L]).
message_set_subtract(A, B) ->
ordsets:subtract(message_set(A), message_set(B)).
assert_same_set(Expected, Got) ->
assert_same_set(Expected, Got, #{}).
assert_same_set(Expected, Got, Comment) ->
SE = message_set(Expected),
SG = message_set(Got),
case {ordsets:subtract(SE, SG), ordsets:subtract(SG, SE)} of
{[], []} ->
ok;
{Missing, Unexpected} ->
error(Comment#{
matching => sublist(ordsets:intersection(SE, SG)),
missing => sublist(Missing),
unexpected => sublist(Unexpected)
})
end.
message_eq(Fields, {_Key, Msg1 = #message{}}, Msg2) ->
message_eq(Fields, Msg1, Msg2);
message_eq(Fields, Msg1, {_Key, Msg2 = #message{}}) ->
message_eq(Fields, Msg1, Msg2);
message_eq(Fields, Msg1 = #message{}, Msg2 = #message{}) ->
maps:with(Fields, emqx_message:to_map(Msg1)) =:=
maps:with(Fields, emqx_message:to_map(Msg2)).
maps:with(Fields, message_canonical_form(Msg1)) =:=
maps:with(Fields, message_canonical_form(Msg2)).
%% Consuming streams and iterators
@ -304,6 +349,7 @@ ds_topic_stream(DB, ClientId, TopicBin, Node) ->
{DBShard, emqx_ds_storage_layer:get_streams(DBShard, Topic, 0)}
end
),
ct:pal("Streams for ~p, ~p @ ~p:~n ~p", [ClientId, TopicBin, Node, DSStreams]),
%% Sort streams by their rank Y, and chain them together:
emqx_utils_stream:chain([
ds_topic_generation_stream(DB, Node, ShardId, Topic, S)

View File

@ -512,9 +512,16 @@ emqx_collect(K = ?DS_BUFFER_BYTES, D) -> counter_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_BUFFER_FLUSH_TIME, D) -> gauge_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_STORE_BATCH_TIME, D) -> gauge_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_BUILTIN_NEXT_TIME, D) -> gauge_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_LTS_SEEK_COUNTER, D) -> counter_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_LTS_NEXT_COUNTER, D) -> counter_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_LTS_COLLISION_COUNTER, D) -> counter_metrics(?MG(K, D, [])).
emqx_collect(K = ?DS_BITFIELD_LTS_SEEK_COUNTER, D) -> counter_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_BITFIELD_LTS_NEXT_COUNTER, D) -> counter_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_BITFIELD_LTS_COLLISION_COUNTER, D) -> counter_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_SKIPSTREAM_LTS_SEEK, D) -> counter_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_SKIPSTREAM_LTS_NEXT, D) -> counter_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_SKIPSTREAM_LTS_HASH_COLLISION, D) -> counter_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_SKIPSTREAM_LTS_HIT, D) -> counter_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_SKIPSTREAM_LTS_MISS, D) -> counter_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_SKIPSTREAM_LTS_FUTURE, D) -> counter_metrics(?MG(K, D, []));
emqx_collect(K = ?DS_SKIPSTREAM_LTS_EOS, D) -> counter_metrics(?MG(K, D, [])).
%%--------------------------------------------------------------------
%% Indicators

View File

@ -0,0 +1,7 @@
Add a new version of the `wildcard_optimized` storage layout for durable storage.
Improvements:
- The new layout does not have inherent latency
- MQTT messages are serialized into a much more space-efficient format

View File

@ -47,7 +47,8 @@
emqx_exproto_v_1_connection_adapter_bhvr,
emqx_exproto_v_1_connection_unary_handler_client,
emqx_exhook_v_2_hook_provider_client,
emqx_exhook_v_2_hook_provider_bhvr
emqx_exhook_v_2_hook_provider_bhvr,
'DurableMessage'
]},
{plt_location, "."},
{plt_prefix, "emqx_dialyzer"},