From b56597679450c5507d833c2d50c165885e346591 Mon Sep 17 00:00:00 2001
From: ieQu1 <99872536+ieQu1@users.noreply.github.com>
Date: Sat, 29 Jun 2024 15:49:42 +0200
Subject: [PATCH 01/21] fix(ds): Fix hashing of empty wildcard topic levels in
 bitfield_lts

---
 .../emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl
index 20c3bc087..d5834546f 100644
--- a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl
@@ -596,7 +596,7 @@ prepare_loop_context(DB, CF, TopicIndex, StartTime, SafeCutoffTime, Varying, Key
         fun
             ('+') ->
                 any;
-            (TopicLevel) when is_binary(TopicLevel) ->
+            (TopicLevel) when is_binary(TopicLevel); TopicLevel =:= '' ->
                 {'=', hash_topic_level(TopicLevel)}
         end,
         Varying
@@ -831,6 +831,8 @@ threshold_fun(0) ->
 threshold_fun(_) ->
     20.
 
+hash_topic_level('') ->
+    hash_topic_level(<<>>);
 hash_topic_level(TopicLevel) ->
     <<Int:64, _/binary>> = erlang:md5(TopicLevel),
     Int.

From 0f2c19b65661573d44ffc84baa546b97ff946f57 Mon Sep 17 00:00:00 2001
From: ieQu1 <99872536+ieQu1@users.noreply.github.com>
Date: Mon, 1 Jul 2024 00:18:40 +0200
Subject: [PATCH 02/21] refactor(ds): Move end_of_stream detection logic for
 delete_next

---
 .../src/emqx_ds_storage_bitfield_lts.erl      | 13 +++++++++----
 .../src/emqx_ds_storage_layer.erl             | 21 +++++++++++--------
 .../src/emqx_ds_storage_reference.erl         | 11 ++++++++---
 3 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl
index d5834546f..a161cb8d8 100644
--- a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl
@@ -36,7 +36,7 @@
     make_delete_iterator/5,
     update_iterator/4,
     next/6,
-    delete_next/6,
+    delete_next/7,
     handle_event/4
 ]).
 
@@ -495,14 +495,19 @@ next_until(#s{db = DB, data = CF, keymappers = Keymappers}, It, SafeCutoffTime,
         rocksdb:iterator_close(ITHandle)
     end.
 
-delete_next(Shard, Schema = #s{ts_offset = TSOffset}, It, Selector, BatchSize, Now) ->
+delete_next(Shard, Schema = #s{ts_offset = TSOffset}, It, Selector, BatchSize, Now, IsCurrent) ->
     %% Compute safe cutoff time.
     %% It's the point in time where the last complete epoch ends, so we need to know
     %% the current time to compute it.
     init_counters(),
     SafeCutoffTime = ?EPOCH(Schema, Now) bsl TSOffset,
     try
-        delete_next_until(Schema, It, SafeCutoffTime, Selector, BatchSize)
+        case delete_next_until(Schema, It, SafeCutoffTime, Selector, BatchSize) of
+            {ok, _It, 0, 0} when not IsCurrent ->
+                {ok, end_of_stream};
+            Result ->
+                Result
+        end
     after
         report_counters(Shard)
     end.

diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl
index fe1d36a35..db3b4e5c3 100644
--- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl
@@ -261,6 +261,11 @@
     ) ->
         [_Stream].
 
+-callback get_delete_streams(
+    shard_id(), generation_data(), emqx_ds:topic_filter(), emqx_ds:time()
+) ->
+    [_Stream].
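A quick illustration of the fix in PATCH 01 above: empty topic levels (as in `a//b`) are parsed to the atom `''` rather than a binary, so the old `is_binary/1` guard crashed on such topics with `function_clause`. A minimal sketch of the repaired behavior, assuming `emqx_topic:words/1` (which maps empty levels to `''`):

    %% Local copy of the patched hashing function, for illustration only:
    hash_topic_level_example('') ->
        hash_topic_level_example(<<>>);
    hash_topic_level_example(TopicLevel) ->
        <<Int:64, _/binary>> = erlang:md5(TopicLevel),
        Int.

    demo() ->
        %% The empty level inside "a//b" becomes the atom '':
        [<<"a">>, '', <<"b">>] = emqx_topic:words(<<"a//b">>),
        %% With the new clause, '' hashes identically to the empty binary:
        true = hash_topic_level_example('') =:= hash_topic_level_example(<<>>).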
+ -callback make_iterator( shard_id(), generation_data(), _Stream, emqx_ds:topic_filter(), emqx_ds:time() ) -> @@ -282,9 +287,10 @@ DeleteIterator, emqx_ds:delete_selector(), pos_integer(), - emqx_ds:time() + emqx_ds:time(), + _IsCurrentGeneration :: boolean() ) -> - {ok, DeleteIterator, _NDeleted :: non_neg_integer(), _IteratedOver :: non_neg_integer()}. + {ok, DeleteIterator, _NDeleted :: non_neg_integer(), _IteratedOver :: non_neg_integer()} | emqx_ds:error(_). -callback handle_event(shard_id(), generation_data(), emqx_ds:time(), CustomEvent | tick) -> [CustomEvent]. @@ -511,15 +517,12 @@ delete_next( ) -> case generation_get(Shard, GenId) of #{module := Mod, data := GenData} -> - Current = generation_current(Shard), - case Mod:delete_next(Shard, GenData, GenIter0, Selector, BatchSize, Now) of - {ok, _GenIter, _Deleted = 0, _IteratedOver = 0} when GenId < Current -> - %% This is a past generation. Storage layer won't write - %% any more messages here. The iterator reached the end: - %% the stream has been fully replayed. - {ok, end_of_stream}; + IsCurrent = GenId =:= generation_current(Shard), + case Mod:delete_next(Shard, GenData, GenIter0, Selector, BatchSize, Now, IsCurrent) of {ok, GenIter, NumDeleted, _IteratedOver} -> {ok, Iter#{?enc := GenIter}, NumDeleted}; + EOS = {ok, end_of_stream} -> + EOS; Error = {error, _} -> Error end; diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_reference.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_reference.erl index ca29c11a8..cfd6f30ac 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_reference.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_reference.erl @@ -39,7 +39,7 @@ make_delete_iterator/5, update_iterator/4, next/6, - delete_next/6 + delete_next/7 ]). %% internal exports: @@ -169,7 +169,7 @@ next(_Shard, #s{db = DB, cf = CF}, It0, BatchSize, _Now, IsCurrent) -> {ok, It, lists:reverse(Messages)} end. -delete_next(_Shard, #s{db = DB, cf = CF}, It0, Selector, BatchSize, _Now) -> +delete_next(_Shard, #s{db = DB, cf = CF}, It0, Selector, BatchSize, _Now, IsCurrent) -> #delete_it{ topic_filter = TopicFilter, start_time = StartTime, @@ -198,7 +198,12 @@ delete_next(_Shard, #s{db = DB, cf = CF}, It0, Selector, BatchSize, _Now) -> ), rocksdb:iterator_close(ITHandle), It = It0#delete_it{last_seen_message_key = Key}, - {ok, It, NumDeleted, NumIterated}. + case IsCurrent of + false when NumDeleted =:= 0, NumIterated =:= 0 -> + {ok, end_of_stream}; + _ -> + {ok, It, NumDeleted, NumIterated} + end. %%================================================================================ %% Internal functions From afe1c5617df2d5834445c53bf0b5fff2d83ae951 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Sun, 30 Jun 2024 10:49:50 +0200 Subject: [PATCH 03/21] refactor(ds): Rename macros for bitfield_lts metrics --- .../include/emqx_ds_metrics.hrl | 8 +++---- .../src/emqx_ds_builtin_metrics.erl | 12 +++++----- .../src/emqx_ds_storage_bitfield_lts.erl | 22 +++++++++++-------- apps/emqx_prometheus/src/emqx_prometheus.erl | 6 ++--- 4 files changed, 26 insertions(+), 22 deletions(-) diff --git a/apps/emqx_durable_storage/include/emqx_ds_metrics.hrl b/apps/emqx_durable_storage/include/emqx_ds_metrics.hrl index a76289eb9..4e463b14f 100644 --- a/apps/emqx_durable_storage/include/emqx_ds_metrics.hrl +++ b/apps/emqx_durable_storage/include/emqx_ds_metrics.hrl @@ -35,15 +35,15 @@ -define(DS_STORE_BATCH_TIME, emqx_ds_store_batch_time). -define(DS_BUILTIN_NEXT_TIME, emqx_ds_builtin_next_time). 
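With PATCH 02, end-of-stream detection for deletions becomes the layout's responsibility: the storage layer now only passes down whether the generation is the current one. A sketch of what a layout callback is expected to do under the new contract (`scan_batch/1` is a made-up placeholder for the layout's own RocksDB iteration):

    delete_next(_Shard, _Data, It0, _Selector, _BatchSize, _Now, IsCurrent) ->
        {It, NDeleted, IteratedOver} = scan_batch(It0),
        case {NDeleted, IteratedOver} of
            {0, 0} when not IsCurrent ->
                %% A closed generation that yields nothing has been fully
                %% replayed; the layout reports completion itself:
                {ok, end_of_stream};
            _ ->
                {ok, It, NDeleted, IteratedOver}
        end.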
-%%% LTS Storage counters: +%%% Bitfield LTS Storage counters: %% This counter is incremented when the iterator seeks to the next interval: --define(DS_LTS_SEEK_COUNTER, emqx_ds_storage_bitfield_lts_counter_seek). +-define(DS_BITFIELD_LTS_SEEK_COUNTER, emqx_ds_storage_bitfield_lts_counter_seek). %% This counter is incremented when the iterator proceeds to the next %% key within the interval (this is is best case scenario): --define(DS_LTS_NEXT_COUNTER, emqx_ds_storage_bitfield_lts_counter_next). +-define(DS_BITFIELD_LTS_NEXT_COUNTER, emqx_ds_storage_bitfield_lts_counter_next). %% This counter is incremented when the key passes bitmask check, but %% the value is rejected by the subsequent post-processing: --define(DS_LTS_COLLISION_COUNTER, emqx_ds_storage_bitfield_lts_counter_collision). +-define(DS_BITFIELD_LTS_COLLISION_COUNTER, emqx_ds_storage_bitfield_lts_counter_collision). -endif. diff --git a/apps/emqx_durable_storage/src/emqx_ds_builtin_metrics.erl b/apps/emqx_durable_storage/src/emqx_ds_builtin_metrics.erl index d48cd0e34..107e2e5e6 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_builtin_metrics.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_builtin_metrics.erl @@ -57,9 +57,9 @@ -define(STORAGE_LAYER_METRICS, [ {slide, ?DS_STORE_BATCH_TIME}, - {counter, ?DS_LTS_SEEK_COUNTER}, - {counter, ?DS_LTS_NEXT_COUNTER}, - {counter, ?DS_LTS_COLLISION_COUNTER} + {counter, ?DS_BITFIELD_LTS_SEEK_COUNTER}, + {counter, ?DS_BITFIELD_LTS_NEXT_COUNTER}, + {counter, ?DS_BITFIELD_LTS_COLLISION_COUNTER} ]). -define(FETCH_METRICS, [ @@ -150,15 +150,15 @@ observe_next_time(DB, NextTime) -> -spec inc_lts_seek_counter(emqx_ds_storage_layer:shard_id(), non_neg_integer()) -> ok. inc_lts_seek_counter({DB, _}, Inc) -> - catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_LTS_SEEK_COUNTER, Inc). + catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_BITFIELD_LTS_SEEK_COUNTER, Inc). -spec inc_lts_next_counter(emqx_ds_storage_layer:shard_id(), non_neg_integer()) -> ok. inc_lts_next_counter({DB, _}, Inc) -> - catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_LTS_NEXT_COUNTER, Inc). + catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_BITFIELD_LTS_NEXT_COUNTER, Inc). -spec inc_lts_collision_counter(emqx_ds_storage_layer:shard_id(), non_neg_integer()) -> ok. inc_lts_collision_counter({DB, _}, Inc) -> - catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_LTS_COLLISION_COUNTER, Inc). + catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_BITFIELD_LTS_COLLISION_COUNTER, Inc). prometheus_meta() -> lists:map( diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl index a161cb8d8..23225a5f6 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl @@ -156,7 +156,9 @@ -define(DIM_TOPIC, 1). -define(DIM_TS, 2). --define(DS_LTS_COUNTERS, [?DS_LTS_SEEK_COUNTER, ?DS_LTS_NEXT_COUNTER, ?DS_LTS_COLLISION_COUNTER]). +-define(DS_LTS_COUNTERS, [ + ?DS_BITFIELD_LTS_SEEK_COUNTER, ?DS_BITFIELD_LTS_NEXT_COUNTER, ?DS_BITFIELD_LTS_COLLISION_COUNTER +]). %% GVar used for idle detection: -define(IDLE_DETECT, idle_detect). 
@@ -637,7 +639,7 @@ next_loop(ITHandle, KeyMapper, Filter, Cutoff, It0, Acc0, N0) -> Key1 -> %% assert true = Key1 > Key0, - inc_counter(?DS_LTS_SEEK_COUNTER), + inc_counter(?DS_BITFIELD_LTS_SEEK_COUNTER), case rocksdb:iterator_move(ITHandle, {seek, Key1}) of {ok, Key, Val} -> {N, It, Acc} = traverse_interval( @@ -663,7 +665,7 @@ traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, Key, Val, It0, Acc0, N) - Acc = [{Key, Msg} | Acc0], traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, It, Acc, N - 1); false -> - inc_counter(?DS_LTS_COLLISION_COUNTER), + inc_counter(?DS_BITFIELD_LTS_COLLISION_COUNTER), traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, It, Acc0, N) end; overflow -> @@ -675,7 +677,7 @@ traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, Key, Val, It0, Acc0, N) - traverse_interval(_ITHandle, _KeyMapper, _Filter, _Cutoff, It, Acc, 0) -> {0, It, Acc}; traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, It, Acc, N) -> - inc_counter(?DS_LTS_NEXT_COUNTER), + inc_counter(?DS_BITFIELD_LTS_NEXT_COUNTER), case rocksdb:iterator_move(ITHandle, next) of {ok, Key, Val} -> traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, Key, Val, It, Acc, N); @@ -695,7 +697,7 @@ delete_next_loop(LoopContext0) -> iterated_over := AccIter0, it_handle := ITHandle } = LoopContext0, - inc_counter(?DS_LTS_SEEK_COUNTER), + inc_counter(?DS_BITFIELD_LTS_SEEK_COUNTER), #{?tag := ?DELETE_IT, ?last_seen_key := Key0} = It0, case emqx_ds_bitmask_keymapper:bin_increment(Filter, Key0) of overflow -> @@ -777,7 +779,7 @@ delete_traverse_interval1(LoopContext0) -> iterated_over := AccIter, storage_iter := It } = LoopContext0, - inc_counter(?DS_LTS_NEXT_COUNTER), + inc_counter(?DS_BITFIELD_LTS_NEXT_COUNTER), case rocksdb:iterator_move(ITHandle, next) of {ok, Key, Val} -> delete_traverse_interval(LoopContext0#{ @@ -940,9 +942,11 @@ init_counters() -> ok. report_counters(Shard) -> - emqx_ds_builtin_metrics:inc_lts_seek_counter(Shard, get(?DS_LTS_SEEK_COUNTER)), - emqx_ds_builtin_metrics:inc_lts_next_counter(Shard, get(?DS_LTS_NEXT_COUNTER)), - emqx_ds_builtin_metrics:inc_lts_collision_counter(Shard, get(?DS_LTS_COLLISION_COUNTER)), + emqx_ds_builtin_metrics:inc_lts_seek_counter(Shard, get(?DS_BITFIELD_LTS_SEEK_COUNTER)), + emqx_ds_builtin_metrics:inc_lts_next_counter(Shard, get(?DS_BITFIELD_LTS_NEXT_COUNTER)), + emqx_ds_builtin_metrics:inc_lts_collision_counter( + Shard, get(?DS_BITFIELD_LTS_COLLISION_COUNTER) + ), _ = [erase(I) || I <- ?DS_LTS_COUNTERS], ok. diff --git a/apps/emqx_prometheus/src/emqx_prometheus.erl b/apps/emqx_prometheus/src/emqx_prometheus.erl index 5d88ebd17..e205952d2 100644 --- a/apps/emqx_prometheus/src/emqx_prometheus.erl +++ b/apps/emqx_prometheus/src/emqx_prometheus.erl @@ -512,9 +512,9 @@ emqx_collect(K = ?DS_BUFFER_BYTES, D) -> counter_metrics(?MG(K, D, [])); emqx_collect(K = ?DS_BUFFER_FLUSH_TIME, D) -> gauge_metrics(?MG(K, D, [])); emqx_collect(K = ?DS_STORE_BATCH_TIME, D) -> gauge_metrics(?MG(K, D, [])); emqx_collect(K = ?DS_BUILTIN_NEXT_TIME, D) -> gauge_metrics(?MG(K, D, [])); -emqx_collect(K = ?DS_LTS_SEEK_COUNTER, D) -> counter_metrics(?MG(K, D, [])); -emqx_collect(K = ?DS_LTS_NEXT_COUNTER, D) -> counter_metrics(?MG(K, D, [])); -emqx_collect(K = ?DS_LTS_COLLISION_COUNTER, D) -> counter_metrics(?MG(K, D, [])). +emqx_collect(K = ?DS_BITFIELD_LTS_SEEK_COUNTER, D) -> counter_metrics(?MG(K, D, [])); +emqx_collect(K = ?DS_BITFIELD_LTS_NEXT_COUNTER, D) -> counter_metrics(?MG(K, D, [])); +emqx_collect(K = ?DS_BITFIELD_LTS_COLLISION_COUNTER, D) -> counter_metrics(?MG(K, D, [])). 
 %%--------------------------------------------------------------------
 %% Indicators

From 71dad0242e0a611acb7beeb5fa9c6f86290d7eba Mon Sep 17 00:00:00 2001
From: ieQu1 <99872536+ieQu1@users.noreply.github.com>
Date: Fri, 28 Jun 2024 15:55:10 +0200
Subject: [PATCH 04/21] docs(ds): Move Raft-related parts to
 emqx_ds_builtin_raft README

---
 apps/emqx_ds_builtin_raft/README.md | 33 +++++++++++++++++++++++++++++++++
 apps/emqx_durable_storage/README.md | 30 ++++++------------------------
 2 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/apps/emqx_ds_builtin_raft/README.md b/apps/emqx_ds_builtin_raft/README.md
index 7f468f365..ab288ef0e 100644
--- a/apps/emqx_ds_builtin_raft/README.md
+++ b/apps/emqx_ds_builtin_raft/README.md
@@ -1,3 +1,36 @@
 # `emqx_ds_builtin_raft`
 
 Replication layer for the builtin EMQX durable storage backend that uses Raft algorithm.
+
+
+The Raft backend introduces the concept of **site** to alleviate the problem of changing node names.
+Site IDs are persistent, and they are randomly generated at the first startup of the node.
+Each node in the cluster has a unique site ID that is independent of the Erlang node name (`emqx@...`).
+
+## Configurations
+
+OTP application environment variables:
+
+- `emqx_durable_storage.reads`: `leader_preferred` | `local_preferred`.
+
+# CLI
+
+
+Runtime settings for the durable storages can be modified via CLI as well as the REST API.
+The following CLI commands are available:
+
+- `emqx ctl ds info` — get a quick overview of the durable storage state
+- `emqx ctl ds set_replicas <DS> <Site1> <Site2> ...` — update the list of replicas for a durable storage.
+- `emqx ctl ds join <DS> <Site>` — add a replica of durable storage on the site
+- `emqx ctl ds leave <DS> <Site>` — remove a replica of a durable storage from the site
+
+# HTTP APIs
+
+The following REST APIs are available for managing the builtin durable storages:
+
+- `/ds/sites` — list known sites.
+- `/ds/sites/:site` — get information about the site (its status, current EMQX node name managing the site, etc.)
+- `/ds/storages` — list durable storages
+- `/ds/storages/:ds` — get information about the durable storage and its shards
+- `/ds/storages/:ds/replicas` — list or update sites that contain replicas of a durable storage
+- `/ds/storages/:ds/replicas/:site` — add or remove replica of the durable storage on the site
diff --git a/apps/emqx_durable_storage/README.md b/apps/emqx_durable_storage/README.md
index 739cbdc64..1e9f6be1b 100644
--- a/apps/emqx_durable_storage/README.md
+++ b/apps/emqx_durable_storage/README.md
@@ -13,11 +13,10 @@ This makes the storage disk requirements very predictable: only the number of _p
 
 DS _backend_ is a callback module that implements `emqx_ds` behavior.
 
-EMQX repository contains the "builtin" backend, implemented in `emqx_ds_replication_layer` module, that uses Raft algorithm for data replication, and RocksDB as the main storage.
+EMQX repository contains two builtin backends based on RocksDB:
 
-Note that builtin backend introduces the concept of **site** to alleviate the problem of changing node names.
-Site IDs are persistent, and they are randomly generated at the first startup of the node.
-Each node in the cluster has a unique site ID, that is independent from the Erlang node name (`emqx@...`).
+- `emqx_ds_builtin_local`
+- `emqx_ds_builtin_raft`
 
 ### Layout
 
@@ -113,8 +112,8 @@ In the future it can serve as a storage for retained messages or as a generic me
 
 # Configurations
 
-Global options for `emqx_durable_storage` application are configured via OTP application environment.
-Database-specific settings are stored in the schema table.
+Common global options for builtin backends are configured via OTP application environment.
+Database-specific settings are stored in EMQX config.
 
 The following application environment variables are available:
 
@@ -124,26 +123,9 @@ The following application environment variables are available:
 
 - `emqx_durable_storage.egress_flush_interval`: period at which the batches of messages are committed to the durable storage.
 
-- `emqx_durable_storage.reads`: `leader_preferred` | `local_preferred`.
-
-Runtime settings for the durable storages can be modified via CLI as well as the REST API.
-The following CLI commands are available:
-
-- `emqx ctl ds info` — get a quick overview of the durable storage state
-- `emqx ctl ds set_replicas <DS> <Site1> <Site2> ...` — update the list of replicas for a durable storage.
-- `emqx ctl ds join <DS> <Site>` — add a replica of durable storage on the site
-- `emqx ctl ds leave <DS> <Site>` — remove a replica of a durable storage from the site
-
 # HTTP APIs
 
-The following REST APIs are available for managing the builtin durable storages:
-
-- `/ds/sites` — list known sites.
-- `/ds/sites/:site` — get information about the site (its status, current EMQX node name managing the site, etc.)
-- `/ds/storages` — list durable storages
-- `/ds/storages/:ds` — get information about the durable storage and its shards
-- `/ds/storages/:ds/replicas` — list or update sites that contain replicas of a durable storage
-- `/ds/storages/:ds/replicas/:site` — add or remove replica of the durable storage on the site
+None
 
 # Other
 

From afe1c5617df2d5834445c53bf0b5fff2d83ae951 Mon Sep 17 00:00:00 2001
From: ieQu1 <99872536+ieQu1@users.noreply.github.com>
Date: Mon, 1 Jul 2024 01:01:03 +0200
Subject: [PATCH 05/21] fix(ds): Improve typespecs and descriptions in
 storage_layer

---
 .../src/emqx_ds_storage_layer.erl             | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl
index db3b4e5c3..d6250254d 100644
--- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl
@@ -290,7 +290,9 @@
         emqx_ds:time(),
         _IsCurrentGeneration :: boolean()
     ) ->
-    {ok, DeleteIterator, _NDeleted :: non_neg_integer(), _IteratedOver :: non_neg_integer()} | emqx_ds:error(_).
+    {ok, DeleteIterator, _NDeleted :: non_neg_integer(), _IteratedOver :: non_neg_integer()}
+    | {ok, end_of_stream}
+    | emqx_ds:error(_).
 
 -callback handle_event(shard_id(), generation_data(), emqx_ds:time(), CustomEvent | tick) ->
     [CustomEvent].
@@ -313,6 +315,8 @@
 drop_shard(Shard) ->
     ok = rocksdb:destroy(db_dir(Shard), []).
 
+%% @doc This is a convenience wrapper that combines `prepare' and
+%% `commit' operations.
 -spec store_batch(
     shard_id(),
     [{emqx_ds:time(), emqx_types:message()}],
@@ -329,6 +333,15 @@ store_batch(Shard, Messages, Options) ->
             Error
     end.
 
+%% @doc Transform a batch of messages into a "cooked batch" that can
+%% be stored in the transaction log or transferred over the network.
+%%
+%% Important: the caller MUST ensure that timestamps within the shard
+%% form a strictly increasing monotonic sequence throughout the whole
+%% lifetime of the shard.
+%%
+%% The underlying storage layout MAY use timestamp as a unique message
+%% ID.
-spec prepare_batch( shard_id(), [{emqx_ds:time(), emqx_types:message()}], @@ -361,6 +374,10 @@ prepare_batch(Shard, Messages = [{Time, _} | _], Options) -> prepare_batch(_Shard, [], _Options) -> ignore. +%% @doc Commit cooked batch to the storage. +%% +%% The underlying storage layout must guarantee that this operation is +%% idempotent. -spec commit_batch( shard_id(), cooked_batch(), From f84fb3469239cb95a2fbc7236de4023ca2df8f5f Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Mon, 1 Jul 2024 00:54:49 +0200 Subject: [PATCH 06/21] feat(ds_lts): New APIs: info, reverse lookups and topic compression --- apps/emqx_durable_storage/src/emqx_ds_lts.erl | 443 +++++++++++++++--- 1 file changed, 385 insertions(+), 58 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_lts.erl index be13591e6..c0625593a 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_lts.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_lts.erl @@ -26,7 +26,13 @@ trie_copy_learned_paths/2, topic_key/3, match_topics/2, - lookup_topic_key/2 + lookup_topic_key/2, + reverse_lookup/2, + info/2, + info/1, + + compress_topic/3, + decompress_topic/2 ]). %% Debug: @@ -34,18 +40,21 @@ -export_type([ options/0, + level/0, static_key/0, trie/0, - msg_storage_key/0 + msg_storage_key/0, + learned_structure/0 ]). -include_lib("stdlib/include/ms_transform.hrl"). -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). --endif. -elvis([{elvis_style, variable_naming_convention, disable}]). +-elvis([{elvis_style, dont_repeat_yourself, disable}]). +-endif. %%================================================================================ %% Type declarations @@ -55,15 +64,22 @@ -define(EOT, []). -define(PLUS, '+'). --type edge() :: binary() | ?EOT | ?PLUS. +-type level() :: binary() | ''. -%% Fixed size binary --type static_key() :: non_neg_integer(). +-type edge() :: level() | ?EOT | ?PLUS. +%% Fixed size binary or integer, depending on the options: +-type static_key() :: non_neg_integer() | binary(). + +%% Trie root: -define(PREFIX, prefix). +%% Special prefix root for reverse lookups: +-define(rlookup, rlookup). +-define(rlookup(STATIC), {?rlookup, STATIC}). + -type state() :: static_key() | ?PREFIX. --type varying() :: [binary() | ?PLUS]. +-type varying() :: [level() | ?PLUS]. -type msg_storage_key() :: {static_key(), varying()}. @@ -71,27 +87,42 @@ -type persist_callback() :: fun((_Key, _Val) -> ok). +-type learned_structure() :: [level() | ?PLUS, ...]. + -type options() :: #{ persist_callback => persist_callback(), - static_key_size => pos_integer() + %% If set, static key is an integer that fits in a given nubmer of bits: + static_key_bits => pos_integer(), + %% If set, static key is a _binary_ of a given length: + static_key_bytes => pos_integer(), + reverse_lookups => boolean() }. -type dump() :: [{_Key, _Val}]. -record(trie, { persist :: persist_callback(), + is_binary_key :: boolean(), static_key_size :: pos_integer(), trie :: ets:tid(), - stats :: ets:tid() + stats :: ets:tid(), + rlookups = false :: boolean() }). -opaque trie() :: #trie{}. --record(trans, { - key :: {state(), edge()}, - next :: state() -}). +-record(trans, {key, next}). + +-type trans() :: + #trans{ + key :: {state(), edge()}, + next :: state() + } + | #trans{ + key :: {?rlookup, static_key()}, + next :: [level() | ?PLUS] + }. 
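For reference, the new `options()` fields combine like this (a sketch; the sizes are arbitrary):

    %% Integer static keys, 32 bits wide (as used by bitfield_lts):
    T1 = emqx_ds_lts:trie_create(#{static_key_bits => 32}),
    %% Binary static keys, 16 bytes wide (the default), with the
    %% reverse lookup index enabled:
    T2 = emqx_ds_lts:trie_create(#{static_key_bytes => 16, reverse_lookups => true}).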
%%================================================================================ %% API functions @@ -100,21 +131,31 @@ %% @doc Create an empty trie -spec trie_create(options()) -> trie(). trie_create(UserOpts) -> - Defaults = #{ - persist_callback => fun(_, _) -> ok end, - static_key_size => 8 - }, - #{ - persist_callback := Persist, - static_key_size := StaticKeySize - } = maps:merge(Defaults, UserOpts), + Persist = maps:get( + persist_callback, + UserOpts, + fun(_, _) -> ok end + ), + Rlookups = maps:get(reverse_lookups, UserOpts, false), + IsBinaryKey = + case UserOpts of + #{static_key_bits := StaticKeySize} -> + false; + #{static_key_bytes := StaticKeySize} -> + true; + _ -> + StaticKeySize = 16, + true + end, Trie = ets:new(trie, [{keypos, #trans.key}, set, public]), Stats = ets:new(stats, [{keypos, 1}, set, public]), #trie{ persist = Persist, + is_binary_key = IsBinaryKey, static_key_size = StaticKeySize, trie = Trie, - stats = Stats + stats = Stats, + rlookups = Rlookups }. -spec trie_create() -> trie(). @@ -149,9 +190,21 @@ trie_dump(Trie, Filter) -> all -> Fun = fun(_) -> true end; wildcard -> - Fun = fun contains_wildcard/1 + Fun = fun(L) -> lists:member(?PLUS, L) end end, - lists:append([P || P <- paths(Trie), Fun(P)]). + Paths = lists:filter( + fun(Path) -> + Fun(tokens_of_path(Path)) + end, + paths(Trie) + ), + RlookupIdx = lists:filter( + fun({_, Tokens}) -> + Fun(Tokens) + end, + all_emanating(Trie, ?rlookup) + ), + lists:flatten([Paths, RlookupIdx]). -spec trie_copy_learned_paths(trie(), trie()) -> trie(). trie_copy_learned_paths(OldTrie, NewTrie) -> @@ -164,17 +217,17 @@ trie_copy_learned_paths(OldTrie, NewTrie) -> NewTrie. %% @doc Lookup the topic key. Create a new one, if not found. --spec topic_key(trie(), threshold_fun(), [binary() | '']) -> msg_storage_key(). +-spec topic_key(trie(), threshold_fun(), [level()]) -> msg_storage_key(). topic_key(Trie, ThresholdFun, Tokens) -> - do_topic_key(Trie, ThresholdFun, 0, ?PREFIX, Tokens, []). + do_topic_key(Trie, ThresholdFun, 0, ?PREFIX, Tokens, [], []). %% @doc Return an exisiting topic key if it exists. --spec lookup_topic_key(trie(), [binary()]) -> {ok, msg_storage_key()} | undefined. +-spec lookup_topic_key(trie(), [level()]) -> {ok, msg_storage_key()} | undefined. lookup_topic_key(Trie, Tokens) -> do_lookup_topic_key(Trie, ?PREFIX, Tokens, []). %% @doc Return list of keys of topics that match a given topic filter --spec match_topics(trie(), [binary() | '+' | '#']) -> +-spec match_topics(trie(), [level() | '+' | '#']) -> [msg_storage_key()]. match_topics(Trie, TopicFilter) -> do_match_topics(Trie, ?PREFIX, [], TopicFilter). @@ -206,7 +259,8 @@ dump_to_dot(#trie{trie = Trie, stats = Stats}, Filename) -> {ok, FD} = file:open(Filename, [write]), Print = fun (?PREFIX) -> "prefix"; - (NodeId) -> integer_to_binary(NodeId, 16) + (Bin) when is_binary(Bin) -> Bin; + (NodeId) when is_integer(NodeId) -> integer_to_binary(NodeId, 16) end, io:format(FD, "digraph {~n", []), lists:foreach( @@ -225,11 +279,64 @@ dump_to_dot(#trie{trie = Trie, stats = Stats}, Filename) -> io:format(FD, "}~n", []), file:close(FD). +-spec reverse_lookup(trie(), static_key()) -> {ok, learned_structure()} | undefined. +reverse_lookup(#trie{rlookups = false}, _) -> + error({badarg, reverse_lookups_disabled}); +reverse_lookup(#trie{trie = Trie}, StaticKey) -> + case ets:lookup(Trie, ?rlookup(StaticKey)) of + [#trans{next = Next}] -> + {ok, Next}; + [] -> + undefined + end. + +%% @doc Get information about the trie. 
+%% +%% Note: `reverse_lookups' must be enabled to get the number of +%% topics. +-spec info(trie(), size | topics) -> _. +info(#trie{rlookups = true, stats = Stats}, topics) -> + case ets:lookup(Stats, ?rlookup) of + [{_, N}] -> N; + [] -> 0 + end; +info(#trie{}, topics) -> + undefined; +info(#trie{trie = T}, size) -> + ets:info(T, size). + +%% @doc Return size of the trie +-spec info(trie()) -> proplists:proplist(). +info(Trie) -> + [ + {size, info(Trie, size)}, + {topics, info(Trie, topics)} + ]. + +%%%%%%%% Topic compression %%%%%%%%%% + +%% @doc Given topic structure for the static LTS index (as returned by +%% `reverse_lookup'), compress a topic filter to exclude static +%% levels: +-spec compress_topic(static_key(), learned_structure(), emqx_ds:topic_filter()) -> + [emqx_ds_lts:level() | '+']. +compress_topic(StaticKey, TopicStructure, TopicFilter) -> + compress_topic(StaticKey, TopicStructure, TopicFilter, []). + +%% @doc Given topic structure and a compressed topic filter, return +%% the original* topic filter. +%% +%% * '#' will be replaced with '+'s +-spec decompress_topic(learned_structure(), [level() | '+']) -> + emqx_ds:topic_filter(). +decompress_topic(TopicStructure, Topic) -> + decompress_topic(TopicStructure, Topic, []). + %%================================================================================ %% Internal exports %%================================================================================ --spec trie_next(trie(), state(), binary() | ?EOT) -> {Wildcard, state()} | undefined when +-spec trie_next(trie(), state(), level() | ?EOT) -> {Wildcard, state()} | undefined when Wildcard :: boolean(). trie_next(#trie{trie = Trie}, State, ?EOT) -> case ets:lookup(Trie, {State, ?EOT}) of @@ -261,16 +368,19 @@ trie_insert(Trie, State, Token) -> %% Internal functions %%================================================================================ --spec trie_insert(trie(), state(), edge(), state()) -> {Updated, state()} when - NChildren :: non_neg_integer(), - Updated :: false | NChildren. +-spec trie_insert + (trie(), state(), edge(), state()) -> {Updated, state()} when + NChildren :: non_neg_integer(), + Updated :: false | NChildren; + (trie(), ?rlookup, static_key(), [level() | '+']) -> + {false | non_neg_integer(), state()}. trie_insert(#trie{trie = Trie, stats = Stats, persist = Persist}, State, Token, NewState) -> Key = {State, Token}, Rec = #trans{ key = Key, next = NewState }, - case ets:insert_new(Trie, Rec) of + case ets_insert_new(Trie, Rec) of true -> ok = Persist(Key, NewState), Inc = @@ -287,7 +397,7 @@ trie_insert(#trie{trie = Trie, stats = Stats, persist = Persist}, State, Token, end. -spec get_id_for_key(trie(), state(), edge()) -> static_key(). -get_id_for_key(#trie{static_key_size = Size}, State, Token) when Size =< 32 -> +get_id_for_key(#trie{is_binary_key = IsBin, static_key_size = Size}, State, Token) -> %% Requirements for the return value: %% %% It should be globally unique for the `{State, Token}` pair. Other @@ -303,11 +413,17 @@ get_id_for_key(#trie{static_key_size = Size}, State, Token) when Size =< 32 -> %% If we want to impress computer science crowd, sorry, I mean to %% minimize storage requirements, we can even employ Huffman coding %% based on the frequency of messages. - <> = crypto:hash(sha256, term_to_binary([State | Token])), - Int. + Hash = crypto:hash(sha256, term_to_binary([State | Token])), + case IsBin of + false -> + <> = Hash, + Int; + true -> + element(1, erlang:split_binary(Hash, Size)) + end. 
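A round-trip sketch of the two compression functions declared above (`42` is an arbitrary static key and the structure is invented):

    Structure = [<<"foo">>, '+', <<"bar">>, '+'],
    %% Compression keeps only the varying (wildcard) positions:
    [<<"1">>, <<"2">>] =
        emqx_ds_lts:compress_topic(42, Structure, [<<"foo">>, <<"1">>, <<"bar">>, <<"2">>]),
    %% Decompression re-inserts the static levels:
    [<<"foo">>, <<"1">>, <<"bar">>, <<"2">>] =
        emqx_ds_lts:decompress_topic(Structure, [<<"1">>, <<"2">>]).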
%% erlfmt-ignore --spec do_match_topics(trie(), state(), [binary() | '+'], [binary() | '+' | '#']) -> +-spec do_match_topics(trie(), state(), [level() | '+'], [level() | '+' | '#']) -> list(). do_match_topics(Trie, State, Varying, []) -> case trie_next(Trie, State, ?EOT) of @@ -341,7 +457,7 @@ do_match_topics(Trie, State, Varying, [Level | Rest]) -> Emanating ). --spec do_lookup_topic_key(trie(), state(), [binary()], [binary()]) -> +-spec do_lookup_topic_key(trie(), state(), [level()], [level()]) -> {ok, msg_storage_key()} | undefined. do_lookup_topic_key(Trie, State, [], Varying) -> case trie_next(Trie, State, ?EOT) of @@ -360,29 +476,42 @@ do_lookup_topic_key(Trie, State, [Tok | Rest], Varying) -> undefined end. -do_topic_key(Trie, _, _, State, [], Varying) -> +do_topic_key(Trie, _, _, State, [], Tokens, Varying) -> %% We reached the end of topic. Assert: Trie node that corresponds %% to EOT cannot be a wildcard. - {_, false, Static} = trie_next_(Trie, State, ?EOT), + {Updated, false, Static} = trie_next_(Trie, State, ?EOT), + _ = + case Trie#trie.rlookups andalso Updated of + false -> + ok; + _ -> + trie_insert(Trie, rlookup, Static, lists:reverse(Tokens)) + end, {Static, lists:reverse(Varying)}; -do_topic_key(Trie, ThresholdFun, Depth, State, [Tok | Rest], Varying0) -> +do_topic_key(Trie, ThresholdFun, Depth, State, [Tok | Rest], Tokens, Varying0) -> % TODO: it's not necessary to call it every time. Threshold = ThresholdFun(Depth), + {NChildren, IsWildcard, NextState} = trie_next_(Trie, State, Tok), Varying = - case trie_next_(Trie, State, Tok) of - {NChildren, _, NextState} when is_integer(NChildren), NChildren >= Threshold -> + case IsWildcard of + _ when is_integer(NChildren), NChildren >= Threshold -> %% Number of children for the trie node reached the %% threshold, we need to insert wildcard here. {_, _WildcardState} = trie_insert(Trie, State, ?PLUS), Varying0; - {_, false, NextState} -> + false -> Varying0; - {_, true, NextState} -> + true -> %% This topic level is marked as wildcard in the trie, %% we need to add it to the varying part of the key: [Tok | Varying0] end, - do_topic_key(Trie, ThresholdFun, Depth + 1, NextState, Rest, Varying). + TokOrWildcard = + case IsWildcard of + true -> ?PLUS; + false -> Tok + end, + do_topic_key(Trie, ThresholdFun, Depth + 1, NextState, Rest, [TokOrWildcard | Tokens], Varying). %% @doc Has side effects! Inserts missing elements -spec trie_next_(trie(), state(), binary() | ?EOT) -> {New, Wildcard, state()} when @@ -450,12 +579,51 @@ follow_path(#trie{} = T, State, Path) -> all_emanating(T, State) ). -contains_wildcard([{{_State, ?PLUS}, _Next} | _Rest]) -> - true; -contains_wildcard([_ | Rest]) -> - contains_wildcard(Rest); -contains_wildcard([]) -> - false. +tokens_of_path([{{_State, Token}, _Next} | Rest]) -> + [Token | tokens_of_path(Rest)]; +tokens_of_path([]) -> + []. + +%% Wrapper for type checking only: +-compile({inline, ets_insert_new/2}). +-spec ets_insert_new(ets:tid(), trans()) -> boolean(). +ets_insert_new(Tid, Trans) -> + ets:insert_new(Tid, Trans). 
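To make the reverse-lookup bookkeeping above concrete (topics and thresholds invented): with a threshold high enough that no wildcards are learned, the structure recorded at the end-of-topic transition is simply the topic itself:

    T = emqx_ds_lts:trie_create(#{reverse_lookups => true}),
    ThresholdFun = fun(0) -> 1000; (_) -> 100 end,
    {Static, []} = emqx_ds_lts:topic_key(T, ThresholdFun, [<<"dev">>, <<"1">>]),
    %% do_topic_key recorded the learned structure under ?rlookup(Static):
    {ok, [<<"dev">>, <<"1">>]} = emqx_ds_lts:reverse_lookup(T, Static).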
+ +compress_topic(_StaticKey, [], [], Acc) -> + lists:reverse(Acc); +compress_topic(StaticKey, TStructL0, ['#'], Acc) -> + case TStructL0 of + [] -> + lists:reverse(Acc); + ['+' | TStructL] -> + compress_topic(StaticKey, TStructL, ['#'], ['+' | Acc]); + [_ | TStructL] -> + compress_topic(StaticKey, TStructL, ['#'], Acc) + end; +compress_topic(StaticKey, ['+' | TStructL], [Level | TopicL], Acc) -> + compress_topic(StaticKey, TStructL, TopicL, [Level | Acc]); +compress_topic(StaticKey, [Struct | TStructL], [Level | TopicL], Acc) when + Level =:= '+'; Level =:= Struct +-> + compress_topic(StaticKey, TStructL, TopicL, Acc); +compress_topic(StaticKey, TStructL, TopicL, _Acc) -> + %% Topic is mismatched with the structure. This should never + %% happen. LTS got corrupted? + Err = #{ + msg => 'Topic structure mismatch', + static_key => StaticKey, + input => TopicL, + structure => TStructL + }, + throw({unrecoverable, Err}). + +decompress_topic(['+' | TStructL], [Level | TopicL], Acc) -> + decompress_topic(TStructL, TopicL, [Level | Acc]); +decompress_topic([StaticLevel | TStructL], TopicL, Acc) -> + decompress_topic(TStructL, TopicL, [StaticLevel | Acc]); +decompress_topic([], [], Acc) -> + lists:reverse(Acc). %%================================================================================ %% Tests @@ -658,6 +826,76 @@ topic_match_test() -> dump_to_dot(T, filename:join("_build", atom_to_list(?FUNCTION_NAME) ++ ".dot")) end. +%% erlfmt-ignore +rlookup_test() -> + T = trie_create(#{reverse_lookups => true}), + Threshold = 2, + ThresholdFun = fun(0) -> 1000; + (_) -> Threshold + end, + {S1, []} = test_key(T, ThresholdFun, [1]), + {S11, []} = test_key(T, ThresholdFun, [1, 1]), + {S12, []} = test_key(T, ThresholdFun, [1, 2]), + {S111, []} = test_key(T, ThresholdFun, [1, 1, 1]), + {S11e, []} = test_key(T, ThresholdFun, [1, 1, '']), + %% Now add learned wildcards: + {S21, []} = test_key(T, ThresholdFun, [2, 1]), + {S22, []} = test_key(T, ThresholdFun, [2, 2]), + {S2_, [<<"3">>]} = test_key(T, ThresholdFun, [2, 3]), + {S2_11, [<<"3">>]} = test_key(T, ThresholdFun, [2, 3, 1, 1]), + {S2_12, [<<"4">>]} = test_key(T, ThresholdFun, [2, 4, 1, 2]), + {S2_1_, [<<"3">>, <<"3">>]} = test_key(T, ThresholdFun, [2, 3, 1, 3]), + %% Check reverse matching: + ?assertEqual({ok, [<<"1">>]}, reverse_lookup(T, S1)), + ?assertEqual({ok, [<<"1">>, <<"1">>]}, reverse_lookup(T, S11)), + ?assertEqual({ok, [<<"1">>, <<"2">>]}, reverse_lookup(T, S12)), + ?assertEqual({ok, [<<"1">>, <<"1">>, <<"1">>]}, reverse_lookup(T, S111)), + ?assertEqual({ok, [<<"1">>, <<"1">>, '']}, reverse_lookup(T, S11e)), + ?assertEqual({ok, [<<"2">>, <<"1">>]}, reverse_lookup(T, S21)), + ?assertEqual({ok, [<<"2">>, <<"2">>]}, reverse_lookup(T, S22)), + ?assertEqual({ok, [<<"2">>, '+']}, reverse_lookup(T, S2_)), + ?assertEqual({ok, [<<"2">>, '+', <<"1">>, <<"1">>]}, reverse_lookup(T, S2_11)), + ?assertEqual({ok, [<<"2">>, '+', <<"1">>, <<"2">>]}, reverse_lookup(T, S2_12)), + ?assertEqual({ok, [<<"2">>, '+', <<"1">>, '+']}, reverse_lookup(T, S2_1_)), + %% Dump and restore trie to make sure rlookup still works: + T1 = trie_restore(#{reverse_lookups => true}, trie_dump(T, all)), + destroy(T), + ?assertEqual({ok, [<<"2">>, <<"1">>]}, reverse_lookup(T1, S21)), + ?assertEqual({ok, [<<"2">>, '+', <<"1">>, '+']}, reverse_lookup(T1, S2_1_)). 
+ +n_topics_test() -> + Threshold = 3, + ThresholdFun = fun + (0) -> 1000; + (_) -> Threshold + end, + + T = trie_create(#{reverse_lookups => true}), + ?assertEqual(0, info(T, topics)), + {S11, []} = test_key(T, ThresholdFun, [1, 1]), + {S11, []} = test_key(T, ThresholdFun, [1, 1]), + ?assertEqual(1, info(T, topics)), + + {S12, []} = test_key(T, ThresholdFun, [1, 2]), + {S12, []} = test_key(T, ThresholdFun, [1, 2]), + ?assertEqual(2, info(T, topics)), + + {_S13, []} = test_key(T, ThresholdFun, [1, 3]), + ?assertEqual(3, info(T, topics)), + + {S1_, [_]} = test_key(T, ThresholdFun, [1, 4]), + ?assertEqual(4, info(T, topics)), + + {S1_, [_]} = test_key(T, ThresholdFun, [1, 5]), + {S1_, [_]} = test_key(T, ThresholdFun, [1, 6]), + {S1_, [_]} = test_key(T, ThresholdFun, [1, 7]), + ?assertEqual(4, info(T, topics)), + + ?assertMatch( + [{size, N}, {topics, 4}] when is_integer(N), + info(T) + ). + -define(keys_history, topic_key_history). %% erlfmt-ignore @@ -773,11 +1011,16 @@ paths_test() -> ), %% Test filter function for paths containing wildcards - WildcardPaths = lists:filter(fun contains_wildcard/1, Paths), + WildcardPaths = lists:filter( + fun(Path) -> + lists:member(?PLUS, tokens_of_path(Path)) + end, + Paths + ), FormattedWildcardPaths = lists:map(fun format_path/1, WildcardPaths), ?assertEqual( - sets:from_list(FormattedWildcardPaths, [{version, 2}]), sets:from_list(lists:map(FormatPathSpec, ExpectedWildcardPaths), [{version, 2}]), + sets:from_list(FormattedWildcardPaths, [{version, 2}]), #{ expected => ExpectedWildcardPaths, wildcards => FormattedWildcardPaths @@ -795,13 +1038,97 @@ paths_test() -> #trie{trie = Tab2} = T2, Dump1 = sets:from_list(ets:tab2list(Tab1), [{version, 2}]), Dump2 = sets:from_list(ets:tab2list(Tab2), [{version, 2}]), - ?assertEqual(Dump1, Dump2), - - ok. + ?assertEqual(Dump1, Dump2). format_path([{{_State, Edge}, _Next} | Rest]) -> [Edge | format_path(Rest)]; format_path([]) -> []. 
+compress_topic_test() -> + %% Structure without wildcards: + ?assertEqual([], compress_topic(42, [], [])), + ?assertEqual([], compress_topic(42, [<<"foo">>, <<"bar">>], [<<"foo">>, <<"bar">>])), + ?assertEqual([], compress_topic(42, [<<"foo">>, ''], [<<"foo">>, ''])), + ?assertEqual([], compress_topic(42, [<<"foo">>, ''], [<<"foo">>, '+'])), + ?assertEqual([], compress_topic(42, [<<"foo">>, ''], ['+', '+'])), + ?assertEqual([], compress_topic(42, [<<"foo">>, <<"bar">>, ''], ['#'])), + ?assertEqual([], compress_topic(42, [<<"foo">>, <<"bar">>, ''], [<<"foo">>, <<"bar">>, '#'])), + ?assertEqual([], compress_topic(42, [<<"foo">>, <<"bar">>, ''], ['+', '#'])), + ?assertEqual( + [], compress_topic(42, [<<"foo">>, <<"bar">>, ''], [<<"foo">>, <<"bar">>, '', '#']) + ), + %% With wildcards: + ?assertEqual( + [<<"1">>], compress_topic(42, [<<"foo">>, '+', <<"bar">>], [<<"foo">>, <<"1">>, <<"bar">>]) + ), + ?assertEqual( + [<<"1">>, <<"2">>], + compress_topic( + 42, + [<<"foo">>, '+', <<"bar">>, '+', <<"baz">>], + [<<"foo">>, <<"1">>, <<"bar">>, <<"2">>, <<"baz">>] + ) + ), + ?assertEqual( + ['+', <<"2">>], + compress_topic( + 42, + [<<"foo">>, '+', <<"bar">>, '+', <<"baz">>], + [<<"foo">>, '+', <<"bar">>, <<"2">>, <<"baz">>] + ) + ), + ?assertEqual( + ['+', '+'], + compress_topic( + 42, + [<<"foo">>, '+', <<"bar">>, '+', <<"baz">>], + [<<"foo">>, '+', <<"bar">>, '+', <<"baz">>] + ) + ), + ?assertEqual( + ['+', '+'], + compress_topic( + 42, + [<<"foo">>, '+', <<"bar">>, '+', <<"baz">>], + ['#'] + ) + ), + ?assertEqual( + ['+', '+'], + compress_topic( + 42, + [<<"foo">>, '+', <<"bar">>, '+', <<"baz">>], + [<<"foo">>, '+', '+', '#'] + ) + ), + %% Mismatch: + ?assertException(_, {unrecoverable, _}, compress_topic(42, [<<"foo">>], [<<"bar">>])), + ?assertException(_, {unrecoverable, _}, compress_topic(42, [], [<<"bar">>])), + ?assertException(_, {unrecoverable, _}, compress_topic(42, [<<"foo">>], [])), + ?assertException(_, {unrecoverable, _}, compress_topic(42, ['', ''], ['', '', ''])), + ?assertException(_, {unrecoverable, _}, compress_topic(42, ['', ''], [<<"foo">>, '#'])), + ?assertException(_, {unrecoverable, _}, compress_topic(42, ['', ''], ['+', '+', '+', '#'])), + ?assertException(_, {unrecoverable, _}, compress_topic(42, ['+'], [<<"bar">>, '+'])), + ?assertException( + _, {unrecoverable, _}, compress_topic(42, [<<"foo">>, '+'], [<<"bar">>, <<"baz">>]) + ). + +decompress_topic_test() -> + %% Structure without wildcards: + ?assertEqual([], decompress_topic([], [])), + ?assertEqual( + [<<"foo">>, '', <<"bar">>], + decompress_topic([<<"foo">>, '', <<"bar">>], []) + ), + %% With wildcards: + ?assertEqual( + [<<"foo">>, '', <<"bar">>, <<"baz">>], + decompress_topic([<<"foo">>, '+', <<"bar">>, '+'], ['', <<"baz">>]) + ), + ?assertEqual( + [<<"foo">>, '+', <<"bar">>, '+', ''], + decompress_topic([<<"foo">>, '+', <<"bar">>, '+', ''], ['+', '+']) + ). + -endif. 
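Putting PATCH 06 together, an end-to-end sketch of wildcard learning, reverse lookup and topic compression (thresholds and topics invented; the flow mirrors `rlookup_test/0`):

    T = emqx_ds_lts:trie_create(#{reverse_lookups => true}),
    ThresholdFun = fun(0) -> 1000; (_) -> 2 end,
    Learn = fun(Topic) -> emqx_ds_lts:topic_key(T, ThresholdFun, Topic) end,
    _ = Learn([<<"dev">>, <<"1">>, <<"temp">>]),
    _ = Learn([<<"dev">>, <<"2">>, <<"temp">>]),    %% threshold reached: '+' is inserted
    %% Later siblings take the learned wildcard branch:
    {Static, [<<"3">>]} = Learn([<<"dev">>, <<"3">>, <<"temp">>]),
    {ok, [<<"dev">>, '+', <<"temp">>]} = emqx_ds_lts:reverse_lookup(T, Static),
    [<<"3">>] = emqx_ds_lts:compress_topic(
        Static, [<<"dev">>, '+', <<"temp">>], [<<"dev">>, <<"3">>, <<"temp">>]
    ).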
From 843973ef32441ba4e4458f5af62dd5225de684c7 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Mon, 1 Jul 2024 01:11:40 +0200 Subject: [PATCH 07/21] fix(ds): bitfield_lts: static_key_size -> static_key_bits --- apps/emqx_durable_storage/src/emqx_ds_lts.erl | 6 +++++- .../src/emqx_ds_storage_bitfield_lts.erl | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_lts.erl index c0625593a..0a7b6c59a 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_lts.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_lts.erl @@ -416,7 +416,11 @@ get_id_for_key(#trie{is_binary_key = IsBin, static_key_size = Size}, State, Toke Hash = crypto:hash(sha256, term_to_binary([State | Token])), case IsBin of false -> - <> = Hash, + %% Note: for backward compatibility with bitstream_lts + %% layout we allow the key to be an integer. But this also + %% changes the semantics of `static_key_size` from number + %% of bytes to bits: + <> = Hash, Int; true -> element(1, erlang:split_binary(Hash, Size)) diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl index 23225a5f6..182fe8cf0 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl @@ -905,7 +905,7 @@ restore_trie(TopicIndexBytes, DB, CF) -> {ok, IT} = rocksdb:iterator(DB, CF, []), try Dump = read_persisted_trie(IT, rocksdb:iterator_move(IT, first)), - TrieOpts = #{persist_callback => PersistCallback, static_key_size => TopicIndexBytes}, + TrieOpts = #{persist_callback => PersistCallback, static_key_bits => TopicIndexBytes * 8}, emqx_ds_lts:trie_restore(TrieOpts, Dump) after rocksdb:iterator_close(IT) From 210556e5453ded27235c246e661a94fbbe0815ca Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Mon, 1 Jul 2024 00:58:02 +0200 Subject: [PATCH 08/21] feat(ds): Generalize value serialization - Add a new utility module for message serialization - Add experimental serializer based on ASN.1 --- .../asn.1/DurableMessage.asn | 90 +++ apps/emqx_durable_storage/gen_src/.gitignore | 4 + apps/emqx_durable_storage/rebar.config | 7 +- .../src/emqx_ds_msg_serializer.erl | 515 ++++++++++++++++++ rebar.config | 3 +- 5 files changed, 617 insertions(+), 2 deletions(-) create mode 100644 apps/emqx_durable_storage/asn.1/DurableMessage.asn create mode 100644 apps/emqx_durable_storage/gen_src/.gitignore create mode 100644 apps/emqx_durable_storage/src/emqx_ds_msg_serializer.erl diff --git a/apps/emqx_durable_storage/asn.1/DurableMessage.asn b/apps/emqx_durable_storage/asn.1/DurableMessage.asn new file mode 100644 index 000000000..88897c460 --- /dev/null +++ b/apps/emqx_durable_storage/asn.1/DurableMessage.asn @@ -0,0 +1,90 @@ +-- This schema specifies binary encoding of EMQX's internal +-- representation of a message. +-- +-- Note: MQTT standard specifies that certain properties like topic +-- should be UTF8 strings. Here we represent them as OCTET STRING to +-- avoid extra checks. + +DurableMessage DEFINITIONS AUTOMATIC TAGS ::= +BEGIN + -- Non-standard flag: + MiscFlag ::= SEQUENCE { + key UTF8String, + value BOOLEAN + } + + -- Non-standard header or property. 
+ -- Both key and value are interpreted as erlang terms: + MiscProperty ::= SEQUENCE { + key OCTET STRING, + value OCTET STRING + } + + ClientAttr ::= SEQUENCE { + key OCTET STRING, + value OCTET STRING + } + + -- Wrapper for any data that doesn't comply with the strict schema: + Misc ::= CHOICE { + flag MiscFlag, + header MiscProperty, + property MiscProperty, + -- Currently these are unused: + clientAttr ClientAttr, + extra MiscProperty + } + + -- Both key and value are interpreted as binaries: + UserProperty ::= SEQUENCE { + key OCTET STRING, + value OCTET STRING + } + + -- Common properties that are present in almost any message: + StdProperties ::= SEQUENCE { + payloadFormatIndicator INTEGER (0..255) OPTIONAL, + messageExpiryInterval INTEGER (0..4294967295) OPTIONAL, + responseTopic OCTET STRING OPTIONAL, + correlationData OCTET STRING OPTIONAL, + contentType OCTET STRING OPTIONAL, + userProperty SEQUENCE OF UserProperty + } + + ProtoVer ::= CHOICE { + mqtt INTEGER(0..255), + mqtt-sn INTEGER(0..255), + coap INTEGER(0..255) + } + + -- Common headers that are present in almost any message: + StdHeaders ::= SEQUENCE { + protoVer ProtoVer OPTIONAL, + peerhost OCTET STRING (SIZE(4..16)) OPTIONAL, -- IPv4 (4 octets) .. IPv6 (16 octets) + peername OCTET STRING (SIZE(6..18)) OPTIONAL, -- IPv4 (4 octets) .. IPv6 (16 octets) + 2 octets for (TCP/UDP) port + username OCTET STRING OPTIONAL + } + + From ::= CHOICE { + atom UTF8String, + binary OCTET STRING + } + + DurableMessage ::= SEQUENCE { + id OCTET STRING, + from From, + topic OCTET STRING, + payload OCTET STRING, + timestamp INTEGER, + qos INTEGER (0..2), + -- MQTT PUBLISH flags: + sys BOOLEAN, + dup BOOLEAN, + retain BOOLEAN, + -- Headers: + headers StdHeaders, + properties StdProperties, + -- Miscellaneous, highly EMQX-specific internal data: + misc SEQUENCE OF Misc OPTIONAL + } +END diff --git a/apps/emqx_durable_storage/gen_src/.gitignore b/apps/emqx_durable_storage/gen_src/.gitignore new file mode 100644 index 000000000..53ae84991 --- /dev/null +++ b/apps/emqx_durable_storage/gen_src/.gitignore @@ -0,0 +1,4 @@ +*.hrl +*.erl +*.beam +*.asn1db diff --git a/apps/emqx_durable_storage/rebar.config b/apps/emqx_durable_storage/rebar.config index 3a6379924..a78649d92 100644 --- a/apps/emqx_durable_storage/rebar.config +++ b/apps/emqx_durable_storage/rebar.config @@ -1,3 +1,8 @@ %% -*- mode:erlang -*- - {deps, [{emqx_utils, {path, "../emqx_utils"}}]}. + +{erl_opts, [{src_dirs, ["src", "gen_src"]}]}. + +{pre_hooks, [ + {"(linux|darwin|solaris)", compile, "erlc -bper +noobj -o gen_src asn.1/DurableMessage.asn"} +]}. diff --git a/apps/emqx_durable_storage/src/emqx_ds_msg_serializer.erl b/apps/emqx_durable_storage/src/emqx_ds_msg_serializer.erl new file mode 100644 index 000000000..11bd34d68 --- /dev/null +++ b/apps/emqx_durable_storage/src/emqx_ds_msg_serializer.erl @@ -0,0 +1,515 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+%% @doc This utility module provides a generic method for encoding
+%% (and decoding) MQTT messages at rest.
+%%
+%% Note to developer: backward compatibility has to be maintained at
+%% all times, for all releases.
+-module(emqx_ds_msg_serializer).
+
+%% API:
+-export([serialize/2, deserialize/2, check_schema/1]).
+
+%% internal exports:
+-export([]).
+
+-include_lib("emqx_utils/include/emqx_message.hrl").
+-include_lib("typerefl/include/types.hrl").
+-include("../gen_src/DurableMessage.hrl").
+
+-ifdef(TEST).
+-include_lib("eunit/include/eunit.hrl").
+-endif.
+
+-elvis([{elvis_style, atom_naming_convention, disable}]).
+-dialyzer({nowarn_function, [serialize_asn1/1, deserialize_asn1/1]}).
+
+%%================================================================================
+%% Type declarations
+%%================================================================================
+
+%% FIXME: Proper reflection fails dialyzer check due to a wrong spec
+%% in typerefl
+-type schema() :: term().
+
+-reflect_type([schema/0]).
+
+%%================================================================================
+%% API functions
+%%================================================================================
+
+-spec check_schema(schema()) -> ok | {error, _}.
+check_schema(v1) ->
+    ok;
+check_schema(asn1) ->
+    ok;
+check_schema(_) ->
+    {error, "Unknown schema type"}.
+
+-spec serialize(schema(), emqx_types:message()) -> binary().
+serialize(v1, Msg) ->
+    serialize_v1(Msg);
+serialize(asn1, Msg) ->
+    serialize_asn1(Msg).
+
+-spec deserialize(schema(), binary()) -> emqx_types:message().
+deserialize(v1, Blob) ->
+    deserialize_v1(Blob);
+deserialize(asn1, Blob) ->
+    deserialize_asn1(Blob).
+
+%%================================================================================
+%% Internal functions
+%%================================================================================
+
+%%--------------------------------------------------------------------------------
+%% V1 (erlang:term_to_binary/binary_to_term). Simple, but not the most
+%% space- and CPU-efficient encoding
+%% --------------------------------------------------------------------------------
+
+serialize_v1(Msg) ->
+    term_to_binary(message_to_value_v1(Msg)).
+
+message_to_value_v1(#message{
+    id = Id,
+    qos = Qos,
+    from = From,
+    flags = Flags,
+    headers = Headers,
+    topic = Topic,
+    payload = Payload,
+    timestamp = Timestamp,
+    extra = Extra
+}) ->
+    {Id, Qos, From, Flags, Headers, Topic, Payload, Timestamp, Extra}.
+
+deserialize_v1(Blob) ->
+    value_v1_to_message(binary_to_term(Blob)).
+
+value_v1_to_message({Id, Qos, From, Flags, Headers, Topic, Payload, Timestamp, Extra}) ->
+    #message{
+        id = Id,
+        qos = Qos,
+        from = From,
+        flags = Flags,
+        headers = Headers,
+        topic = Topic,
+        payload = Payload,
+        timestamp = Timestamp,
+        extra = Extra
+    }.
+
+%%--------------------------------------------------------------------------------
+%% Encoding based on ASN.1.
+%%-------------------------------------------------------------------------------- + +serialize_asn1(#message{ + id = Id, + qos = Qos, + from = From0, + flags = Flags, + topic = Topic, + payload = Payload, + timestamp = Timestamp, + headers = Headers +}) -> + MiscFlags = maps:fold( + fun + (Key, Val, Acc) when Key =/= sys, Key =/= dup, Key =/= retain -> + [asn1_encode_misc(flag, Key, Val) | Acc]; + (_, _, Acc) -> + Acc + end, + [], + Flags + ), + {StdHeaders, StdProps, MiscHeaders} = asn1_encode_headers(Headers), + {ok, Bin} = 'DurableMessage':encode('DurableMessage', #'DurableMessage'{ + id = Id, + from = + case is_atom(From0) of + true -> {atom, erlang:atom_to_binary(From0, utf8)}; + false -> {binary, From0} + end, + topic = Topic, + payload = iolist_to_binary(Payload), + timestamp = Timestamp, + + qos = Qos, + sys = maps:get(sys, Flags, false), + dup = maps:get(dup, Flags, false), + retain = maps:get(retain, Flags, false), + + properties = StdProps, + headers = StdHeaders, + + %% TODO: store client attrs? + misc = MiscFlags ++ MiscHeaders + }), + Bin. + +deserialize_asn1(Blob) -> + {ok, #'DurableMessage'{ + id = Id, + from = From0, + topic = Topic, + payload = Payload, + timestamp = Timestamp, + qos = Qos, + + sys = Sys, + dup = Dup, + retain = Retain, + + headers = StdHeaders, + properties = StdProperties, + + misc = Misc + }} = 'DurableMessage':decode('DurableMessage', Blob), + From = + case From0 of + {atom, Bin} -> erlang:binary_to_atom(Bin, utf8); + {binary, Bin} -> Bin + end, + %% Decode flags: + Flags = #{sys => Sys, dup => Dup, retain => Retain}, + asn1_deserialize_misc(Misc, #message{ + id = Id, + qos = Qos, + from = From, + topic = Topic, + payload = Payload, + timestamp = Timestamp, + flags = Flags, + headers = asn1_decode_headers(StdHeaders, StdProperties) + }). + +asn1_encode_headers(Headers) -> + PeerName = + case Headers of + #{peername := {IP1, Port}} -> encode_ip_port(16, IP1, Port); + _ -> asn1_NOVALUE + end, + PeerHost = + case Headers of + #{peerhost := IP2} -> encode_ip_port(0, IP2, 0); + _ -> asn1_NOVALUE + end, + ProtoVer = asn1_encode_proto_ver(Headers), + StdHeaders = #'StdHeaders'{ + protoVer = ProtoVer, + peername = PeerName, + peerhost = PeerHost, + username = + case Headers of + #{username := U} when is_binary(U) -> U; + _ -> asn1_NOVALUE + end + }, + {StdProps, MiscProps} = asn1_encode_properties(maps:get(properties, Headers, #{})), + MiscHeaders = maps:fold( + fun + (Header, _V, Acc) when + Header =:= properties; Header =:= username; Header =:= client_attrs + -> + Acc; + (protocol, _V, Acc) when ProtoVer =/= asn1_NOVALUE -> + Acc; + (proto_ver, _V, Acc) when ProtoVer =/= asn1_NOVALUE -> + Acc; + (peername, _V, Acc) when PeerName =/= asn1_NOVALUE -> + Acc; + (peerhost, _V, Acc) when PeerHost =/= asn1_NOVALUE -> + Acc; + %% Add headers that could not be encoded using fixed schema: + (Key, Val, Acc) -> + [asn1_encode_misc(header, Key, Val) | Acc] + end, + [], + Headers + ), + {StdHeaders, StdProps, MiscHeaders ++ MiscProps}. 
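The intent of the header split above, illustrated with invented values: whatever fits the fixed ASN.1 fields goes into `StdHeaders`/`StdProperties`, and everything else falls through to a term-encoded `Misc` entry, so unexpected headers are preserved rather than lost:

    Headers = #{
        %% Encoded compactly via the fixed schema:
        protocol => mqtt,
        proto_ver => 5,
        peername => {{127, 0, 0, 1}, 1883},
        username => <<"alice">>,
        %% No dedicated field; becomes a Misc entry with
        %% term_to_binary'd key and value:
        some_custom_header => {foo, [1, 2, 3]}
    }.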
+ +asn1_encode_properties(Props) -> + UserProps = maps:get('User-Property', Props, []), + StdProperties = #'StdProperties'{ + payloadFormatIndicator = asn1_std_prop('Payload-Format-Indicator', Props), + messageExpiryInterval = asn1_std_prop('Message-Expiry-Interval', Props), + responseTopic = asn1_std_prop('Response-Topic', Props), + correlationData = asn1_std_prop('Correlation-Data', Props), + contentType = asn1_std_prop('Content-Type', Props), + userProperty = [#'UserProperty'{key = K, value = V} || {K, V} <- UserProps] + }, + MiscProperties = maps:fold( + fun + (K, V, Acc) when + K =/= 'Payload-Format-Indicator', + K =/= 'Message-Expiry-Interval', + K =/= 'Response-Topic', + K =/= 'Correlation-Data', + K =/= 'Content-Type', + K =/= 'User-Property' + -> + [asn1_encode_misc(property, K, V) | Acc]; + (_, _, Acc) -> + Acc + end, + [], + Props + ), + {StdProperties, MiscProperties}. + +asn1_encode_misc(header, Key, Val) -> + {header, #'MiscProperty'{ + key = term_to_binary(Key), value = term_to_binary(Val) + }}; +asn1_encode_misc(property, Key, Val) -> + {property, #'MiscProperty'{ + key = term_to_binary(Key), value = term_to_binary(Val) + }}; +asn1_encode_misc(flag, Key, Val) -> + {flag, #'MiscFlag'{ + key = atom_to_binary(Key, utf8), value = Val + }}. + +asn1_std_prop(Key, Map) -> + case Map of + #{Key := Val} -> Val; + _ -> asn1_NOVALUE + end. + +asn1_decode_headers( + #'StdHeaders'{ + protoVer = ProtoVer, peerhost = Peerhost, peername = Peername, username = Username + }, + StdProperties +) -> + M0 = asn1_decode_properties(StdProperties), + M1 = + case ProtoVer of + asn1_NOVALUE -> M0; + {Protocol, Ver} -> M0#{protocol => Protocol, proto_ver => Ver} + end, + M2 = asn1_add_optional(peername, decode_ip_port(16, Peername), M1), + M3 = + case decode_ip_port(0, Peerhost) of + asn1_NOVALUE -> M2; + {PeerIP, _} -> M2#{peerhost => PeerIP} + end, + asn1_add_optional(username, Username, M3). + +asn1_decode_properties(#'StdProperties'{ + payloadFormatIndicator = PFI, + userProperty = UP, + messageExpiryInterval = MEI, + responseTopic = RT, + correlationData = CD, + contentType = CT +}) -> + M0 = + case [{K, V} || #'UserProperty'{key = K, value = V} <- UP] of + [] -> #{}; + UserProps -> #{'User-Property' => UserProps} + end, + M1 = asn1_add_optional('Payload-Format-Indicator', PFI, M0), + M2 = asn1_add_optional('Message-Expiry-Interval', MEI, M1), + M3 = asn1_add_optional('Response-Topic', RT, M2), + M4 = asn1_add_optional('Correlation-Data', CD, M3), + M5 = asn1_add_optional('Content-Type', CT, M4), + case maps:size(M5) of + 0 -> #{}; + _ -> #{properties => M5} + end. + +asn1_add_optional(_Key, asn1_NOVALUE, Acc) -> Acc; +asn1_add_optional(Key, Val, Acc) -> maps:put(Key, Val, Acc). + +-define(IS_VER(V), is_integer(V), V >= 0, V =< 255). + +asn1_encode_proto_ver(#{protocol := mqtt, proto_ver := V}) when ?IS_VER(V) -> + {mqtt, V}; +asn1_encode_proto_ver(#{protocol := 'mqtt-sn', proto_ver := V}) when ?IS_VER(V) -> + {'mqtt-sn', V}; +asn1_encode_proto_ver(#{protocol := coap, proto_ver := V}) when ?IS_VER(V) -> + {coap, V}; +asn1_encode_proto_ver(_) -> + asn1_NOVALUE. + +-undef(IS_VER). 
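Taken together, the intended use of the module is a plain round-trip; a sketch, assuming `emqx_message:make/4` for constructing the record:

    Msg = emqx_message:make(<<"client1">>, 1, <<"t/1">>, <<"hello">>),
    Blob = emqx_ds_msg_serializer:serialize(asn1, Msg),
    Msg1 = emqx_ds_msg_serializer:deserialize(asn1, Blob),
    %% Msg1 is equivalent to Msg modulo canonicalization (e.g. flags
    %% that are `false' may be dropped); the v1 schema round-trips
    %% the same way:
    Msg = emqx_ds_msg_serializer:deserialize(v1, emqx_ds_msg_serializer:serialize(v1, Msg)).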
+
+asn1_deserialize_misc(asn1_NOVALUE, Message) ->
+    Message;
+asn1_deserialize_misc(MiscData, Message0) ->
+    lists:foldl(
+        fun
+            ({flag, #'MiscFlag'{key = Key, value = Val}}, Acc) ->
+                Flags = maps:put(binary_to_atom(Key, utf8), Val, Acc#message.flags),
+                Acc#message{flags = Flags};
+            ({header, #'MiscProperty'{key = Key, value = Val}}, Acc) ->
+                Headers = maps:put(binary_to_term(Key), binary_to_term(Val), Acc#message.headers),
+                Acc#message{headers = Headers};
+            ({property, #'MiscProperty'{key = Key, value = Val}}, Acc) ->
+                #message{headers = Headers0} = Acc,
+                Headers = maps:update_with(
+                    properties,
+                    fun(Props) ->
+                        maps:put(binary_to_term(Key), binary_to_term(Val), Props)
+                    end,
+                    Headers0
+                ),
+                Acc#message{headers = Headers};
+            ({clientAttr, #'ClientAttr'{key = Key, value = Val}}, Acc) ->
+                #message{headers = Headers0} = Acc,
+                Headers = maps:update_with(
+                    client_attrs,
+                    fun(Props) ->
+                        maps:put(Key, Val, Props)
+                    end,
+                    Headers0
+                ),
+                Acc#message{headers = Headers};
+            ({extra, #'MiscProperty'{key = Key, value = Val}}, Acc) ->
+                Extra = maps:put(binary_to_term(Key), binary_to_term(Val), Acc#message.extra),
+                Acc#message{extra = Extra}
+        end,
+        Message0,
+        MiscData
+    ).
+
+encode_ip_port(PortSize, {A0, A1, A2, A3}, Port) ->
+    <<A0:8, A1:8, A2:8, A3:8, Port:PortSize>>;
+encode_ip_port(PortSize, {A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, AA, AB, AC, AD, AE, AF}, Port) ->
+    <<A0:8, A1:8, A2:8, A3:8, A4:8, A5:8, A6:8, A7:8, A8:8, A9:8, AA:8, AB:8, AC:8, AD:8, AE:8,
+        AF:8, Port:PortSize>>;
+encode_ip_port(_, _, _) ->
+    asn1_NOVALUE.
+
+decode_ip_port(PortSize, Blob) ->
+    case Blob of
+        <<A0:8, A1:8, A2:8, A3:8, Port:PortSize>> ->
+            {{A0, A1, A2, A3}, Port};
+        <<A0:8, A1:8, A2:8, A3:8, A4:8, A5:8, A6:8, A7:8, A8:8, A9:8, AA:8, AB:8, AC:8, AD:8,
+            AE:8, AF:8, Port:PortSize>> ->
+            {{A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, AA, AB, AC, AD, AE, AF}, Port};
+        _ ->
+            asn1_NOVALUE
+    end.
+
+-ifdef(TEST).
+
+test_messages() ->
+    [
+        #message{
+            id = <<"message_id_val">>,
+            qos = 2,
+            from = <<"from_val">>,
+            flags = #{sys => true, dup => true},
+            topic = <<"topic/value">>,
+            payload = [<<"foo">>, <<"bar">>],
+            timestamp = 42424242,
+            extra = #{}
+        },
+        #message{
+            id = <<0, 6, 28, 54, 12, 158, 221, 191, 244, 69, 0, 0, 13, 214, 0, 3>>,
+            qos = 0,
+            from = <<"MzE3MjU5NzA4NDY3MzcwNzg0NDYxNzI5NDg0NDk4NTM0NDA">>,
+            flags = #{dup => true, retain => true, sys => true},
+            headers = #{
+                peername => {{127, 0, 0, 1}, 34560},
+                protocol => mqtt,
+                username => <<"foobar">>,
+                proto_ver => 5,
+                peerhost => {1, 1, 1, 1},
+                properties =>
+                    #{
+                        'Content-Type' => <<"text/json">>,
+                        'User-Property' => [{<<"foo">>, <<"bar">>}, {<<"baz">>, <<"quux">>}],
+                        'Message-Expiry-Interval' => 10001,
+                        'Payload-Format-Indicator' => 1
+                    }
+            },
+            topic = <<"foo/bar">>,
+            payload = <<"foo">>,
+            timestamp = 1719868325813,
+            extra = #{}
+        },
+        #message{
+            id = <<>>,
+            from = undefined,
+            flags = #{other_flag => true},
+            headers = #{
+                properties =>
+                    #{
+                        'Payload-Format-Indicator' => 1,
+                        'Message-Expiry-Interval' => 1 bsl 32 - 1,
+                        'Response-Topic' => <<"foo/bar/baz">>,
+                        'Correlation-Data' => <<"correlation data">>,
+                        'Content-Type' => <<"text/json">>,
+                        'User-Property' => [{<<"foo">>, <<"bar">>}, {<<"baz">>, <<"quux">>}],
+                        junk => garbage,
+                        {34, 33, 2} => more_garbage
+                    },
+                junk => garbage
+            },
+            topic = <<"foo/bar">>,
+            payload = <<"foo">>,
+            timestamp = 171986,
+            extra = #{}
+        },
+        #message{
+            id = <<>>,
+            from = undefined,
+            headers = #{
+                protocol => "some_protocol",
+                proto_ver => 42,
+                peername => "some.fancy.peername:222",
+                peerhost => "some.fancy.peerhost"
+            },
+            topic = <<"foo/bar">>,
+            payload = <<"foo">>,
+            timestamp = 171986,
+            extra = #{}
+        }
+    ].
+
+v1_serialize_deserialize_test_() ->
+    [
+        assert_transcode(v1, Msg)
+        || Msg <- test_messages()
+    ].
+ +asn1_serialize_deserialize_test_() -> + [ + assert_transcode(asn1, Msg) + || Msg <- test_messages() + ]. + +assert_transcode(Schema, Msg) -> + fun() -> + Blob = serialize(Schema, Msg), + ?debugFmt("encoded size (~p) = ~p~n", [Schema, size(Blob)]), + assert_eq(Msg, deserialize(Schema, Blob)) + end. + +assert_eq(Expect, Got) -> + ?assertEqual( + emqx_ds_test_helpers:message_canonical_form(Expect), + emqx_ds_test_helpers:message_canonical_form(Got), + {Expect, Got} + ). + +-endif. diff --git a/rebar.config b/rebar.config index d2f223f40..91befeea8 100644 --- a/rebar.config +++ b/rebar.config @@ -47,7 +47,8 @@ emqx_exproto_v_1_connection_adapter_bhvr, emqx_exproto_v_1_connection_unary_handler_client, emqx_exhook_v_2_hook_provider_client, - emqx_exhook_v_2_hook_provider_bhvr + emqx_exhook_v_2_hook_provider_bhvr, + 'DurableMessage' ]}, {plt_location, "."}, {plt_prefix, "emqx_dialyzer"}, From de48077ac45d5f924727f284d9ddda5c6e479be2 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Mon, 1 Jul 2024 00:56:26 +0200 Subject: [PATCH 09/21] test(ds): Add new helper functions - Improve message comparison - Add set operations --- apps/emqx/test/emqx_cth_suite.erl | 11 ++++ .../test/emqx_ds_test_helpers.erl | 55 +++++++++++++++++-- 2 files changed, 61 insertions(+), 5 deletions(-) diff --git a/apps/emqx/test/emqx_cth_suite.erl b/apps/emqx/test/emqx_cth_suite.erl index 39e3ebc45..f4fcd0a75 100644 --- a/apps/emqx/test/emqx_cth_suite.erl +++ b/apps/emqx/test/emqx_cth_suite.erl @@ -64,6 +64,7 @@ -export([work_dir/1]). -export([work_dir/2]). +-export([clean_work_dir/1]). -export([load_apps/1]). -export([start_apps/2]). @@ -432,6 +433,16 @@ work_dir(TCName, CTConfig) -> WorkDir = work_dir(CTConfig), filename:join(WorkDir, TCName). +%% @doc Delete contents of the workdir. +clean_work_dir(WorkDir) -> + ct:pal("Cleaning workdir ~p", [WorkDir]), + case re:run(WorkDir, "./_build/test/logs/") of + {match, _} -> + file:del_dir_r(WorkDir); + nomatch -> + error({unsafe_workdir, WorkDir}) + end. + %% start_ekka() -> diff --git a/apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl b/apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl index af41df1ad..fe903fad2 100644 --- a/apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl +++ b/apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl @@ -148,8 +148,7 @@ do_ds_topic_generation_stream(DB, Node, Shard, It0) -> ?ON( Node, begin - Now = emqx_ds_replication_layer:current_timestamp(DB, Shard), - emqx_ds_storage_layer:next(Shard, It0, 1, Now) + emqx_ds_storage_layer:next(Shard, It0, 1, _Now = 1 bsl 63) end ) of @@ -233,15 +232,60 @@ transitions(Node, DB) -> end ). -%% Stream comparison +%% Message comparison + +%% Try to eliminate any ambiguity in the message representation. +message_canonical_form(Msg0 = #message{}) -> + message_canonical_form(emqx_message:to_map(Msg0)); +message_canonical_form(#{flags := Flags0, headers := Headers0, payload := Payload0} = Msg) -> + %% Remove flags that are false: + Flags = maps:filter( + fun(_Key, Val) -> Val end, + Flags0 + ), + Msg#{flags := Flags, payload := iolist_to_binary(Payload0)}. + +sublist(L) -> + PrintMax = 20, + case length(L) of + 0 -> + []; + N when N > PrintMax -> + lists:sublist(L, 1, PrintMax) ++ ['...', N - PrintMax, 'more']; + _ -> + L + end. + +message_set(L) -> + ordsets:from_list([message_canonical_form(I) || I <- L]). + +message_set_subtract(A, B) -> + ordsets:subtract(message_set(A), message_set(B)). 
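message_canonical_form/1 exists so that message comparisons ignore representation details the storage layer is allowed to change: flags that are false are dropped, and iolist payloads are flattened to binaries. A sketch of the intended equivalence (canonical_form_example/0 is illustrative, not part of the patch):

    canonical_form_example() ->
        A = #message{topic = <<"t">>, payload = [<<"a">>, <<"b">>], flags = #{dup => false}},
        B = #message{topic = <<"t">>, payload = <<"ab">>, flags = #{}},
        true = message_canonical_form(A) =:= message_canonical_form(B),
        ok.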
+ +assert_same_set(Expected, Got) -> + assert_same_set(Expected, Got, #{}). + +assert_same_set(Expected, Got, Comment) -> + SE = message_set(Expected), + SG = message_set(Got), + case {ordsets:subtract(SE, SG), ordsets:subtract(SG, SE)} of + {[], []} -> + ok; + {Missing, Unexpected} -> + error(Comment#{ + matching => sublist(ordsets:intersection(SE, SG)), + missing => sublist(Missing), + unexpected => sublist(Unexpected) + }) + end. message_eq(Fields, {_Key, Msg1 = #message{}}, Msg2) -> message_eq(Fields, Msg1, Msg2); message_eq(Fields, Msg1, {_Key, Msg2 = #message{}}) -> message_eq(Fields, Msg1, Msg2); message_eq(Fields, Msg1 = #message{}, Msg2 = #message{}) -> - maps:with(Fields, emqx_message:to_map(Msg1)) =:= - maps:with(Fields, emqx_message:to_map(Msg2)). + maps:with(Fields, message_canonical_form(Msg1)) =:= + maps:with(Fields, message_canonical_form(Msg2)). %% Consuming streams and iterators @@ -304,6 +348,7 @@ ds_topic_stream(DB, ClientId, TopicBin, Node) -> {DBShard, emqx_ds_storage_layer:get_streams(DBShard, Topic, 0)} end ), + ct:pal("Streams for ~p, ~p @ ~p:~n ~p", [ClientId, TopicBin, Node, DSStreams]), %% Sort streams by their rank Y, and chain them together: emqx_utils_stream:chain([ ds_topic_generation_stream(DB, Node, ShardId, Topic, S) From a4642d4d06eef76f63734610e7cdb48aa3bb54ef Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Mon, 1 Jul 2024 01:02:13 +0200 Subject: [PATCH 10/21] feat(ds): Add a new storage layout engine: "skipstream" This layout is based on LTS as well, but it uses separate index streams for constrained replay of streams with learned wildcards --- .../src/emqx_ds_storage_bitfield_lts.erl | 2 +- .../src/emqx_ds_storage_skipstream_lts.erl | 674 ++++++++++++++++++ 2 files changed, 675 insertions(+), 1 deletion(-) create mode 100644 apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl index 182fe8cf0..fb831318e 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl @@ -198,7 +198,7 @@ create(_ShardId, DBHandle, GenId, Options, SPrev) -> case SPrev of #s{trie = TriePrev} -> ok = copy_previous_trie(DBHandle, TrieCFHandle, TriePrev), - ?tp(bitfield_lts_inherited_trie, #{}), + ?tp(layout_inherited_lts_trie, #{}), ok; undefined -> ok diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl new file mode 100644 index 000000000..7b0c49c6f --- /dev/null +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl @@ -0,0 +1,674 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. 
+%%-------------------------------------------------------------------- +-module(emqx_ds_storage_skipstream_lts). + +-behaviour(emqx_ds_storage_layer). + +%% API: +-export([]). + +%% behavior callbacks: +-export([ + create/5, + open/5, + drop/5, + prepare_batch/4, + commit_batch/4, + get_streams/4, + get_delete_streams/4, + make_iterator/5, + make_delete_iterator/5, + update_iterator/4, + next/6, + delete_next/7 +]). + +%% internal exports: +-export([]). + +-export_type([schema/0, s/0]). + +-include_lib("emqx_utils/include/emqx_message.hrl"). +-include_lib("snabbkaffe/include/trace.hrl"). + +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). +-endif. + +-elvis([{elvis_style, nesting_level, disable}]). + +%%================================================================================ +%% Type declarations +%%================================================================================ + +%% keys: +-define(cooked_payloads, 6). +-define(cooked_lts_ops, 7). + +-define(lts_persist_ops, emqx_ds_storage_skipstream_lts_ops). + +%% Width of the wildcard layer, in bits: +-define(wcb, 16). +-type wildcard_idx() :: 0..16#ffff. + +%% Width of the timestamp, in bits: +-define(tsb, 64). +-define(max_ts, 16#ffffffffffffffff). +-type ts() :: 0..?max_ts. + +-type wildcard_hash() :: binary(). + +%% Permanent state: +-type schema() :: + #{ + wildcard_hash_bytes := pos_integer(), + topic_index_bytes := pos_integer(), + keep_message_id := boolean(), + serialization_schema := emqx_ds_msg_serializer:schema(), + with_guid := boolean() + }. + +%% Runtime state: +-record(s, { + db :: rocksdb:db_handle(), + data_cf :: rocksdb:cf_handle(), + trie :: emqx_ds_lts:trie(), + trie_cf :: rocksdb:cf_handle(), + serialization_schema :: emqx_ds_msg_serializer:schema(), + hash_bytes :: pos_integer(), + with_guid :: boolean() +}). + +-type s() :: #s{}. + +-record(stream, { + static_index :: emqx_ds_lts:static_key() +}). + +-record(it, { + static_index :: emqx_ds_lts:static_key(), + ts :: ts(), + compressed_tf :: binary() +}). + +%% Level iterator: +-record(l, { + n :: non_neg_integer(), + handle :: rocksdb:itr_handle(), + hash :: binary() +}). + +%%================================================================================ +%% API functions +%%================================================================================ + +%%================================================================================ +%% behavior callbacks +%%================================================================================ + +create(_ShardId, DBHandle, GenId, Schema0, SPrev) -> + Defaults = #{ + wildcard_hash_bytes => 8, + topic_index_bytes => 8, + serialization_schema => asn1, + with_guid => false + }, + Schema = maps:merge(Defaults, Schema0), + ok = emqx_ds_msg_serializer:check_schema(maps:get(serialization_schema, Schema)), + DataCFName = data_cf(GenId), + TrieCFName = trie_cf(GenId), + {ok, DataCFHandle} = rocksdb:create_column_family(DBHandle, DataCFName, []), + {ok, TrieCFHandle} = rocksdb:create_column_family(DBHandle, TrieCFName, []), + case SPrev of + #s{trie = TriePrev} -> + ok = copy_previous_trie(DBHandle, TrieCFHandle, TriePrev), + ?tp(layout_inherited_lts_trie, #{}), + ok; + undefined -> + ok + end, + {Schema, [{DataCFName, DataCFHandle}, {TrieCFName, TrieCFHandle}]}. 
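The idea of the layout: for every topic structure learned by the LTS trie there is one data stream holding serialized messages, plus one index stream per wildcard level whose entries carry empty values and are keyed by the hash of that level. Iteration can then skip to timestamps present in all relevant index streams instead of scanning the whole data stream. Schematically, for a structure with one wildcard level (a sketch; see mk_key/4 and mk_index/4 further down):

    %% level 0 (data):   {Static, 0, <<>>, TS}          => serialized #message{}
    %% level 1 (index):  {Static, 1, hash(Level1), TS}  => <<>>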
+ +open(_Shard, DBHandle, GenId, CFRefs, #{ + topic_index_bytes := TIBytes, + wildcard_hash_bytes := WCBytes, + serialization_schema := SSchema, + with_guid := WithGuid +}) -> + {_, DataCF} = lists:keyfind(data_cf(GenId), 1, CFRefs), + {_, TrieCF} = lists:keyfind(trie_cf(GenId), 1, CFRefs), + Trie = restore_trie(TIBytes, DBHandle, TrieCF), + #s{ + db = DBHandle, + data_cf = DataCF, + trie_cf = TrieCF, + trie = Trie, + hash_bytes = WCBytes, + serialization_schema = SSchema, + with_guid = WithGuid + }. + +drop(_ShardId, DBHandle, _GenId, _CFRefs, #s{data_cf = DataCF, trie_cf = TrieCF, trie = Trie}) -> + emqx_ds_lts:destroy(Trie), + ok = rocksdb:drop_column_family(DBHandle, DataCF), + ok = rocksdb:drop_column_family(DBHandle, TrieCF), + ok. + +prepare_batch( + _ShardId, + S = #s{trie = Trie, hash_bytes = HashBytes}, + Messages, + _Options +) -> + _ = erase(?lts_persist_ops), + Payloads = + lists:flatmap( + fun({Timestamp, Msg = #message{topic = Topic}}) -> + Tokens = words(Topic), + {Static, Varying} = emqx_ds_lts:topic_key(Trie, fun threshold_fun/1, Tokens), + %% TODO: is it possible to create index during the + %% commit phase to avoid transferring indexes through + %% the translog? + [ + {mk_key(Static, 0, <<>>, Timestamp), serialize(S, Varying, Msg)} + | mk_index(HashBytes, Static, Timestamp, Varying) + ] + end, + Messages + ), + {ok, #{ + ?cooked_payloads => Payloads, + ?cooked_lts_ops => pop_lts_persist_ops() + }}. + +commit_batch( + _ShardId, + #s{db = DB, trie_cf = TrieCF, data_cf = DataCF, trie = Trie}, + #{?cooked_lts_ops := LtsOps, ?cooked_payloads := Payloads}, + Options +) -> + {ok, Batch} = rocksdb:batch(), + try + %% Commit LTS trie to the storage: + lists:foreach( + fun({Key, Val}) -> + ok = rocksdb:batch_put(Batch, TrieCF, term_to_binary(Key), term_to_binary(Val)) + end, + LtsOps + ), + %% Apply LTS ops to the memory cache: + _ = emqx_ds_lts:trie_update(Trie, LtsOps), + %% Commit payloads: + lists:foreach( + fun({Key, Val}) -> + ok = rocksdb:batch_put(Batch, DataCF, Key, Val) + end, + Payloads + ), + Result = rocksdb:write_batch(DB, Batch, [ + {disable_wal, not maps:get(durable, Options, true)} + ]), + %% NOTE + %% Strictly speaking, `{error, incomplete}` is a valid result but should be impossible to + %% observe until there's `{no_slowdown, true}` in write options. + case Result of + ok -> + ok; + {error, {error, Reason}} -> + {error, unrecoverable, {rocksdb, Reason}} + end + after + rocksdb:release_batch(Batch) + end. + +get_streams(_Shard, #s{trie = Trie}, TopicFilter, _StartTime) -> + get_streams(Trie, TopicFilter). + +get_delete_streams(_Shard, #s{trie = Trie}, TopicFilter, _StartTime) -> + get_streams(Trie, TopicFilter). + +make_iterator(_Shard, #s{trie = Trie}, #stream{static_index = StaticIdx}, TopicFilter, StartTime) -> + {ok, TopicStructure} = emqx_ds_lts:reverse_lookup(Trie, StaticIdx), + CompressedTF = emqx_ds_lts:compress_topic(StaticIdx, TopicStructure, TopicFilter), + {ok, #it{ + static_index = StaticIdx, + ts = StartTime, + compressed_tf = emqx_topic:join(CompressedTF) + }}. + +make_delete_iterator(Shard, Data, Stream, TopicFilter, StartTime) -> + make_iterator(Shard, Data, Stream, TopicFilter, StartTime). + +update_iterator(_Shard, _Data, OldIter, DSKey) -> + case match_ds_key(OldIter#it.static_index, DSKey) of + false -> + {error, unrecoverable, "Invalid datastream key"}; + TS -> + {ok, OldIter#it{ts = TS}} + end. 
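Note that iterators keep the topic filter in compressed form: levels fixed by the learned topic structure are dropped, leaving one entry per wildcard position. A sketch of the intent, with hypothetical trie contents:

    %% If the trie learned the structure <<"wildcard/+/suffix/+">>, then the
    %% filter <<"wildcard/+/suffix/foo">> compresses to <<"+/foo">>; only the
    %% two varying levels are matched against what is stored on disk.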
+ +next({_DB, Shard}, S, It, BatchSize, TMax, IsCurrent) -> + Iterators = init_iterators(S, It), + %% ?tp(notice, skipstream_init_iters, #{it => It, its => Iterators}), + try + case next_loop(Shard, S, It, Iterators, BatchSize, TMax) of + {ok, _, []} when not IsCurrent -> + {ok, end_of_stream}; + Result -> + Result + end + after + free_iterators(Iterators) + end. + +delete_next(Shard, S, It0, Selector, BatchSize, Now, IsCurrent) -> + case next(Shard, S, It0, BatchSize, Now, IsCurrent) of + {ok, It, KVs} -> + batch_delete(S, It, Selector, KVs); + Ret -> + Ret + end. + +%%================================================================================ +%% Internal exports +%%================================================================================ + +%%================================================================================ +%% Internal functions +%%================================================================================ + +get_streams(Trie, TopicFilter) -> + lists:map( + fun({Static, _Varying}) -> + #stream{static_index = Static} + end, + emqx_ds_lts:match_topics(Trie, TopicFilter) + ). + +%%%%%%%% Value (de)serialization %%%%%%%%%% + +serialize(#s{serialization_schema = SSchema, with_guid = WithGuid}, Varying, Msg0) -> + %% Replace original topic with the varying parts: + Msg = Msg0#message{ + id = + case WithGuid of + true -> Msg0#message.id; + false -> <<>> + end, + topic = emqx_topic:join(Varying) + }, + emqx_ds_msg_serializer:serialize(SSchema, Msg). + +enrich( + Shard, + #s{trie = Trie, with_guid = WithGuid}, + DSKey, + StaticKey, + Msg0 +) -> + case emqx_ds_lts:reverse_lookup(Trie, StaticKey) of + {ok, Structure} -> + %% Reconstruct the original topic from the static topic + %% index and varying parts: + Topic = emqx_topic:join( + emqx_ds_lts:decompress_topic(Structure, words(Msg0#message.topic)) + ), + Msg0#message{ + topic = Topic, + id = + case WithGuid of + true -> Msg0#message.id; + false -> fake_guid(Shard, DSKey) + end + }; + undefined -> + Err = #{ + msg => "LTS trie missing key", + key => StaticKey + }, + throw({unrecoverable, Err}) + end. + +deserialize( + #s{serialization_schema = SSchema}, + Blob +) -> + emqx_ds_msg_serializer:deserialize(SSchema, Blob). + +fake_guid(_Shard, DSKey) -> + %% Both guid and MD5 are 16 bytes: + crypto:hash(md5, DSKey). + +%%%%%%%% Deletion %%%%%%%%%% + +batch_delete(#s{hash_bytes = HashBytes, db = DB, data_cf = CF}, It, Selector, KVs) -> + #it{static_index = Static, compressed_tf = CompressedTF} = It, + {Indices, _} = lists:foldl( + fun + ('+', {Acc, WildcardIdx}) -> + {Acc, WildcardIdx + 1}; + (LevelFilter, {Acc0, WildcardIdx}) -> + Acc = [{WildcardIdx, hash(HashBytes, LevelFilter)} | Acc0], + {Acc, WildcardIdx + 1} + end, + {[], 1}, + words(CompressedTF) + ), + KeyFamily = [{0, <<>>} | Indices], + {ok, Batch} = rocksdb:batch(), + try + Ndeleted = lists:foldl( + fun({MsgKey, Val}, Acc) -> + case Selector(Val) of + true -> + do_delete(CF, Batch, Static, KeyFamily, MsgKey), + Acc + 1; + false -> + Acc + end + end, + 0, + KVs + ), + case rocksdb:write_batch(DB, Batch, []) of + ok -> + {ok, It, Ndeleted, length(KVs)}; + {error, {error, Reason}} -> + {error, unrecoverable, {rocksdb, Reason}} + end + after + rocksdb:release_batch(Batch) + end. + +do_delete(CF, Batch, Static, KeyFamily, MsgKey) -> + TS = match_ds_key(Static, MsgKey), + lists:foreach( + fun({WildcardIdx, Hash}) -> + ok = rocksdb:batch_delete(Batch, CF, mk_key(Static, WildcardIdx, Hash, TS)) + end, + KeyFamily + ). 
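On disk, every key is the static (trie) index followed by a 16-bit wildcard level, the level hash for index streams, and a 64-bit timestamp (?wcb and ?tsb above). With the default 8-byte static keys and hashes, the two key shapes produced by mk_key/4 (further down) look like this sketch:

    %% data stream:           <<StaticIdx:8/binary, 0:16, Timestamp:64>>
    %% index stream, level N: <<StaticIdx:8/binary, N:16, Hash:8/binary, Timestamp:64>>

Since the timestamp is the key suffix, a RocksDB range scan bounded by get_key_range/3 walks a single stream in time order.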
+ +%%%%%%%% Iteration %%%%%%%%%% + +init_iterators(S, #it{static_index = Static, compressed_tf = CompressedTF}) -> + do_init_iterators(S, Static, words(CompressedTF), 1). + +do_init_iterators(S, Static, ['+' | TopicFilter], WildcardLevel) -> + %% Ignore wildcard levels in the topic filter: + do_init_iterators(S, Static, TopicFilter, WildcardLevel + 1); +do_init_iterators(S, Static, [Constraint | TopicFilter], WildcardLevel) -> + %% Create iterator for the index stream: + #s{hash_bytes = HashBytes, db = DB, data_cf = DataCF} = S, + Hash = hash(HashBytes, Constraint), + {ok, ItHandle} = rocksdb:iterator(DB, DataCF, get_key_range(Static, WildcardLevel, Hash)), + It = #l{ + n = WildcardLevel, + handle = ItHandle, + hash = Hash + }, + [It | do_init_iterators(S, Static, TopicFilter, WildcardLevel + 1)]; +do_init_iterators(S, Static, [], _WildcardLevel) -> + %% Create an iterator for the data stream: + #s{db = DB, data_cf = DataCF} = S, + Hash = <<>>, + {ok, ItHandle} = rocksdb:iterator(DB, DataCF, get_key_range(Static, 0, Hash)), + [ + #l{ + n = 0, + handle = ItHandle, + hash = Hash + } + ]. + +next_loop(Shard, S, It = #it{ts = TS}, Iterators, BatchSize, TMax) -> + next_loop(Shard, S, It, Iterators, BatchSize, TMax, {seek, TS}, []). + +next_loop(_Shard, _S, It, _Iterators, 0, _TMax, Op, Acc) -> + finalize_loop(It, Op, Acc); +next_loop(Shard, S, It0, Iterators, BatchSize, TMax, Op, Acc) -> + %% ?tp(notice, skipstream_loop, #{ + %% ts => It0#it.ts, tf => It0#it.compressed_tf, bs => BatchSize, tmax => TMax, op => Op + %% }), + #it{static_index = StaticIdx, compressed_tf = CompressedTF} = It0, + case next_step(S, StaticIdx, CompressedTF, Iterators, undefined, Op) of + none -> + %% ?tp(notice, skipstream_loop_result, #{r => none}), + finalize_loop(It0, Op, Acc); + {seek, TS} when TS > TMax -> + %% ?tp(notice, skipstream_loop_result, #{r => seek_future, ts => TS}), + finalize_loop(It0, {seek, TS}, Acc); + {ok, TS, _Key, _Msg0} when TS > TMax -> + %% ?tp(notice, skipstream_loop_result, #{r => ok_future, ts => TS, key => _Key}), + finalize_loop(It0, {seek, TS}, Acc); + {seek, TS} -> + %% ?tp(notice, skipstream_loop_result, #{r => seek, ts => TS}), + It = It0#it{ts = TS}, + next_loop(Shard, S, It, Iterators, BatchSize, TMax, {seek, TS}, Acc); + {ok, TS, DSKey, Msg0} -> + %% ?tp(notice, skipstream_loop_result, #{r => ok, ts => TS, key => Key}), + Message = enrich(Shard, S, DSKey, StaticIdx, Msg0), + It = It0#it{ts = TS}, + next_loop(Shard, S, It, Iterators, BatchSize - 1, TMax, next, [{DSKey, Message} | Acc]) + end. + +finalize_loop(It0, Op, Acc) -> + case Op of + next -> NextTS = It0#it.ts + 1; + {seek, NextTS} -> ok + end, + It = It0#it{ts = NextTS}, + {ok, It, lists:reverse(Acc)}. + +next_step( + S, StaticIdx, CompressedTF, [#l{hash = Hash, handle = IH, n = N} | Iterators], ExpectedTS, Op +) -> + Result = + case Op of + next -> + rocksdb:iterator_move(IH, next); + {seek, TS} -> + rocksdb:iterator_move(IH, {seek, mk_key(StaticIdx, N, Hash, TS)}) + end, + case Result of + {error, invalid_iterator} -> + none; + {ok, Key, Blob} -> + case match_key(StaticIdx, N, Hash, Key) of + false -> + %% This should not happen, since we set boundaries + %% to the iterators, and overflow to a different + %% key prefix should be caught by the previous + %% clause: + none; + NextTS when ExpectedTS =:= undefined; NextTS =:= ExpectedTS -> + %% We found a key that corresponds to the + %% timestamp we expect. 
+                    %% ?tp(notice, ?MODULE_STRING "_step_hit", #{
+                    %%     next_ts => NextTS, expected => ExpectedTS, n => N
+                    %% }),
+                    case Iterators of
+                        [] ->
+                            %% This is data stream as well. Check
+                            %% message for hash collisions and return
+                            %% value:
+                            Msg0 = deserialize(S, Blob),
+                            case emqx_topic:match(Msg0#message.topic, CompressedTF) of
+                                true ->
+                                    {ok, NextTS, Key, Msg0};
+                                false ->
+                                    %% Hash collision. Advance to the
+                                    %% next timestamp:
+                                    {seek, NextTS + 1}
+                            end;
+                        _ ->
+                            %% This is index stream. Keep going:
+                            next_step(S, StaticIdx, CompressedTF, Iterators, NextTS, {seek, NextTS})
+                    end;
+                NextTS when NextTS > ExpectedTS, N > 0 ->
+                    %% Next index level is not what we expect.
+                    {seek, NextTS}
+            end
+    end.
+
+free_iterators(Its) ->
+    lists:foreach(
+        fun(#l{handle = IH}) ->
+            ok = rocksdb:iterator_close(IH)
+        end,
+        Its
+    ).
+
+%%%%%%%% Indexes %%%%%%%%%%
+
+mk_index(HashBytes, Static, Timestamp, Varying) ->
+    mk_index(HashBytes, Static, Timestamp, 1, Varying, []).
+
+mk_index(HashBytes, Static, Timestamp, N, [TopicLevel | Varying], Acc) ->
+    Op = {mk_key(Static, N, hash(HashBytes, TopicLevel), Timestamp), <<>>},
+    mk_index(HashBytes, Static, Timestamp, N + 1, Varying, [Op | Acc]);
+mk_index(_HashBytes, _Static, _Timestamp, _N, [], Acc) ->
+    Acc.
+
+%%%%%%%% Keys %%%%%%%%%%
+
+get_key_range(StaticIdx, WildcardIdx, Hash) ->
+    [
+        {iterate_lower_bound, mk_key(StaticIdx, WildcardIdx, Hash, 0)},
+        {iterate_upper_bound, mk_key(StaticIdx, WildcardIdx, Hash, ?max_ts)}
+    ].
+
+-spec match_ds_key(emqx_ds_lts:static_key(), binary()) -> ts() | false.
+match_ds_key(StaticIdx, Key) ->
+    match_key(StaticIdx, 0, <<>>, Key).
+
+-spec match_key(emqx_ds_lts:static_key(), wildcard_idx(), wildcard_hash(), binary()) ->
+    ts() | false.
+match_key(StaticIdx, 0, <<>>, Key) ->
+    TSz = size(StaticIdx),
+    case Key of
+        <<StaticIdx:TSz/binary, 0:?wcb, Timestamp:?tsb>> ->
+            Timestamp;
+        _ ->
+            false
+    end;
+match_key(StaticIdx, Idx, Hash, Key) when Idx > 0 ->
+    Tsz = size(StaticIdx),
+    Hsz = size(Hash),
+    case Key of
+        <<StaticIdx:Tsz/binary, Idx:?wcb, Hash:Hsz/binary, Timestamp:?tsb>> ->
+            Timestamp;
+        _ ->
+            false
+    end.
+
+-spec mk_key(emqx_ds_lts:static_key(), wildcard_idx(), wildcard_hash(), ts()) -> binary().
+mk_key(StaticIdx, 0, <<>>, Timestamp) ->
+    %% Data stream is identified by wildcard level = 0
+    <<StaticIdx/binary, 0:?wcb, Timestamp:?tsb>>;
+mk_key(StaticIdx, N, Hash, Timestamp) when N > 0 ->
+    %% Index stream:
+    <<StaticIdx/binary, N:?wcb, Hash/binary, Timestamp:?tsb>>.
+
+hash(HashBytes, '') ->
+    hash(HashBytes, <<>>);
+hash(HashBytes, TopicLevel) ->
+    {Hash, _} = split_binary(erlang:md5(TopicLevel), HashBytes),
+    Hash.
+
+%%%%%%%% LTS %%%%%%%%%%
+
+%% TODO: don't hardcode the thresholds
+threshold_fun(0) ->
+    100;
+threshold_fun(_) ->
+    10.
+
+-spec restore_trie(pos_integer(), rocksdb:db_handle(), rocksdb:cf_handle()) -> emqx_ds_lts:trie().
+restore_trie(StaticIdxBytes, DB, CF) ->
+    PersistCallback = fun(Key, Val) ->
+        push_lts_persist_op(Key, Val),
+        ok
+    end,
+    {ok, IT} = rocksdb:iterator(DB, CF, []),
+    try
+        Dump = read_persisted_trie(IT, rocksdb:iterator_move(IT, first)),
+        TrieOpts = #{
+            persist_callback => PersistCallback,
+            static_key_bytes => StaticIdxBytes,
+            reverse_lookups => true
+        },
+        emqx_ds_lts:trie_restore(TrieOpts, Dump)
+    after
+        rocksdb:iterator_close(IT)
+    end.
+
+-spec copy_previous_trie(rocksdb:db_handle(), rocksdb:cf_handle(), emqx_ds_lts:trie()) ->
+    ok.
+copy_previous_trie(DB, TrieCF, TriePrev) -> + {ok, Batch} = rocksdb:batch(), + lists:foreach( + fun({Key, Val}) -> + ok = rocksdb:batch_put(Batch, TrieCF, term_to_binary(Key), term_to_binary(Val)) + end, + emqx_ds_lts:trie_dump(TriePrev, wildcard) + ), + Result = rocksdb:write_batch(DB, Batch, []), + rocksdb:release_batch(Batch), + Result. + +push_lts_persist_op(Key, Val) -> + case erlang:get(?lts_persist_ops) of + undefined -> + erlang:put(?lts_persist_ops, [{Key, Val}]); + L when is_list(L) -> + erlang:put(?lts_persist_ops, [{Key, Val} | L]) + end. + +pop_lts_persist_ops() -> + case erlang:erase(?lts_persist_ops) of + undefined -> + []; + L when is_list(L) -> + L + end. + +read_persisted_trie(IT, {ok, KeyB, ValB}) -> + [ + {binary_to_term(KeyB), binary_to_term(ValB)} + | read_persisted_trie(IT, rocksdb:iterator_move(IT, next)) + ]; +read_persisted_trie(_IT, {error, invalid_iterator}) -> + []. + +%%%%%%%% Column families %%%%%%%%%% + +%% @doc Generate a column family ID for the MQTT messages +-spec data_cf(emqx_ds_storage_layer:gen_id()) -> [char()]. +data_cf(GenId) -> + "emqx_ds_storage_skipstream_lts_data" ++ integer_to_list(GenId). + +%% @doc Generate a column family ID for the trie +-spec trie_cf(emqx_ds_storage_layer:gen_id()) -> [char()]. +trie_cf(GenId) -> + "emqx_ds_storage_skipstream_lts_trie" ++ integer_to_list(GenId). + +%%%%%%%% Topic encoding %%%%%%%%%% + +words(<<>>) -> + []; +words(Bin) -> + emqx_topic:words(Bin). From 086e7256f54078272539d66870ca7c406863bdee Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Mon, 1 Jul 2024 01:10:16 +0200 Subject: [PATCH 11/21] feat(ds): Add configuration schema for skipstream LTS layout --- apps/emqx/src/emqx_ds_schema.erl | 60 ++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/apps/emqx/src/emqx_ds_schema.erl b/apps/emqx/src/emqx_ds_schema.erl index 1cda81d1d..df50e332e 100644 --- a/apps/emqx/src/emqx_ds_schema.erl +++ b/apps/emqx/src/emqx_ds_schema.erl @@ -234,6 +234,42 @@ fields(layout_builtin_wildcard_optimized) -> } )} ]; +fields(layout_builtin_wildcard_optimized_v2) -> + [ + {type, + sc( + wildcard_optimized_v2, + #{ + 'readOnly' => true, + default => wildcard_optimized_v2, + desc => ?DESC(layout_builtin_wildcard_optimized_type) + } + )}, + {bytes_per_topic_level, + sc( + range(1, 16), + #{ + default => 8, + importance => ?IMPORTANCE_HIDDEN + } + )}, + {topic_index_bytes, + sc( + pos_integer(), + #{ + default => 8, + importance => ?IMPORTANCE_HIDDEN + } + )}, + {serialization_schema, + sc( + emqx_ds_msg_serializer:schema(), + #{ + default => v1, + importance => ?IMPORTANCE_HIDDEN + } + )} + ]; fields(layout_builtin_reference) -> [ {type, @@ -242,6 +278,7 @@ fields(layout_builtin_reference) -> #{ 'readOnly' => true, importance => ?IMPORTANCE_LOW, + default => reference, desc => ?DESC(layout_builtin_reference_type) } )} @@ -284,7 +321,7 @@ common_builtin_fields() -> importance => ?IMPORTANCE_MEDIUM, default => #{ - <<"type">> => wildcard_optimized + <<"type">> => wildcard_optimized_v2 } } )} @@ -298,6 +335,8 @@ desc(builtin_write_buffer) -> ?DESC(builtin_write_buffer); desc(layout_builtin_wildcard_optimized) -> ?DESC(layout_builtin_wildcard_optimized); +desc(layout_builtin_wildcard_optimized_v2) -> + ?DESC(layout_builtin_wildcard_optimized); desc(layout_builtin_reference) -> ?DESC(layout_builtin_reference); desc(_) -> @@ -307,6 +346,19 @@ desc(_) -> %% Internal functions %%================================================================================ 
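translate_layout/1 below maps the schema-level fields onto the options expected by the storage module; with the defaults shown above, a layout of type wildcard_optimized_v2 translates roughly to this sketch:

    {emqx_ds_storage_skipstream_lts, #{
        wildcard_hash_bytes => 8,
        topic_index_bytes => 8,
        serialization_schema => v1
    }}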
+translate_layout( + #{ + type := wildcard_optimized_v2, + bytes_per_topic_level := BytesPerTopicLevel, + topic_index_bytes := TopicIndexBytes, + serialization_schema := SSchema + } +) -> + {emqx_ds_storage_skipstream_lts, #{ + wildcard_hash_bytes => BytesPerTopicLevel, + topic_index_bytes => TopicIndexBytes, + serialization_schema => SSchema + }}; translate_layout( #{ type := wildcard_optimized, @@ -336,7 +388,11 @@ builtin_layouts() -> %% suitable for production use. However, it's very simple and %% produces a very predictabale replay order, which can be useful %% for testing and debugging: - [ref(layout_builtin_wildcard_optimized), ref(layout_builtin_reference)]. + [ + ref(layout_builtin_wildcard_optimized_v2), + ref(layout_builtin_wildcard_optimized), + ref(layout_builtin_reference) + ]. sc(Type, Meta) -> hoconsc:mk(Type, Meta). From 8c5e4a237601ce5de17448b412080ce81fe60618 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Mon, 1 Jul 2024 00:55:41 +0200 Subject: [PATCH 12/21] test(ds): Generalize storage layout test suite for different layouts --- ...E.erl => emqx_ds_storage_layout_SUITE.erl} | 140 ++++++++++-------- 1 file changed, 78 insertions(+), 62 deletions(-) rename apps/emqx_durable_storage/test/{emqx_ds_storage_bitfield_lts_SUITE.erl => emqx_ds_storage_layout_SUITE.erl} (85%) diff --git a/apps/emqx_durable_storage/test/emqx_ds_storage_bitfield_lts_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_storage_layout_SUITE.erl similarity index 85% rename from apps/emqx_durable_storage/test/emqx_ds_storage_bitfield_lts_SUITE.erl rename to apps/emqx_durable_storage/test/emqx_ds_storage_layout_SUITE.erl index 11b6109e8..e0531dad0 100644 --- a/apps/emqx_durable_storage/test/emqx_ds_storage_bitfield_lts_SUITE.erl +++ b/apps/emqx_durable_storage/test/emqx_ds_storage_layout_SUITE.erl @@ -13,7 +13,7 @@ %% See the License for the specific language governing permissions and %% limitations under the License. %%-------------------------------------------------------------------- --module(emqx_ds_storage_bitfield_lts_SUITE). +-module(emqx_ds_storage_layout_SUITE). -compile(export_all). -compile(nowarn_export_all). @@ -23,23 +23,34 @@ -include_lib("snabbkaffe/include/snabbkaffe.hrl"). -include_lib("stdlib/include/assert.hrl"). +-define(FUTURE, (1 bsl 64 - 1)). + -define(SHARD, shard(?FUNCTION_NAME)). --define(DEFAULT_CONFIG, #{ +-define(DB_CONFIG(CONFIG), #{ backend => builtin_local, - storage => {emqx_ds_storage_bitfield_lts, #{}}, + storage => ?config(layout, CONFIG), n_shards => 1 }). --define(COMPACT_CONFIG, #{ - backend => builtin_local, - storage => - {emqx_ds_storage_bitfield_lts, #{ - bits_per_wildcard_level => 8 - }}, - n_shards => 1, - replication_factor => 1 -}). +all() -> + [ + {group, bitfield_lts}, + {group, skipstream_lts} + ]. + +init_per_group(Group, Config) -> + LayoutConf = + case Group of + skipstream_lts -> + {emqx_ds_storage_skipstream_lts, #{with_guid => true}}; + bitfield_lts -> + {emqx_ds_storage_bitfield_lts, #{}} + end, + [{layout, LayoutConf} | Config]. + +end_per_group(_Group, Config) -> + Config. %% Smoke test of store function t_store(_Config) -> @@ -53,7 +64,7 @@ t_store(_Config) -> payload = Payload, timestamp = PublishedAt }, - ?assertMatch(ok, emqx_ds_storage_layer:store_batch(?SHARD, [{PublishedAt, Msg}], #{})). + ?assertMatch(ok, emqx_ds:store_batch(?FUNCTION_NAME, [Msg])). 
%% Smoke test for iteration through a concrete topic t_iterate(_Config) -> @@ -61,15 +72,17 @@ t_iterate(_Config) -> Topics = [<<"foo/bar">>, <<"foo/bar/baz">>, <<"a">>], Timestamps = lists:seq(1, 10), Batch = [ - {PublishedAt, make_message(PublishedAt, Topic, integer_to_binary(PublishedAt))} + make_message(PublishedAt, Topic, integer_to_binary(PublishedAt)) || Topic <- Topics, PublishedAt <- Timestamps ], - ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch, #{}), + ok = emqx_ds:store_batch(?FUNCTION_NAME, Batch), %% Iterate through individual topics: [ begin - [{_Rank, Stream}] = emqx_ds_storage_layer:get_streams(?SHARD, parse_topic(Topic), 0), + [{Rank, Stream}] = emqx_ds_storage_layer:get_streams(?SHARD, parse_topic(Topic), 0), + ct:pal("Streams for ~p: {~p, ~p}", [Topic, Rank, Stream]), {ok, It} = emqx_ds_storage_layer:make_iterator(?SHARD, Stream, parse_topic(Topic), 0), + ct:pal("Iterator for ~p: ~p", [Topic, It]), {ok, NextIt, MessagesAndKeys} = emqx_ds_storage_layer:next( ?SHARD, It, 100, emqx_ds:timestamp_us() ), @@ -91,10 +104,10 @@ t_delete(_Config) -> Topics = [<<"foo/bar">>, TopicToDelete, <<"a">>], Timestamps = lists:seq(1, 10), Batch = [ - {PublishedAt, make_message(PublishedAt, Topic, integer_to_binary(PublishedAt))} + make_message(PublishedAt, Topic, integer_to_binary(PublishedAt)) || Topic <- Topics, PublishedAt <- Timestamps ], - ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch, #{}), + ok = emqx_ds:store_batch(?FUNCTION_NAME, Batch), %% Iterate through topics: StartTime = 0, @@ -109,23 +122,21 @@ t_delete(_Config) -> Messages = [Msg || {_DSKey, Msg} <- replay(?SHARD, TopicFilter, StartTime)], MessagesByTopic = maps:groups_from_list(fun emqx_message:topic/1, Messages), ?assertNot(is_map_key(TopicToDelete, MessagesByTopic), #{msgs => MessagesByTopic}), - ?assertEqual(20, length(Messages)), - - ok. + ?assertEqual(20, length(Messages)). -define(assertSameSet(A, B), ?assertEqual(lists:sort(A), lists:sort(B))). %% Smoke test that verifies that concrete topics are mapped to %% individual streams, unless there's too many of them. 
-t_get_streams(_Config) -> +t_get_streams(Config) -> %% Prepare data (without wildcards): Topics = [<<"foo/bar">>, <<"foo/bar/baz">>, <<"a">>], Timestamps = lists:seq(1, 10), Batch = [ - {PublishedAt, make_message(PublishedAt, Topic, integer_to_binary(PublishedAt))} + make_message(PublishedAt, Topic, integer_to_binary(PublishedAt)) || Topic <- Topics, PublishedAt <- Timestamps ], - ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch, #{}), + ok = emqx_ds:store_batch(?FUNCTION_NAME, Batch), GetStream = fun(Topic) -> StartTime = 0, emqx_ds_storage_layer:get_streams(?SHARD, parse_topic(Topic), StartTime) @@ -136,7 +147,7 @@ t_get_streams(_Config) -> [A] = GetStream(<<"a">>), %% Restart shard to make sure trie is persisted and restored: ok = emqx_ds:close_db(?FUNCTION_NAME), - ok = emqx_ds:open_db(?FUNCTION_NAME, ?DEFAULT_CONFIG), + ok = emqx_ds:open_db(?FUNCTION_NAME, ?DB_CONFIG(Config)), %% Verify that there are no "ghost streams" for topics that don't %% have any messages: [] = GetStream(<<"bar/foo">>), @@ -148,11 +159,11 @@ t_get_streams(_Config) -> NewBatch = [ begin B = integer_to_binary(I), - {100, make_message(100, <<"foo/bar/", B/binary>>, <<"filler", B/binary>>)} + make_message(100, <<"foo/bar/", B/binary>>, <<"filler", B/binary>>) end || I <- lists:seq(1, 200) ], - ok = emqx_ds_storage_layer:store_batch(?SHARD, NewBatch, #{}), + ok = emqx_ds:store_batch(?FUNCTION_NAME, NewBatch), %% Check that "foo/bar/baz" topic now appears in two streams: %% "foo/bar/baz" and "foo/bar/+": NewStreams = lists:sort(GetStream("foo/bar/baz")), @@ -168,7 +179,7 @@ t_get_streams(_Config) -> ?assert(lists:member(A, AllStreams)), ok. -t_new_generation_inherit_trie(_Config) -> +t_new_generation_inherit_trie(Config) -> %% This test checks that we inherit the previous generation's LTS when creating a new %% generation. ?check_trace( @@ -176,25 +187,25 @@ t_new_generation_inherit_trie(_Config) -> %% Create a bunch of topics to be learned in the first generation TS1 = 500, Batch1 = [ - {TS1, make_message(TS1, make_topic([wildcard, I, suffix, Suffix]), bin(I))} + make_message(TS1, make_topic([wildcard, I, suffix, Suffix]), bin(I)) || I <- lists:seq(1, 200), Suffix <- [<<"foo">>, <<"bar">>] ], - ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch1, #{}), + ok = emqx_ds:store_batch(?FUNCTION_NAME, Batch1), %% Now we create a new generation with the same LTS module. It should inherit the %% learned trie. ok = emqx_ds_storage_layer:add_generation(?SHARD, _Since = 1_000), %% Restart the shard, to verify that LTS is persisted. ok = emqx_ds:close_db(?FUNCTION_NAME), - ok = emqx_ds:open_db(?FUNCTION_NAME, ?DEFAULT_CONFIG), + ok = emqx_ds:open_db(?FUNCTION_NAME, ?DB_CONFIG(Config)), %% Store a batch of messages with the same set of topics. TS2 = 1_500, Batch2 = [ - {TS2, make_message(TS2, make_topic([wildcard, I, suffix, Suffix]), bin(I))} + make_message(TS2, make_topic([wildcard, I, suffix, Suffix]), bin(I)) || I <- lists:seq(1, 200), Suffix <- [<<"foo">>, <<"bar">>] ], - ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch2, #{}), + ok = emqx_ds:store_batch(?FUNCTION_NAME, Batch2), %% We should get only two streams for wildcard query, for "foo" and for "bar". ?assertMatch( [_Foo, _Bar], @@ -203,29 +214,30 @@ t_new_generation_inherit_trie(_Config) -> ok end, fun(Trace) -> - ?assertMatch([_], ?of_kind(bitfield_lts_inherited_trie, Trace)), + ?assertMatch([_], ?of_kind(layout_inherited_lts_trie, Trace)), ok end ), ok. 
-t_replay(_Config) -> +t_replay(Config) -> %% Create concrete topics: Topics = [<<"foo/bar">>, <<"foo/bar/baz">>], - Timestamps = lists:seq(1, 10_000, 100), + Values = lists:seq(1, 1_000, 100), Batch1 = [ - {PublishedAt, make_message(PublishedAt, Topic, integer_to_binary(PublishedAt))} - || Topic <- Topics, PublishedAt <- Timestamps + make_message(Val, Topic, bin(Val)) + || Topic <- Topics, Val <- Values ], - ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch1, #{}), + ok = emqx_ds:store_batch(?FUNCTION_NAME, Batch1), %% Create wildcard topics `wildcard/+/suffix/foo' and `wildcard/+/suffix/bar': Batch2 = [ - {TS, make_message(TS, make_topic([wildcard, I, suffix, Suffix]), bin(TS))} - || I <- lists:seq(1, 200), TS <- Timestamps, Suffix <- [<<"foo">>, <<"bar">>] + make_message(Val, make_topic([wildcard, Prefix, suffix, Suffix]), bin(Val)) + || Prefix <- lists:seq(1, 200), Val <- Values, Suffix <- [<<"foo">>, <<"bar">>] ], - ok = emqx_ds_storage_layer:store_batch(?SHARD, Batch2, #{}), + ok = emqx_ds:store_batch(?FUNCTION_NAME, Batch2), + timer:sleep(5_000), %% Check various topic filters: - Messages = [M || {_TS, M} <- Batch1 ++ Batch2], + Messages = Batch1 ++ Batch2, %% Missing topics (no ghost messages): ?assertNot(check(?SHARD, <<"missing/foo/bar">>, 0, Messages)), %% Regular topics: @@ -238,7 +250,7 @@ t_replay(_Config) -> ?assert(check(?SHARD, <<"+/+/baz">>, 0, Messages)), %% Restart the DB to make sure trie is persisted and restored: ok = emqx_ds:close_db(?FUNCTION_NAME), - ok = emqx_ds:open_db(?FUNCTION_NAME, ?DEFAULT_CONFIG), + ok = emqx_ds:open_db(?FUNCTION_NAME, ?DB_CONFIG(Config)), %% Learned wildcard topics: ?assertNot(check(?SHARD, <<"wildcard/1000/suffix/foo">>, 0, [])), ?assert(check(?SHARD, <<"wildcard/1/suffix/foo">>, 0, Messages)), @@ -314,6 +326,9 @@ t_non_atomic_store_batch(_Config) -> ). check(Shard, TopicFilter, StartTime, ExpectedMessages) -> + ?tp(notice, ?MODULE_STRING "_check", #{ + shard => Shard, tf => TopicFilter, start_time => StartTime + }), ExpectedFiltered = lists:filter( fun(#message{topic = Topic, timestamp = TS}) -> emqx_topic:match(Topic, TopicFilter) andalso TS >= StartTime @@ -325,17 +340,9 @@ check(Shard, TopicFilter, StartTime, ExpectedMessages) -> begin Dump = dump_messages(Shard, TopicFilter, StartTime), verify_dump(TopicFilter, StartTime, Dump), - Missing = ExpectedFiltered -- Dump, - Extras = Dump -- ExpectedFiltered, - ?assertMatch( - #{missing := [], unexpected := []}, - #{ - missing => Missing, - unexpected => Extras, - topic_filter => TopicFilter, - start_time => StartTime - } - ) + emqx_ds_test_helpers:assert_same_set(ExpectedFiltered, Dump, #{ + topic_filter => TopicFilter, start_time => StartTime + }) end, [] ), @@ -362,6 +369,7 @@ verify_dump(TopicFilter, StartTime, Dump) -> dump_messages(Shard, TopicFilter, StartTime) -> Streams = emqx_ds_storage_layer:get_streams(Shard, parse_topic(TopicFilter), StartTime), + ct:pal("Streams for ~p:~n ~p", [TopicFilter, Streams]), lists:flatmap( fun({_Rank, Stream}) -> dump_stream(Shard, Stream, TopicFilter, StartTime) @@ -374,6 +382,7 @@ dump_stream(Shard, Stream, TopicFilter, StartTime) -> {ok, Iterator} = emqx_ds_storage_layer:make_iterator( Shard, Stream, parse_topic(TopicFilter), StartTime ), + ct:pal("Iterator for ~p at stream ~p:~n ~p", [TopicFilter, Stream, Iterator]), Loop = fun F(It, 0) -> error({too_many_iterations, It}); @@ -502,24 +511,31 @@ bin(X) -> %% CT callbacks -all() -> emqx_common_test_helpers:all(?MODULE). 
+groups() -> + TCs = emqx_common_test_helpers:all(?MODULE), + [ + {bitfield_lts, TCs}, + {skipstream_lts, TCs} + ]. + suite() -> [{timetrap, {seconds, 20}}]. init_per_suite(Config) -> - emqx_common_test_helpers:clear_screen(), + WorkDir = emqx_cth_suite:work_dir(Config), Apps = emqx_cth_suite:start( [emqx_ds_builtin_local], - #{work_dir => emqx_cth_suite:work_dir(Config)} + #{work_dir => WorkDir} ), - [{apps, Apps} | Config]. + [{apps, Apps}, {work_dir, WorkDir} | Config]. end_per_suite(Config) -> Apps = ?config(apps, Config), ok = emqx_cth_suite:stop(Apps), + emqx_cth_suite:clean_work_dir(?config(work_dir, Config)), ok. init_per_testcase(TC, Config) -> - ok = emqx_ds:open_db(TC, ?DEFAULT_CONFIG), + ok = emqx_ds:open_db(TC, ?DB_CONFIG(Config)), Config. end_per_testcase(TC, _Config) -> @@ -558,7 +574,7 @@ delete(Shard, Iterators, Selector) -> fun(Iterator0, {AccIterators, NAcc}) -> case emqx_ds_storage_layer:delete_next( - Shard, Iterator0, Selector, 10, emqx_ds:timestamp_us() + Shard, Iterator0, Selector, 10, ?FUTURE ) of {ok, end_of_stream} -> @@ -591,7 +607,7 @@ replay(_Shard, []) -> replay(Shard, Iterators) -> {NewIterators0, Messages0} = lists:foldl( fun(Iterator0, {AccIterators, AccMessages}) -> - case emqx_ds_storage_layer:next(Shard, Iterator0, 10, emqx_ds:timestamp_us()) of + case emqx_ds_storage_layer:next(Shard, Iterator0, 10, ?FUTURE) of {ok, end_of_stream} -> {AccIterators, AccMessages}; {ok, _Iterator1, []} -> From b68ebb9a733cd35aadd50d38f66a6ea3843e2c16 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Mon, 1 Jul 2024 01:05:10 +0200 Subject: [PATCH 13/21] test(dsrepl): Generalize tests to use different storage layouts --- .../test/emqx_ds_backends_SUITE.erl | 3 + .../test/emqx_ds_replication_SUITE.erl | 118 ++++++++++++------ 2 files changed, 80 insertions(+), 41 deletions(-) diff --git a/apps/emqx_ds_backends/test/emqx_ds_backends_SUITE.erl b/apps/emqx_ds_backends/test/emqx_ds_backends_SUITE.erl index ef99a0376..1fc1594fc 100644 --- a/apps/emqx_ds_backends/test/emqx_ds_backends_SUITE.erl +++ b/apps/emqx_ds_backends/test/emqx_ds_backends_SUITE.erl @@ -599,6 +599,9 @@ init_per_suite(Config) -> end_per_suite(_Config) -> ok. +suite() -> + [{timetrap, 50_000}]. + init_per_testcase(TC, Config) -> Apps = emqx_cth_suite:start( [emqx_durable_storage, emqx_ds_backends], diff --git a/apps/emqx_ds_builtin_raft/test/emqx_ds_replication_SUITE.erl b/apps/emqx_ds_builtin_raft/test/emqx_ds_replication_SUITE.erl index 068842709..92ceb2e16 100644 --- a/apps/emqx_ds_builtin_raft/test/emqx_ds_replication_SUITE.erl +++ b/apps/emqx_ds_builtin_raft/test/emqx_ds_replication_SUITE.erl @@ -29,15 +29,12 @@ emqx_ds_test_helpers:on(NODES, fun() -> BODY end) ). -opts() -> - opts(#{}). 
- -opts(Overrides) -> +opts(Config, Overrides) -> + Layout = ?config(layout, Config), maps:merge( #{ backend => builtin_raft, - %% storage => {emqx_ds_storage_reference, #{}}, - storage => {emqx_ds_storage_bitfield_lts, #{epoch_bits => 10}}, + storage => Layout, n_shards => 16, n_sites => 1, replication_factor => 3, @@ -58,7 +55,7 @@ appspec(emqx_durable_storage) -> t_metadata(init, Config) -> Apps = emqx_cth_suite:start([emqx_ds_builtin_raft], #{ - work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config) + work_dir => ?config(work_dir, Config) }), [{apps, Apps} | Config]; t_metadata('end', Config) -> @@ -108,7 +105,7 @@ t_replication_transfers_snapshots(init, Config) -> {t_replication_transfers_snapshots2, #{apps => Apps}}, {t_replication_transfers_snapshots3, #{apps => Apps}} ], - #{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)} + #{work_dir => ?config(work_dir, Config)} ), Nodes = emqx_cth_cluster:start(NodeSpecs), [{nodes, Nodes}, {specs, NodeSpecs} | Config]; @@ -125,9 +122,10 @@ t_replication_transfers_snapshots(Config) -> Nodes = [Node, NodeOffline | _] = ?config(nodes, Config), _Specs = [_, SpecOffline | _] = ?config(specs, Config), ?check_trace( + #{timetrap => 30_000}, begin %% Initialize DB on all nodes and wait for it to be online. - Opts = opts(#{n_shards => 1, n_sites => 3}), + Opts = opts(Config, #{n_shards => 1, n_sites => 3}), ?assertEqual( [{ok, ok} || _ <- Nodes], erpc:multicall(Nodes, emqx_ds, open_db, [?DB, Opts]) @@ -139,8 +137,11 @@ t_replication_transfers_snapshots(Config) -> ), %% Stop the DB on the "offline" node. - ok = emqx_cth_cluster:stop_node(NodeOffline), - _ = ?block_until(#{?snk_kind := ds_ra_state_enter, state := leader}, 500, 0), + ?wait_async_action( + ok = emqx_cth_cluster:stop_node(NodeOffline), + #{?snk_kind := ds_ra_state_enter, state := leader}, + 5_000 + ), %% Fill the storage with messages and few additional generations. emqx_ds_test_helpers:apply_stream(?DB, Nodes -- [NodeOffline], Stream), @@ -153,9 +154,10 @@ t_replication_transfers_snapshots(Config) -> ?snk_meta := #{node := NodeOffline} }) ), - ?assertEqual( - ok, - erpc:call(NodeOffline, emqx_ds, open_db, [?DB, opts()]) + + ok = ?ON( + NodeOffline, + emqx_ds:open_db(?DB, opts(Config, #{})) ), %% Trigger storage operation and wait the replica to be restored. @@ -183,7 +185,7 @@ t_rebalance(init, Config) -> {t_rebalance3, #{apps => Apps}}, {t_rebalance4, #{apps => Apps}} ], - #{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)} + #{work_dir => ?config(work_dir, Config)} ), [{nodes, Nodes} | Config]; t_rebalance('end', Config) -> @@ -206,7 +208,7 @@ t_rebalance(Config) -> begin Sites = [S1, S2 | _] = [ds_repl_meta(N, this_site) || N <- Nodes], %% 1. Initialize DB on the first node. - Opts = opts(#{n_shards => 16, n_sites => 1, replication_factor => 3}), + Opts = opts(Config, #{n_shards => 16, n_sites => 1, replication_factor => 3}), [ ?assertEqual(ok, ?ON(Node, emqx_ds:open_db(?DB, Opts))) || Node <- Nodes @@ -316,7 +318,7 @@ t_join_leave_errors(init, Config) -> {t_join_leave_errors1, #{apps => Apps}}, {t_join_leave_errors2, #{apps => Apps}} ], - #{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)} + #{work_dir => ?config(work_dir, Config)} ), [{nodes, Nodes} | Config]; t_join_leave_errors('end', Config) -> @@ -327,7 +329,7 @@ t_join_leave_errors(Config) -> %% join/leave operations are reported correctly. 
[N1, N2] = ?config(nodes, Config), - Opts = opts(#{n_shards => 16, n_sites => 1, replication_factor => 3}), + Opts = opts(Config, #{n_shards => 16, n_sites => 1, replication_factor => 3}), ?assertEqual(ok, erpc:call(N1, emqx_ds, open_db, [?FUNCTION_NAME, Opts])), ?assertEqual(ok, erpc:call(N2, emqx_ds, open_db, [?FUNCTION_NAME, Opts])), @@ -385,7 +387,7 @@ t_rebalance_chaotic_converges(init, Config) -> {t_rebalance_chaotic_converges2, #{apps => Apps}}, {t_rebalance_chaotic_converges3, #{apps => Apps}} ], - #{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)} + #{work_dir => ?config(work_dir, Config)} ), [{nodes, Nodes} | Config]; t_rebalance_chaotic_converges('end', Config) -> @@ -411,7 +413,7 @@ t_rebalance_chaotic_converges(Config) -> ct:pal("Sites: ~p~n", [Sites]), %% Initialize DB on first two nodes. - Opts = opts(#{n_shards => 16, n_sites => 2, replication_factor => 3}), + Opts = opts(Config, #{n_shards => 16, n_sites => 2, replication_factor => 3}), %% Open DB: ?assertEqual( @@ -482,7 +484,7 @@ t_rebalance_offline_restarts(init, Config) -> {t_rebalance_offline_restarts2, #{apps => Apps}}, {t_rebalance_offline_restarts3, #{apps => Apps}} ], - #{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)} + #{work_dir => ?config(work_dir, Config)} ), Nodes = emqx_cth_cluster:start(Specs), [{nodes, Nodes}, {nodespecs, Specs} | Config]; @@ -498,7 +500,7 @@ t_rebalance_offline_restarts(Config) -> _Specs = [NS1, NS2, _] = ?config(nodespecs, Config), %% Initialize DB on all 3 nodes. - Opts = opts(#{n_shards => 8, n_sites => 3, replication_factor => 3}), + Opts = opts(Config, #{n_shards => 8, n_sites => 3, replication_factor => 3}), ?assertEqual( [{ok, ok} || _ <- Nodes], erpc:multicall(Nodes, emqx_ds, open_db, [?DB, Opts]) @@ -544,7 +546,7 @@ t_drop_generation(Config) -> {t_drop_generation3, #{apps => Apps}} ], #{ - work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config) + work_dir => ?config(work_dir, Config) } ), @@ -552,7 +554,7 @@ t_drop_generation(Config) -> ?check_trace( try %% Initialize DB on all 3 nodes. - Opts = opts(#{n_shards => 1, n_sites => 3, replication_factor => 3}), + Opts = opts(Config, #{n_shards => 1, n_sites => 3, replication_factor => 3}), ?assertEqual( [{ok, ok} || _ <- Nodes], erpc:multicall(Nodes, emqx_ds, open_db, [?DB, Opts]) @@ -614,21 +616,21 @@ t_drop_generation(Config) -> t_error_mapping_replication_layer(init, Config) -> Apps = emqx_cth_suite:start([emqx_ds_builtin_raft], #{ - work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config) + work_dir => ?config(work_dir, Config) }), [{apps, Apps} | Config]; t_error_mapping_replication_layer('end', Config) -> emqx_cth_suite:stop(?config(apps, Config)), Config. -t_error_mapping_replication_layer(_Config) -> +t_error_mapping_replication_layer(Config) -> %% This checks that the replication layer maps recoverable errors correctly. ok = emqx_ds_test_helpers:mock_rpc(), ok = snabbkaffe:start_trace(), DB = ?FUNCTION_NAME, - ?assertMatch(ok, emqx_ds:open_db(DB, (opts())#{n_shards => 2})), + ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config, #{n_shards => 2}))), [Shard1, Shard2] = emqx_ds_replication_layer_meta:shards(DB), TopicFilter = emqx_topic:words(<<"foo/#">>), @@ -695,7 +697,7 @@ t_error_mapping_replication_layer(_Config) -> Results2 = lists:map( fun(Iter) -> case emqx_ds:next(DB, Iter, _BatchSize = 42) of - Ok = {ok, _Iter, [_ | _]} -> + Ok = {ok, _Iter, _} -> Ok; Error = {error, recoverable, {badrpc, _}} -> Error; @@ -716,20 +718,20 @@ t_error_mapping_replication_layer(_Config) -> %% problems. 
t_store_batch_fail(init, Config) -> Apps = emqx_cth_suite:start([emqx_ds_builtin_raft], #{ - work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config) + work_dir => ?config(work_dir, Config) }), [{apps, Apps} | Config]; t_store_batch_fail('end', Config) -> emqx_cth_suite:stop(?config(apps, Config)), Config. -t_store_batch_fail(_Config) -> +t_store_batch_fail(Config) -> ?check_trace( #{timetrap => 15_000}, try meck:new(emqx_ds_storage_layer, [passthrough, no_history]), DB = ?FUNCTION_NAME, - ?assertMatch(ok, emqx_ds:open_db(DB, (opts())#{n_shards => 2})), + ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config, #{n_shards => 2}))), %% Success: Batch1 = [ message(<<"C1">>, <<"foo/bar">>, <<"1">>, 1), @@ -768,7 +770,7 @@ t_store_batch_fail(_Config) -> ), meck:unload(ra), ?assertMatch(ok, emqx_ds:store_batch(DB, Batch3, #{sync => true})), - lists:sort(emqx_ds_test_helpers:consume_per_stream(DB, ['#'], 1)) + lists:sort(emqx_ds_test_helpers:consume_per_stream(DB, ['#'], 0)) after meck:unload() end, @@ -803,7 +805,7 @@ t_crash_restart_recover(init, Config) -> {t_crash_stop_recover2, #{apps => Apps}}, {t_crash_stop_recover3, #{apps => Apps}} ], - #{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)} + #{work_dir => ?config(work_dir, Config)} ), Nodes = emqx_cth_cluster:start(Specs), [{nodes, Nodes}, {nodespecs, Specs} | Config]; @@ -815,7 +817,7 @@ t_crash_restart_recover(Config) -> %% correctly preserved. Nodes = [N1, N2, N3] = ?config(nodes, Config), _Specs = [_, NS2, NS3] = ?config(nodespecs, Config), - DBOpts = opts(#{n_shards => 16, n_sites => 3, replication_factor => 3}), + DBOpts = opts(Config, #{n_shards => 16, n_sites => 3, replication_factor => 3}), %% Prepare test event stream. NMsgs = 400, @@ -856,7 +858,10 @@ t_crash_restart_recover(Config) -> MatchFlushFailed = ?match_event(#{?snk_kind := emqx_ds_buffer_flush_failed}), {ok, SubRef} = snabbkaffe:subscribe(MatchFlushFailed, NMsgs, _Timeout = 5000, infinity), {timeout, Events} = snabbkaffe:receive_events(SubRef), - LostMessages = [M || #{batch := Messages} <- Events, M <- Messages], + LostMessages = [ + emqx_ds_test_helpers:message_canonical_form(M) + || #{batch := Messages} <- Events, M <- Messages + ], ct:pal("Some messages were lost: ~p", [LostMessages]), ?assert(length(LostMessages) < NMsgs div 20), @@ -876,8 +881,16 @@ t_crash_restart_recover(Config) -> %% Does any messages were lost unexpectedly? {_, DSMessages} = lists:unzip(emqx_utils_stream:consume(DSStream1)), ExpectedMessages = emqx_utils_stream:consume(ExpectedStream), - MissingMessages = ExpectedMessages -- DSMessages, - ?defer_assert(?assertEqual([], MissingMessages -- LostMessages, DSMessages)) + MissingMessages = emqx_ds_test_helpers:message_set_subtract( + ExpectedMessages, DSMessages + ), + ?defer_assert( + ?assertEqual( + [], + emqx_ds_test_helpers:sublist(MissingMessages -- LostMessages), + emqx_ds_test_helpers:sublist(DSMessages) + ) + ) end, lists:foreach(VerifyClient, TopicStreams) end, @@ -984,12 +997,35 @@ sample(N, List) -> suite() -> [{timetrap, {seconds, 60}}]. -all() -> emqx_common_test_helpers:all(?MODULE). +all() -> + [{group, Grp} || {Grp, _} <- groups()]. + +groups() -> + TCs = emqx_common_test_helpers:all(?MODULE), + [ + {bitfield_lts, TCs}, + {skipstream_lts, TCs} + ]. + +init_per_group(Group, Config) -> + LayoutConf = + case Group of + skipstream_lts -> + {emqx_ds_storage_skipstream_lts, #{with_guid => true}}; + bitfield_lts -> + {emqx_ds_storage_bitfield_lts, #{}} + end, + [{layout, LayoutConf} | Config]. 
+ +end_per_group(_Group, Config) -> + Config. init_per_testcase(TCName, Config0) -> - Config = emqx_common_test_helpers:init_per_testcase(?MODULE, TCName, Config0), - Config. + Config1 = [{work_dir, emqx_cth_suite:work_dir(TCName, Config0)} | Config0], + emqx_common_test_helpers:init_per_testcase(?MODULE, TCName, Config1). end_per_testcase(TCName, Config) -> ok = snabbkaffe:stop(), - emqx_common_test_helpers:end_per_testcase(?MODULE, TCName, Config). + Result = emqx_common_test_helpers:end_per_testcase(?MODULE, TCName, Config), + emqx_cth_suite:clean_work_dir(?config(work_dir, Config)), + Result. From afeb2ab8aa6b04c58edd36570e6aa41241504de0 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Mon, 1 Jul 2024 01:11:07 +0200 Subject: [PATCH 14/21] feat(ds): Add metrics for skipstream layout --- .../include/emqx_ds_metrics.hrl | 9 ++++ .../src/emqx_ds_builtin_metrics.erl | 17 +++++++- .../src/emqx_ds_storage_skipstream_lts.erl | 43 ++++++++++++++++++- apps/emqx_prometheus/src/emqx_prometheus.erl | 9 +++- 4 files changed, 73 insertions(+), 5 deletions(-) diff --git a/apps/emqx_durable_storage/include/emqx_ds_metrics.hrl b/apps/emqx_durable_storage/include/emqx_ds_metrics.hrl index 4e463b14f..6433b1d29 100644 --- a/apps/emqx_durable_storage/include/emqx_ds_metrics.hrl +++ b/apps/emqx_durable_storage/include/emqx_ds_metrics.hrl @@ -46,4 +46,13 @@ %% the value is rejected by the subsequent post-processing: -define(DS_BITFIELD_LTS_COLLISION_COUNTER, emqx_ds_storage_bitfield_lts_counter_collision). +%%% Skipstream LTS Storage counters: +-define(DS_SKIPSTREAM_LTS_SEEK, emqx_ds_storage_skipstream_lts_seek). +-define(DS_SKIPSTREAM_LTS_NEXT, emqx_ds_storage_skipstream_lts_next). +-define(DS_SKIPSTREAM_LTS_HASH_COLLISION, emqx_ds_storage_skipstream_lts_hash_collision). +-define(DS_SKIPSTREAM_LTS_HIT, emqx_ds_storage_skipstream_lts_hit). +-define(DS_SKIPSTREAM_LTS_MISS, emqx_ds_storage_skipstream_lts_miss). +-define(DS_SKIPSTREAM_LTS_FUTURE, emqx_ds_storage_skipstream_lts_future). +-define(DS_SKIPSTREAM_LTS_EOS, emqx_ds_storage_skipstream_lts_end_of_stream). + -endif. diff --git a/apps/emqx_durable_storage/src/emqx_ds_builtin_metrics.erl b/apps/emqx_durable_storage/src/emqx_ds_builtin_metrics.erl index 107e2e5e6..1ae37f321 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_builtin_metrics.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_builtin_metrics.erl @@ -36,7 +36,9 @@ inc_lts_seek_counter/2, inc_lts_next_counter/2, - inc_lts_collision_counter/2 + inc_lts_collision_counter/2, + + collect_shard_counter/3 ]). %% behavior callbacks: @@ -59,7 +61,14 @@ {slide, ?DS_STORE_BATCH_TIME}, {counter, ?DS_BITFIELD_LTS_SEEK_COUNTER}, {counter, ?DS_BITFIELD_LTS_NEXT_COUNTER}, - {counter, ?DS_BITFIELD_LTS_COLLISION_COUNTER} + {counter, ?DS_BITFIELD_LTS_COLLISION_COUNTER}, + {counter, ?DS_SKIPSTREAM_LTS_SEEK}, + {counter, ?DS_SKIPSTREAM_LTS_NEXT}, + {counter, ?DS_SKIPSTREAM_LTS_HASH_COLLISION}, + {counter, ?DS_SKIPSTREAM_LTS_HIT}, + {counter, ?DS_SKIPSTREAM_LTS_MISS}, + {counter, ?DS_SKIPSTREAM_LTS_FUTURE}, + {counter, ?DS_SKIPSTREAM_LTS_EOS} ]). -define(FETCH_METRICS, [ @@ -160,6 +169,10 @@ inc_lts_next_counter({DB, _}, Inc) -> inc_lts_collision_counter({DB, _}, Inc) -> catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_BITFIELD_LTS_COLLISION_COUNTER, Inc). +-spec collect_shard_counter(emqx_ds_storage_layer:shard_id(), atom(), non_neg_integer()) -> ok. +collect_shard_counter({DB, _}, Key, Inc) -> + catch emqx_metrics_worker:inc(?WORKER, DB, Key, Inc). 
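Unlike the dedicated inc_lts_*_counter helpers, collect_shard_counter/3 takes the metric key as an argument, so a storage layout can flush any counter it declares without further changes to this module. A call sketch, with the key taken from emqx_ds_metrics.hrl:

    %% flush a locally accumulated total for one shard:
    emqx_ds_builtin_metrics:collect_shard_counter({DB, Shard}, ?DS_SKIPSTREAM_LTS_SEEK, LocalTotal)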
+ prometheus_meta() -> lists:map( fun diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl index 7b0c49c6f..164050932 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl @@ -43,6 +43,7 @@ -include_lib("emqx_utils/include/emqx_message.hrl"). -include_lib("snabbkaffe/include/trace.hrl"). +-include("emqx_ds_metrics.hrl"). -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). @@ -260,7 +261,8 @@ update_iterator(_Shard, _Data, OldIter, DSKey) -> {ok, OldIter#it{ts = TS}} end. -next({_DB, Shard}, S, It, BatchSize, TMax, IsCurrent) -> +next(ShardId = {_DB, Shard}, S, It, BatchSize, TMax, IsCurrent) -> + init_counters(), Iterators = init_iterators(S, It), %% ?tp(notice, skipstream_init_iters, #{it => It, its => Iterators}), try @@ -271,7 +273,8 @@ next({_DB, Shard}, S, It, BatchSize, TMax, IsCurrent) -> Result end after - free_iterators(Iterators) + free_iterators(Iterators), + collect_counters(ShardId) end. delete_next(Shard, S, It0, Selector, BatchSize, Now, IsCurrent) -> @@ -447,12 +450,15 @@ next_loop(Shard, S, It0, Iterators, BatchSize, TMax, Op, Acc) -> case next_step(S, StaticIdx, CompressedTF, Iterators, undefined, Op) of none -> %% ?tp(notice, skipstream_loop_result, #{r => none}), + inc_counter(?DS_SKIPSTREAM_LTS_EOS), finalize_loop(It0, Op, Acc); {seek, TS} when TS > TMax -> %% ?tp(notice, skipstream_loop_result, #{r => seek_future, ts => TS}), + inc_counter(?DS_SKIPSTREAM_LTS_FUTURE), finalize_loop(It0, {seek, TS}, Acc); {ok, TS, _Key, _Msg0} when TS > TMax -> %% ?tp(notice, skipstream_loop_result, #{r => ok_future, ts => TS, key => _Key}), + inc_counter(?DS_SKIPSTREAM_LTS_FUTURE), finalize_loop(It0, {seek, TS}, Acc); {seek, TS} -> %% ?tp(notice, skipstream_loop_result, #{r => seek, ts => TS}), @@ -479,8 +485,10 @@ next_step( Result = case Op of next -> + inc_counter(?DS_SKIPSTREAM_LTS_NEXT), rocksdb:iterator_move(IH, next); {seek, TS} -> + inc_counter(?DS_SKIPSTREAM_LTS_SEEK), rocksdb:iterator_move(IH, {seek, mk_key(StaticIdx, N, Hash, TS)}) end, case Result of @@ -508,10 +516,12 @@ next_step( Msg0 = deserialize(S, Blob), case emqx_topic:match(Msg0#message.topic, CompressedTF) of true -> + inc_counter(?DS_SKIPSTREAM_LTS_HIT), {ok, NextTS, Key, Msg0}; false -> %% Hash collision. Advance to the %% next timestamp: + inc_counter(?DS_SKIPSTREAM_LTS_HASH_COLLISION), {seek, NextTS + 1} end; _ -> @@ -519,6 +529,7 @@ next_step( next_step(S, StaticIdx, CompressedTF, Iterators, NextTS, {seek, NextTS}) end; NextTS when NextTS > ExpectedTS, N > 0 -> + inc_counter(?DS_SKIPSTREAM_LTS_MISS), %% Next index level is not what we expect. {seek, NextTS} end @@ -672,3 +683,31 @@ words(<<>>) -> []; words(Bin) -> emqx_topic:words(Bin). + +%%%%%%%% Counters %%%%%%%%%% + +-define(COUNTERS, [ + ?DS_SKIPSTREAM_LTS_SEEK, + ?DS_SKIPSTREAM_LTS_NEXT, + ?DS_SKIPSTREAM_LTS_HASH_COLLISION, + ?DS_SKIPSTREAM_LTS_HIT, + ?DS_SKIPSTREAM_LTS_MISS, + ?DS_SKIPSTREAM_LTS_FUTURE, + ?DS_SKIPSTREAM_LTS_EOS +]). + +inc_counter(Counter) -> + N = get(Counter), + put(Counter, N + 1). + +init_counters() -> + _ = [put(I, 0) || I <- ?COUNTERS], + ok. + +collect_counters(Shard) -> + lists:foreach( + fun(Key) -> + emqx_ds_builtin_metrics:collect_shard_counter(Shard, Key, get(Key)) + end, + ?COUNTERS + ). 
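A note on the counter plumbing above: the hot loop touches only the process
dictionary, and `collect_counters/1' flushes the totals to the metrics worker
once per `next/6' call, keeping the per-step overhead down to a pair of BIF
calls. A self-contained sketch of the same pattern (the module and names below
are illustrative, not EMQX APIs):

    -module(pd_counters_sketch).
    -export([run/1]).

    %% Initialize the counter, bump it in a tight loop, flush once at the end:
    run(N) ->
        put(ops, 0),
        try
            lists:foreach(fun(_) -> put(ops, get(ops) + 1) end, lists:seq(1, N))
        after
            io:format("ops = ~p~n", [get(ops)])
        end.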
diff --git a/apps/emqx_prometheus/src/emqx_prometheus.erl b/apps/emqx_prometheus/src/emqx_prometheus.erl
index e205952d2..c1d30e604 100644
--- a/apps/emqx_prometheus/src/emqx_prometheus.erl
+++ b/apps/emqx_prometheus/src/emqx_prometheus.erl
@@ -514,7 +514,14 @@ emqx_collect(K = ?DS_STORE_BATCH_TIME, D) -> gauge_metrics(?MG(K, D, []));
 emqx_collect(K = ?DS_BUILTIN_NEXT_TIME, D) -> gauge_metrics(?MG(K, D, []));
 emqx_collect(K = ?DS_BITFIELD_LTS_SEEK_COUNTER, D) -> counter_metrics(?MG(K, D, []));
 emqx_collect(K = ?DS_BITFIELD_LTS_NEXT_COUNTER, D) -> counter_metrics(?MG(K, D, []));
-emqx_collect(K = ?DS_BITFIELD_LTS_COLLISION_COUNTER, D) -> counter_metrics(?MG(K, D, [])).
+emqx_collect(K = ?DS_BITFIELD_LTS_COLLISION_COUNTER, D) -> counter_metrics(?MG(K, D, []));
+emqx_collect(K = ?DS_SKIPSTREAM_LTS_SEEK, D) -> counter_metrics(?MG(K, D, []));
+emqx_collect(K = ?DS_SKIPSTREAM_LTS_NEXT, D) -> counter_metrics(?MG(K, D, []));
+emqx_collect(K = ?DS_SKIPSTREAM_LTS_HASH_COLLISION, D) -> counter_metrics(?MG(K, D, []));
+emqx_collect(K = ?DS_SKIPSTREAM_LTS_HIT, D) -> counter_metrics(?MG(K, D, []));
+emqx_collect(K = ?DS_SKIPSTREAM_LTS_MISS, D) -> counter_metrics(?MG(K, D, []));
+emqx_collect(K = ?DS_SKIPSTREAM_LTS_FUTURE, D) -> counter_metrics(?MG(K, D, []));
+emqx_collect(K = ?DS_SKIPSTREAM_LTS_EOS, D) -> counter_metrics(?MG(K, D, [])).
 
 %%--------------------------------------------------------------------
 %% Indicators

From 23dafbb03b0b981abf9b8a7844c894663e9cd054 Mon Sep 17 00:00:00 2001
From: ieQu1 <99872536+ieQu1@users.noreply.github.com>
Date: Mon, 1 Jul 2024 01:15:14 +0200
Subject: [PATCH 15/21] feat(ds): Add a benchmarking tool for storage
 efficiency analysis

---
 .../dev/storage_efficiency.erl                | 223 ++++++++++++++++++
 1 file changed, 223 insertions(+)
 create mode 100644 apps/emqx_durable_storage/dev/storage_efficiency.erl

diff --git a/apps/emqx_durable_storage/dev/storage_efficiency.erl b/apps/emqx_durable_storage/dev/storage_efficiency.erl
new file mode 100644
index 000000000..f14a14a36
--- /dev/null
+++ b/apps/emqx_durable_storage/dev/storage_efficiency.erl
@@ -0,0 +1,223 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+%% @doc This script can be loaded into a running EMQX EE node. It will
+%% create a number of DS databases with different options and fill
+%% them with data of a given size.
+%%
+%% Then it will measure the size of the database directories and
+%% create a "storage (in)efficiency" report.
+-module(storage_efficiency).
+
+-include_lib("emqx_utils/include/emqx_message.hrl").
+
+%% API:
+-export([run/0, run/1]).
+
+%%================================================================================
+%% API functions
+%%================================================================================
+
+run() ->
+    run(#{}).
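+
+%% Example invocation from the node's remote console, overriding a few
+%% of the defaults documented in run/1 below (values are illustrative):
+%%
+%%   storage_efficiency:run(#{batches => 10, size => 100, dry_run => false}).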
+
+run(Custom) ->
+    RunConf = maps:merge(
+        #{
+            %% Sleep between batches:
+            sleep => 1_000,
+            %% Don't run test, only plot data:
+            dry_run => false,
+            %% Payload size multiplier:
+            size => 10,
+            %% Number of batches:
+            batches => 100,
+            %% Add generation every N batches:
+            add_generation => 10
+        },
+        Custom
+    ),
+    lists:foreach(
+        fun(DBConf) ->
+            run(DBConf, RunConf)
+        end,
+        configs()
+    ).
+
+%% erlfmt-ignore
+gnuplot_script(Filename) ->
+    "set terminal qt\n"
+    %% "set logscale y 10\n"
+    "set title \"" ++ filename:basename(Filename, ".dat") ++ "\"\n"
+    "set key autotitle columnheader\n"
+    "plot for [n=2:*] \"" ++ Filename ++ "\" using 1:n with linespoints".
+
+%%================================================================================
+%% Internal functions
+%%================================================================================
+
+configs() ->
+    [
+        {'benchmark-skipstream-asn1',
+            db_conf({emqx_ds_storage_skipstream_lts, #{serialization_schema => asn1}})},
+        {'benchmark-skipstream-v1',
+            db_conf({emqx_ds_storage_skipstream_lts, #{serialization_schema => v1}})},
+        {'benchmark-bitfield', db_conf({emqx_ds_storage_bitfield_lts, #{}})}
+    ].
+
+db_conf(Storage) ->
+    #{
+        backend => builtin_local,
+        %% n_sites => 1,
+        n_shards => 1,
+        %% replication_factor => 1,
+        %% replication_options => #{},
+        storage => Storage
+    }.
+
+-record(s, {
+    data_size = 0,
+    payload_size = 0,
+    n_messages = 0,
+    datapoints = #{},
+    x_axis = []
+}).
+
+run({DB, Config}, RunConf) ->
+    #{
+        batches := NBatches,
+        size := PSMultiplier,
+        add_generation := AddGeneration,
+        sleep := Sleep,
+        dry_run := DryRun
+    } = RunConf,
+    {ok, _} = application:ensure_all_started(emqx_ds_backends),
+    Dir = dir(DB),
+    Filename = atom_to_list(DB) ++ ".dat",
+    DryRun orelse
+        begin
+            io:format(user, "Running benchmark for ~p in ~p~n", [DB, Dir]),
+            %% Ensure safe directory:
+            {match, _} = re:run(Dir, filename:join("data", DB)),
+            %% Ensure clean state:
+            ok = emqx_ds:open_db(DB, Config),
+            ok = emqx_ds:drop_db(DB),
+            ok = file:del_dir_r(Dir),
+            %% Open a fresh DB:
+            ok = emqx_ds:open_db(DB, Config),
+            S = lists:foldl(
+                fun(Batch, Acc0) ->
+                    Size = PSMultiplier * Batch,
+                    io:format(user, "Storing batch with payload size ~p~n", [Size]),
+                    Acc1 = store_batch(DB, Size, Acc0),
+                    %% Sleep so all data is hopefully flushed:
+                    timer:sleep(Sleep),
+                    (Batch rem AddGeneration) =:= 0 andalso
+                        emqx_ds:add_generation(DB),
+                    collect_datapoint(DB, Acc1)
+                end,
+                collect_datapoint(DB, #s{}),
+                lists:seq(1, NBatches)
+            ),
+            {ok, FD} = file:open(Filename, [write]),
+            io:put_chars(FD, print(S)),
+            file:close(FD)
+        end,
+    os:cmd("echo '" ++ gnuplot_script(Filename) ++ "' | gnuplot --persist -"),
+    ok.
+
+collect_datapoint(
+    DB, S0 = #s{n_messages = N, data_size = DS, payload_size = PS, datapoints = DP0, x_axis = X}
+) ->
+    NewData = [{"$_n", N}, {"$data", DS}, {"$payloads", PS} | dirsize(DB)],
+    DP = lists:foldl(
+        fun({Key, Val}, Acc) ->
+            maps:update_with(
+                Key,
+                fun(M) -> M#{N => Val} end,
+                #{N => Val},
+                Acc
+            )
+        end,
+        DP0,
+        NewData
+    ),
+    S0#s{
+        datapoints = DP,
+        x_axis = [N | X]
+    }.
+
+print(#s{x_axis = XX, datapoints = DP}) ->
+    Cols = lists:sort(maps:keys(DP)),
+    Lines = [
+        %% Print header:
+        Cols
+        %% Scan through rows:
+        | [
+            %% Scan through columns:
+            [integer_to_binary(maps:get(X, maps:get(Col, DP), 0)) || Col <- Cols]
+         || X <- lists:reverse(XX)
+        ]
+    ],
+    lists:join(
+        "\n",
+        [lists:join(" ", Line) || Line <- Lines]
+    ).
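+
+%% Layout of the resulting .dat file: one row per collected datapoint,
+%% one column per key, sorted by name: "$_n" (messages stored so far),
+%% "$data" (cumulative "ideal" size, see msg_size/1 below), "$payloads"
+%% (cumulative payload bytes), plus one column per du(1) subdirectory.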
+ +dirsize(DB) -> + RawOutput = os:cmd("cd " ++ dir(DB) ++ "; du -b --max-depth 1 ."), + [ + begin + [Sz, Dir] = string:lexemes(L, "\t"), + {Dir, list_to_integer(Sz)} + end + || L <- string:lexemes(RawOutput, "\n") + ]. + +dir(DB) -> + filename:join(emqx_ds_storage_layer:base_dir(), DB). + +store_batch(DB, PayloadSize, S0 = #s{n_messages = N, data_size = DS, payload_size = PS}) -> + From = rand:bytes(16), + BatchSize = 50, + Batch = [ + #message{ + id = emqx_guid:gen(), + timestamp = emqx_message:timestamp_now(), + payload = rand:bytes(PayloadSize), + from = From, + topic = emqx_topic:join([ + <<"blah">>, + <<"blah">>, + '', + <<"blah">>, + From, + <<"bazzzzzzzzzzzzzzzzzzzzzzz">>, + integer_to_binary(I) + ]) + } + || I <- lists:seq(1, BatchSize) + ], + ok = emqx_ds:store_batch(DB, Batch, #{sync => true}), + S0#s{ + n_messages = N + length(Batch), + data_size = DS + lists:sum(lists:map(fun msg_size/1, Batch)), + payload_size = PS + length(Batch) * PayloadSize + }. + +%% We consider MQTT wire encoding to be "close to the ideal". +msg_size(Msg = #message{}) -> + iolist_size(emqx_frame:serialize(emqx_message:to_packet(undefined, Msg))). From 661f79544b6056ad8a5f8e38d0465de47e1d1379 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Thu, 4 Jul 2024 00:44:19 +0200 Subject: [PATCH 16/21] fix(ds): Optimize hot loop of skipstream storage layout --- .../src/emqx_ds_storage_skipstream_lts.erl | 90 ++++++++++++------- 1 file changed, 58 insertions(+), 32 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl index 164050932..bb786c1f0 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl @@ -293,6 +293,21 @@ delete_next(Shard, S, It0, Selector, BatchSize, Now, IsCurrent) -> %% Internal functions %%================================================================================ +%% Loop context: +-record(ctx, { + shard, + %% Generation runtime state + s, + %% RocksDB iterators: + iters, + %% Cached topic structure for the static index: + topic_structure, + %% Maximum time: + tmax, + %% Compressed topic filter, split into words: + filter +}). + get_streams(Trie, TopicFilter) -> lists:map( fun({Static, _Varying}) -> @@ -316,34 +331,19 @@ serialize(#s{serialization_schema = SSchema, with_guid = WithGuid}, Varying, Msg emqx_ds_msg_serializer:serialize(SSchema, Msg). enrich( - Shard, - #s{trie = Trie, with_guid = WithGuid}, + #ctx{shard = Shard, topic_structure = Structure, s = #s{with_guid = WithGuid}}, DSKey, - StaticKey, Msg0 ) -> - case emqx_ds_lts:reverse_lookup(Trie, StaticKey) of - {ok, Structure} -> - %% Reconstruct the original topic from the static topic - %% index and varying parts: - Topic = emqx_topic:join( - emqx_ds_lts:decompress_topic(Structure, words(Msg0#message.topic)) - ), - Msg0#message{ - topic = Topic, - id = - case WithGuid of - true -> Msg0#message.id; - false -> fake_guid(Shard, DSKey) - end - }; - undefined -> - Err = #{ - msg => "LTS trie missing key", - key => StaticKey - }, - throw({unrecoverable, Err}) - end. + Topic = emqx_topic:join(emqx_ds_lts:decompress_topic(Structure, words(Msg0#message.topic))), + Msg0#message{ + topic = Topic, + id = + case WithGuid of + true -> Msg0#message.id; + false -> fake_guid(Shard, DSKey) + end + }. 
deserialize( #s{serialization_schema = SSchema}, @@ -437,15 +437,41 @@ do_init_iterators(S, Static, [], _WildcardLevel) -> } ]. -next_loop(Shard, S, It = #it{ts = TS}, Iterators, BatchSize, TMax) -> - next_loop(Shard, S, It, Iterators, BatchSize, TMax, {seek, TS}, []). +next_loop( + Shard, + S = #s{trie = Trie}, + It = #it{static_index = StaticIdx, ts = TS, compressed_tf = CompressedTF}, + Iterators, + BatchSize, + TMax +) -> + TopicStructure = + case emqx_ds_lts:reverse_lookup(Trie, StaticIdx) of + {ok, Rev} -> + Rev; + undefined -> + throw(#{ + msg => "LTS trie missing key", + key => StaticIdx + }) + end, + Ctx = #ctx{ + shard = Shard, + s = S, + iters = Iterators, + topic_structure = TopicStructure, + filter = words(CompressedTF), + tmax = TMax + }, + next_loop(Ctx, It, BatchSize, {seek, TS}, []). -next_loop(_Shard, _S, It, _Iterators, 0, _TMax, Op, Acc) -> +next_loop(_Ctx, It, 0, Op, Acc) -> finalize_loop(It, Op, Acc); -next_loop(Shard, S, It0, Iterators, BatchSize, TMax, Op, Acc) -> +next_loop(Ctx, It0, BatchSize, Op, Acc) -> %% ?tp(notice, skipstream_loop, #{ %% ts => It0#it.ts, tf => It0#it.compressed_tf, bs => BatchSize, tmax => TMax, op => Op %% }), + #ctx{s = S, tmax = TMax, iters = Iterators} = Ctx, #it{static_index = StaticIdx, compressed_tf = CompressedTF} = It0, case next_step(S, StaticIdx, CompressedTF, Iterators, undefined, Op) of none -> @@ -463,12 +489,12 @@ next_loop(Shard, S, It0, Iterators, BatchSize, TMax, Op, Acc) -> {seek, TS} -> %% ?tp(notice, skipstream_loop_result, #{r => seek, ts => TS}), It = It0#it{ts = TS}, - next_loop(Shard, S, It, Iterators, BatchSize, TMax, {seek, TS}, Acc); + next_loop(Ctx, It, BatchSize, {seek, TS}, Acc); {ok, TS, DSKey, Msg0} -> %% ?tp(notice, skipstream_loop_result, #{r => ok, ts => TS, key => Key}), - Message = enrich(Shard, S, DSKey, StaticIdx, Msg0), + Message = enrich(Ctx, DSKey, Msg0), It = It0#it{ts = TS}, - next_loop(Shard, S, It, Iterators, BatchSize - 1, TMax, next, [{DSKey, Message} | Acc]) + next_loop(Ctx, It, BatchSize - 1, next, [{DSKey, Message} | Acc]) end. finalize_loop(It0, Op, Acc) -> From d1b574a67e8814ec3dcade7199eb3e4ce647dd69 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Thu, 4 Jul 2024 01:47:30 +0200 Subject: [PATCH 17/21] perf(dslocal): Run heavy-duty operations in a temporary process --- .../src/emqx_ds_builtin_local.erl | 76 ++++++++++++++----- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local.erl b/apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local.erl index 5fe6eb559..28e307832 100644 --- a/apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local.erl +++ b/apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local.erl @@ -46,6 +46,12 @@ shard_of_message/4 ]). +%% Internal exports: +-export([ + do_next/3, + do_delete_next/4 +]). + -export_type([db_opts/0, shard/0, iterator/0, delete_iterator/0]). -include_lib("emqx_utils/include/emqx_message.hrl"). @@ -295,19 +301,8 @@ update_iterator(DB, Iter0 = #{?tag := ?IT, ?shard := Shard, ?enc := StorageIter0 end. -spec next(emqx_ds:db(), iterator(), pos_integer()) -> emqx_ds:next_result(iterator()). 
-next(DB, Iter0 = #{?tag := ?IT, ?shard := Shard, ?enc := StorageIter0}, N) ->
-    ShardId = {DB, Shard},
-    T0 = erlang:monotonic_time(microsecond),
-    Result = emqx_ds_storage_layer:next(ShardId, StorageIter0, N, current_timestamp(ShardId)),
-    T1 = erlang:monotonic_time(microsecond),
-    emqx_ds_builtin_metrics:observe_next_time(DB, T1 - T0),
-    case Result of
-        {ok, StorageIter, Batch} ->
-            Iter = Iter0#{?enc := StorageIter},
-            {ok, Iter, Batch};
-        Other ->
-            Other
-    end.
+next(DB, Iter, N) ->
+    with_worker(do_next, [DB, Iter, N]).
 
 -spec get_delete_streams(emqx_ds:db(), emqx_ds:topic_filter(), emqx_ds:time()) ->
     [emqx_ds:ds_specific_delete_stream()].
@@ -347,7 +342,36 @@ make_delete_iterator(DB, ?delete_stream(Shard, InnerStream), TopicFilter, StartT
 
 -spec delete_next(emqx_ds:db(), delete_iterator(), emqx_ds:delete_selector(), pos_integer()) ->
     emqx_ds:delete_next_result(emqx_ds:delete_iterator()).
-delete_next(DB, Iter = #{?tag := ?DELETE_IT, ?shard := Shard, ?enc := StorageIter0}, Selector, N) ->
+delete_next(DB, Iter, Selector, N) ->
+    with_worker(do_delete_next, [DB, Iter, Selector, N]).
+
+%%================================================================================
+%% Internal exports
+%%================================================================================
+
+current_timestamp(ShardId) ->
+    emqx_ds_builtin_local_meta:current_timestamp(ShardId).
+
+-spec do_next(emqx_ds:db(), iterator(), pos_integer()) -> emqx_ds:next_result(iterator()).
+do_next(DB, Iter0 = #{?tag := ?IT, ?shard := Shard, ?enc := StorageIter0}, N) ->
+    ShardId = {DB, Shard},
+    T0 = erlang:monotonic_time(microsecond),
+    Result = emqx_ds_storage_layer:next(ShardId, StorageIter0, N, current_timestamp(ShardId)),
+    T1 = erlang:monotonic_time(microsecond),
+    emqx_ds_builtin_metrics:observe_next_time(DB, T1 - T0),
+    case Result of
+        {ok, StorageIter, Batch} ->
+            Iter = Iter0#{?enc := StorageIter},
+            {ok, Iter, Batch};
+        Other ->
+            Other
+    end.
+
+-spec do_delete_next(emqx_ds:db(), delete_iterator(), emqx_ds:delete_selector(), pos_integer()) ->
+    emqx_ds:delete_next_result(emqx_ds:delete_iterator()).
+do_delete_next(
+    DB, Iter = #{?tag := ?DELETE_IT, ?shard := Shard, ?enc := StorageIter0}, Selector, N
+) ->
     ShardId = {DB, Shard},
     case
         emqx_ds_storage_layer:delete_next(
@@ -362,13 +386,6 @@ delete_next(DB, Iter = #{?tag := ?DELETE_IT, ?shard := Shard, ?enc := StorageIte
             Error
     end.
 
-%%================================================================================
-%% Internal exports
-%%================================================================================
-
-current_timestamp(ShardId) ->
-    emqx_ds_builtin_local_meta:current_timestamp(ShardId).
-
 %%================================================================================
 %% Internal functions
 %%================================================================================
@@ -380,3 +397,20 @@ timeus_to_timestamp(undefined) ->
     undefined;
 timeus_to_timestamp(TimestampUs) ->
     TimestampUs div 1000.
+
+with_worker(F, A) ->
+    Parent = self(),
+    Ref = make_ref(),
+    {_Pid, MRef} = spawn_opt(
+        fun() ->
+            Parent ! {Ref, apply(?MODULE, F, A)}
+        end,
+        [monitor, {min_heap_size, 10000}]
+    ),
+    receive
+        {Ref, Result} ->
+            erlang:demonitor(MRef, [flush]),
+            Result;
+        {'DOWN', MRef, _, _, Info} ->
+            {error, unrecoverable, Info}
+    end.
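
With `next/3' and `delete_next/4' delegated to a one-shot worker, iteration
garbage never lands on the caller's heap (it is reclaimed wholesale when the
worker exits), and a crash inside the storage layer surfaces as an error tuple
instead of taking the caller down. A caller-side sketch of the resulting
contract (`handle_batch/1' and the retry policy are placeholders, not part of
this patch):

    consume(DB, It0) ->
        case emqx_ds:next(DB, It0, 100) of
            {ok, It, Batch} ->
                ok = handle_batch(Batch),
                consume(DB, It);
            {ok, end_of_stream} ->
                done;
            {error, recoverable, _Reason} ->
                %% E.g. the shard is temporarily unavailable; retry later:
                timer:sleep(100),
                consume(DB, It0);
            {error, unrecoverable, Reason} ->
                %% The temporary worker died; Reason carries the 'DOWN' info:
                {failed, Reason}
        end.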

From dc4ae827985cc21b9c36265fff7838b2cd02d45e Mon Sep 17 00:00:00 2001
From: ieQu1 <99872536+ieQu1@users.noreply.github.com>
Date: Thu, 4 Jul 2024 16:51:03 +0200
Subject: [PATCH 18/21] test(ds): Add message ID

---
 apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl b/apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl
index fe903fad2..08c08e0c5 100644
--- a/apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl
+++ b/apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl
@@ -122,6 +122,7 @@ topic_messages(TestCase, ClientId, N) ->
     fun() ->
         NBin = integer_to_binary(N),
         Msg = #message{
+            id = <<N:128>>,
             from = ClientId,
             topic = client_topic(TestCase, ClientId),
             timestamp = N * 100,

From e70c1cfea325fd1b05fc9f4c4016aaa0945b7940 Mon Sep 17 00:00:00 2001
From: ieQu1 <99872536+ieQu1@users.noreply.github.com>
Date: Thu, 4 Jul 2024 18:42:53 +0200
Subject: [PATCH 19/21] test(ds): Improve stability of replication test suite

---
 apps/emqx/test/emqx_cth_suite.erl             |  1 +
 .../test/emqx_ds_replication_SUITE.erl        | 17 +++++++++--------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/apps/emqx/test/emqx_cth_suite.erl b/apps/emqx/test/emqx_cth_suite.erl
index f4fcd0a75..5fe4dce66 100644
--- a/apps/emqx/test/emqx_cth_suite.erl
+++ b/apps/emqx/test/emqx_cth_suite.erl
@@ -163,6 +163,7 @@ start(Apps, SuiteOpts = #{work_dir := WorkDir}) ->
     % 4. Setup isolated mnesia directory
     ok = emqx_common_test_helpers:load(mnesia),
     ok = application:set_env(mnesia, dir, filename:join([WorkDir, mnesia])),
+    ok = application:set_env(emqx_durable_storage, db_data_dir, filename:join([WorkDir, ds])),
     % 5. Start ekka separately.
     % For some reason it's designed to be started in a non-regular way, so we have to track
     % applications started in the process manually.
diff --git a/apps/emqx_ds_builtin_raft/test/emqx_ds_replication_SUITE.erl b/apps/emqx_ds_builtin_raft/test/emqx_ds_replication_SUITE.erl
index 92ceb2e16..7fcc7ee1c 100644
--- a/apps/emqx_ds_builtin_raft/test/emqx_ds_replication_SUITE.erl
+++ b/apps/emqx_ds_builtin_raft/test/emqx_ds_replication_SUITE.erl
@@ -458,7 +458,7 @@ t_rebalance_chaotic_converges(Config) ->
     emqx_ds_test_helpers:apply_stream(?DB, Nodes, Stream),
 
     %% Wait for the last transition to complete.
-    ?retry(500, 20, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB))),
+    ?retry(1000, 20, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB))),
 
     ?defer_assert(
         ?assertEqual(
@@ -726,11 +726,11 @@ t_store_batch_fail('end', Config) ->
     Config.
t_store_batch_fail(Config) ->
+    DB = ?FUNCTION_NAME,
     ?check_trace(
         #{timetrap => 15_000},
         try
-            meck:new(emqx_ds_storage_layer, [passthrough, no_history]),
-            DB = ?FUNCTION_NAME,
+            ok = meck:new(emqx_ds_storage_layer, [passthrough, no_history]),
             ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config, #{n_shards => 2}))),
             %% Success:
             Batch1 = [
@@ -739,7 +739,7 @@ t_store_batch_fail(Config) ->
             ],
             ?assertMatch(ok, emqx_ds:store_batch(DB, Batch1, #{sync => true})),
             %% Inject unrecoverable error:
-            meck:expect(emqx_ds_storage_layer, store_batch, fun(_DB, _Shard, _Messages) ->
+            ok = meck:expect(emqx_ds_storage_layer, store_batch, fun(_DB, _Shard, _Messages) ->
                 {error, unrecoverable, mock}
             end),
             Batch2 = [
@@ -749,10 +749,10 @@ t_store_batch_fail(Config) ->
             ?assertMatch(
                 {error, unrecoverable, mock}, emqx_ds:store_batch(DB, Batch2, #{sync => true})
             ),
-            meck:unload(emqx_ds_storage_layer),
+            ok = meck:unload(emqx_ds_storage_layer),
             %% Inject a recoverable error:
-            meck:new(ra, [passthrough, no_history]),
-            meck:expect(ra, process_command, fun(Servers, Shard, Command) ->
+            ok = meck:new(ra, [passthrough, no_history]),
+            ok = meck:expect(ra, process_command, fun(Servers, Shard, Command) ->
                 ?tp(ra_command, #{servers => Servers, shard => Shard, command => Command}),
                 {timeout, mock}
             end),
@@ -768,7 +768,7 @@ t_store_batch_fail(Config) ->
                 {error, recoverable, {timeout, mock}},
                 emqx_ds:store_batch(DB, Batch3, #{sync => true})
             ),
-            meck:unload(ra),
+            ok = meck:unload(ra),
             ?assertMatch(ok, emqx_ds:store_batch(DB, Batch3, #{sync => true})),
             lists:sort(emqx_ds_test_helpers:consume_per_stream(DB, ['#'], 0))
         after
@@ -1027,5 +1027,6 @@ init_per_testcase(TCName, Config0) ->
 end_per_testcase(TCName, Config) ->
     ok = snabbkaffe:stop(),
     Result = emqx_common_test_helpers:end_per_testcase(?MODULE, TCName, Config),
+    catch emqx_ds:drop_db(TCName),
     emqx_cth_suite:clean_work_dir(?config(work_dir, Config)),
     Result.

From d7732a6aac79e4a0c6e77f61a2c3f46ec42950f5 Mon Sep 17 00:00:00 2001
From: ieQu1 <99872536+ieQu1@users.noreply.github.com>
Date: Thu, 4 Jul 2024 19:46:28 +0200
Subject: [PATCH 20/21] test(ds): Attempt to stabilize a flaky test

---
 .../test/emqx_ds_replication_SUITE.erl            | 8 ++++----
 changes/ce/feat-13370.en.md                       | 9 +++++++++
 2 files changed, 13 insertions(+), 4 deletions(-)
 create mode 100644 changes/ce/feat-13370.en.md

diff --git a/apps/emqx_ds_builtin_raft/test/emqx_ds_replication_SUITE.erl b/apps/emqx_ds_builtin_raft/test/emqx_ds_replication_SUITE.erl
index 7fcc7ee1c..9b53bddff 100644
--- a/apps/emqx_ds_builtin_raft/test/emqx_ds_replication_SUITE.erl
+++ b/apps/emqx_ds_builtin_raft/test/emqx_ds_replication_SUITE.erl
@@ -220,7 +220,7 @@ t_rebalance(Config) ->
         {ok, [_]},
         ?ON(N1, emqx_ds_replication_layer_meta:assign_db_sites(?DB, [S1]))
     ),
-    ?retry(1000, 10, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB))),
+    ?retry(1000, 20, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB))),
     ?retry(500, 10, ?assertMatch(Shards when length(Shards) == 16, shards_online(N1, ?DB))),
 
     ct:pal("Sites: ~p~n", [Sites]),
@@ -295,7 +295,7 @@ t_rebalance(Config) ->
     ct:pal("Transitions (~p -> ~p): ~p~n", [
         Sites, tl(Sites), emqx_ds_test_helpers:transitions(N1, ?DB)
    ]),
-    ?retry(1000, 10, ?assertEqual([], emqx_ds_test_helpers:transitions(N2, ?DB))),
+    ?retry(1000, 20, ?assertEqual([], emqx_ds_test_helpers:transitions(N2, ?DB))),
 
     %% Verify that at the end each node is now responsible for each shard.
?defer_assert(
@@ -372,7 +372,7 @@ t_join_leave_errors(Config) ->
     ?assertMatch({ok, _}, ds_repl_meta(N2, leave_db_site, [?FUNCTION_NAME, S1])),
     ?assertMatch([_ | _], emqx_ds_test_helpers:transitions(N1, ?FUNCTION_NAME)),
     ?retry(
-        1000, 10, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?FUNCTION_NAME))
+        1000, 20, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?FUNCTION_NAME))
     ),
 
     %% Should be no-op.
@@ -458,7 +458,7 @@ t_rebalance_chaotic_converges(Config) ->
     emqx_ds_test_helpers:apply_stream(?DB, Nodes, Stream),
 
     %% Wait for the last transition to complete.
-    ?retry(1000, 20, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB))),
+    ?retry(1000, 30, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB))),
 
     ?defer_assert(
         ?assertEqual(
diff --git a/changes/ce/feat-13370.en.md b/changes/ce/feat-13370.en.md
new file mode 100644
index 000000000..a44f94eea
--- /dev/null
+++ b/changes/ce/feat-13370.en.md
@@ -0,0 +1,9 @@
+Add a new version of `wildcard_optimized` storage layout for the durable storage.
+
+Improvements:
+
+- New layout does not introduce latency
+
+- On-disk serialization of MQTT messages is much more efficient
+  - Metadata is encoded more efficiently
+  - Message topic is compressed

From 3721be65ee9ed2312abdb5bca31bb1edcb5a81bb Mon Sep 17 00:00:00 2001
From: ieQu1 <99872536+ieQu1@users.noreply.github.com>
Date: Fri, 5 Jul 2024 00:12:19 +0200
Subject: [PATCH 21/21] fix(ds): Improve comments

---
 .../src/emqx_ds_storage_skipstream_lts.erl          | 12 +++++++++++-
 changes/ce/feat-13370.en.md                         |  6 ++----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl
index bb786c1f0..f555241f2 100644
--- a/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds_storage_skipstream_lts.erl
@@ -101,7 +101,9 @@
 
 -record(it, {
     static_index :: emqx_ds_lts:static_key(),
+    %% Minimal timestamp of the next message:
     ts :: ts(),
+    %% Compressed topic filter:
     compressed_tf :: binary()
 }).
 
@@ -555,8 +557,16 @@ next_step(
                             next_step(S, StaticIdx, CompressedTF, Iterators, NextTS, {seek, NextTS})
                     end;
                 NextTS when NextTS > ExpectedTS, N > 0 ->
+                    %% Next index level is not what we expect. Reset
+                    %% search to the first wildcard index, but continue
+                    %% from `NextTS'.
+                    %%
+                    %% Note: if `NextTS > ExpectedTS' and `N =:= 0',
+                    %% it means the upper (replication) level is
+                    %% broken and supplied us a NextTS that advanced
+                    %% past the point of time that can be safely read.
+                    %% We don't handle it here.
                     inc_counter(?DS_SKIPSTREAM_LTS_MISS),
-                    %% Next index level is not what we expect.
                     {seek, NextTS}
             end
         end.
diff --git a/changes/ce/feat-13370.en.md b/changes/ce/feat-13370.en.md
index a44f94eea..8ed0cc75e 100644
--- a/changes/ce/feat-13370.en.md
+++ b/changes/ce/feat-13370.en.md
@@ -2,8 +2,6 @@ Add a new version of `wildcard_optimized` storage layout for the durable storage
 
 Improvements:
 
-- New layout does not introduce latency
+- New layout does not have an inherent latency
 
-- On-disk serialization of MQTT messages is much more efficient
-  - Metadata is encoded more efficiently
-  - Message topic is compressed
+- MQTT messages are serialized into a much more space-efficient format
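
For reference, the new layout is selected per database through the storage
option, in the same shape the benchmarking tool above uses (the database name
and the option values here are illustrative, not defaults):

    ok = emqx_ds:open_db(my_db, #{
        backend => builtin_local,
        n_shards => 1,
        storage =>
            {emqx_ds_storage_skipstream_lts, #{
                %% Serialization schema, as benchmarked above: asn1 | v1
                serialization_schema => asn1,
                %% Keep the original message IDs instead of deriving fake GUIDs:
                with_guid => true
            }}
    }).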