feat(ds): Add a benchmarking tool for storage efficiency analysis

This commit is contained in:
ieQu1 2024-07-01 01:15:14 +02:00
parent afeb2ab8aa
commit 23dafbb03b
No known key found for this signature in database
GPG Key ID: 488654DF3FED6FDE
1 changed files with 223 additions and 0 deletions

View File

@ -0,0 +1,223 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
%% @doc This script can be loaded to a running EMQX EE node. It will
%% create a number of DS databases with different options and fill
%% them with data of given size.
%%
%% Then it will measure size of the database directories and create
%% a "storage (in)efficiency" report.
-module(storage_efficiency).
-include_lib("emqx_utils/include/emqx_message.hrl").
%% API:
-export([run/0, run/1]).
%%================================================================================
%% API functions
%%================================================================================
run() ->
run(#{}).
run(Custom) ->
RunConf = maps:merge(
#{
%% Sleep between batches:
sleep => 1_000,
%% Don't run test, only plot data:
dry_run => false,
%% Payload size multiplier:
size => 10,
%% Number of batches:
batches => 100,
%% Add generation every N batches:
add_generation => 10
},
Custom
),
lists:foreach(
fun(DBConf) ->
run(DBConf, RunConf)
end,
configs()
).
%% erlfmt-ignore
gnuplot_script(Filename) ->
"set terminal qt\n"
%% "set logscale y 10\n"
"set title \"" ++ filename:basename(Filename, ".dat") ++ "\"\n"
"set key autotitle columnheader\n"
"plot for [n=2:*] \"" ++ Filename ++ "\" using 1:n with linespoints".
%%================================================================================
%% Internal functions
%%================================================================================
configs() ->
[
{'benchmark-skipstream-asn1',
db_conf({emqx_ds_storage_skipstream_lts, #{serialization_schema => asn1}})},
{'benchmark-skipstream-v1',
db_conf({emqx_ds_storage_skipstream_lts, #{serialization_schema => v1}})},
{'benchmark-bitfield', db_conf({emqx_ds_storage_bitfield_lts, #{}})}
].
db_conf(Storage) ->
#{
backend => builtin_local,
%% n_sites => 1,
n_shards => 1,
%% replication_factor => 1,
%% replication_options => #{},
storage => Storage
}.
-record(s, {
data_size = 0,
payload_size = 0,
n_messages = 0,
datapoints = #{},
x_axis = []
}).
run({DB, Config}, RunConf) ->
#{
batches := NBatches,
size := PSMultiplier,
add_generation := AddGeneration,
sleep := Sleep,
dry_run := DryRun
} = RunConf,
{ok, _} = application:ensure_all_started(emqx_ds_backends),
Dir = dir(DB),
Filename = atom_to_list(DB) ++ ".dat",
DryRun orelse
begin
io:format(user, "Running benchmark for ~p in ~p~n", [DB, Dir]),
%% Ensure safe directory:
{match, _} = re:run(Dir, filename:join("data", DB)),
%% Ensure clean state:
ok = emqx_ds:open_db(DB, Config),
ok = emqx_ds:drop_db(DB),
ok = file:del_dir_r(Dir),
%% Open a fresh DB:
ok = emqx_ds:open_db(DB, Config),
S = lists:foldl(
fun(Batch, Acc0) ->
Size = PSMultiplier * Batch,
io:format(user, "Storing batch with payload size ~p~n", [Size]),
Acc1 = store_batch(DB, Size, Acc0),
%% Sleep so all data is hopefully flushed:
timer:sleep(Sleep),
(Batch div AddGeneration) =:= 0 andalso
emqx_ds:add_generation(DB),
collect_datapoint(DB, Acc1)
end,
collect_datapoint(DB, #s{}),
lists:seq(1, NBatches)
),
{ok, FD} = file:open(Filename, [write]),
io:put_chars(FD, print(S)),
file:close(FD)
end,
os:cmd("echo '" ++ gnuplot_script(Filename) ++ "' | gnuplot --persist -"),
ok.
collect_datapoint(
DB, S0 = #s{n_messages = N, data_size = DS, payload_size = PS, datapoints = DP0, x_axis = X}
) ->
NewData = [{"$_n", N}, {"$data", DS}, {"$payloads", PS} | dirsize(DB)],
DP = lists:foldl(
fun({Key, Val}, Acc) ->
maps:update_with(
Key,
fun(M) -> M#{N => Val} end,
#{},
Acc
)
end,
DP0,
NewData
),
S0#s{
datapoints = DP,
x_axis = [N | X]
}.
print(#s{x_axis = XX, datapoints = DP}) ->
Cols = lists:sort(maps:keys(DP)),
Lines = [
%% Print header:
Cols
%% Scan through rows:
| [
%% Scan throgh columns:
[integer_to_binary(maps:get(X, maps:get(Col, DP), 0)) || Col <- Cols]
|| X <- lists:reverse(XX)
]
],
lists:join(
"\n",
[lists:join(" ", Line) || Line <- Lines]
).
dirsize(DB) ->
RawOutput = os:cmd("cd " ++ dir(DB) ++ "; du -b --max-depth 1 ."),
[
begin
[Sz, Dir] = string:lexemes(L, "\t"),
{Dir, list_to_integer(Sz)}
end
|| L <- string:lexemes(RawOutput, "\n")
].
dir(DB) ->
filename:join(emqx_ds_storage_layer:base_dir(), DB).
store_batch(DB, PayloadSize, S0 = #s{n_messages = N, data_size = DS, payload_size = PS}) ->
From = rand:bytes(16),
BatchSize = 50,
Batch = [
#message{
id = emqx_guid:gen(),
timestamp = emqx_message:timestamp_now(),
payload = rand:bytes(PayloadSize),
from = From,
topic = emqx_topic:join([
<<"blah">>,
<<"blah">>,
'',
<<"blah">>,
From,
<<"bazzzzzzzzzzzzzzzzzzzzzzz">>,
integer_to_binary(I)
])
}
|| I <- lists:seq(1, BatchSize)
],
ok = emqx_ds:store_batch(DB, Batch, #{sync => true}),
S0#s{
n_messages = N + length(Batch),
data_size = DS + lists:sum(lists:map(fun msg_size/1, Batch)),
payload_size = PS + length(Batch) * PayloadSize
}.
%% We consider MQTT wire encoding to be "close to the ideal".
msg_size(Msg = #message{}) ->
iolist_size(emqx_frame:serialize(emqx_message:to_packet(undefined, Msg))).