perf: add broker_pool_size, generic_pool_size and channel_cleanup_batch_size config options

Tuning these options can improve performance if cluster interconnect network latency is high. Fixes: EMQX-10661
2023-08-03 17:49:47 +03:00 · 2023-08-03 17:49:47 +03:00 · 466fe7e009
parent ed28c12a66
commit 466fe7e009
9 changed files with 72 additions and 5 deletions
--- a/apps/emqx/src/emqx.app.src
+++ b/apps/emqx/src/emqx.app.src
@ -2,7 +2,7 @@
 {application, emqx, [
    {id, "emqx"},
    {description, "EMQX Core"},
-    {vsn, "5.1.4"},
+    {vsn, "5.1.5"},
    {modules, []},
    {registered, []},
    {applications, [
--- a/apps/emqx/src/emqx_broker_sup.erl
+++ b/apps/emqx/src/emqx_broker_sup.erl
@ -31,7 +31,7 @@ start_link() ->

 init([]) ->
    %% Broker pool
-    PoolSize = emqx_vm:schedulers() * 2,
+    PoolSize = emqx:get_config([node, broker_pool_size], emqx_vm:schedulers() * 2),
    BrokerPool = emqx_pool_sup:spec([
        broker_pool,
        hash,
--- a/apps/emqx/src/emqx_cm.erl
+++ b/apps/emqx/src/emqx_cm.erl
@ -685,7 +685,8 @@ handle_cast(Msg, State) ->

 handle_info({'DOWN', _MRef, process, Pid, _Reason}, State = #{chan_pmon := PMon}) ->
    ?tp(emqx_cm_process_down, #{stale_pid => Pid, reason => _Reason}),
-    ChanPids = [Pid | emqx_utils:drain_down(?BATCH_SIZE)],
+    BatchSize = emqx:get_config([node, channel_cleanup_batch_size], ?BATCH_SIZE),
+    ChanPids = [Pid | emqx_utils:drain_down(BatchSize)],
    {Items, PMon1} = emqx_pmon:erase_all(ChanPids, PMon),
    lists:foreach(fun mark_channel_disconnected/1, ChanPids),
    ok = emqx_pool:async_submit(fun lists:foreach/2, [fun ?MODULE:clean_down/1, Items]),
--- a/apps/emqx/src/emqx_kernel_sup.erl
+++ b/apps/emqx/src/emqx_kernel_sup.erl
@ -31,7 +31,9 @@ init([]) ->
        %% always start emqx_config_handler first to load the emqx.conf to emqx_config
        [
            child_spec(emqx_config_handler, worker),
-            child_spec(emqx_pool_sup, supervisor),
+            child_spec(emqx_pool_sup, supervisor, [
+                emqx:get_config([node, generic_pool_size], emqx_vm:schedulers())
+            ]),
            child_spec(emqx_hooks, worker),
            child_spec(emqx_stats, worker),
            child_spec(emqx_metrics, worker),
--- a/apps/emqx/src/emqx_pool_sup.erl
+++ b/apps/emqx/src/emqx_pool_sup.erl
@ -24,6 +24,7 @@

 -export([
    start_link/0,
+    start_link/1,
    start_link/3,
    start_link/4
 ]).
@ -51,6 +52,9 @@ spec(ChildId, Args) ->
 start_link() ->
    start_link(?POOL, random, {?POOL, start_link, []}).

+start_link(PoolSize) ->
+    start_link(?POOL, random, PoolSize, {?POOL, start_link, []}).
+
 -spec start_link(atom() | tuple(), atom(), mfargs()) ->
    {ok, pid()} | {error, term()}.
 start_link(Pool, Type, MFA) ->
--- a/apps/emqx_conf/src/emqx_conf.app.src
+++ b/apps/emqx_conf/src/emqx_conf.app.src
@ -1,6 +1,6 @@
 {application, emqx_conf, [
    {description, "EMQX configuration management"},
-    {vsn, "0.1.24"},
+    {vsn, "0.1.25"},
    {registered, []},
    {mod, {emqx_conf_app, []}},
    {applications, [kernel, stdlib, emqx_ctl]},
--- a/apps/emqx_conf/src/emqx_conf_schema.erl
+++ b/apps/emqx_conf/src/emqx_conf_schema.erl
@ -672,6 +672,35 @@ fields("node") ->
                    mapping => "emqx_machine.custom_shard_transports",
                    default => #{}
                }
+            )},
+        {"broker_pool_size",
+            sc(
+                pos_integer(),
+                #{
+                    importance => ?IMPORTANCE_HIDDEN,
+                    default => emqx_vm:schedulers() * 2,
+                    'readOnly' => true,
+                    desc => ?DESC(node_broker_pool_size)
+                }
+            )},
+        {"generic_pool_size",
+            sc(
+                pos_integer(),
+                #{
+                    importance => ?IMPORTANCE_HIDDEN,
+                    default => emqx_vm:schedulers(),
+                    'readOnly' => true,
+                    desc => ?DESC(node_generic_pool_size)
+                }
+            )},
+        {"channel_cleanup_batch_size",
+            sc(
+                pos_integer(),
+                #{
+                    importance => ?IMPORTANCE_HIDDEN,
+                    default => 100_000,
+                    desc => ?DESC(node_channel_cleanup_batch_size)
+                }
            )}
    ];
 fields("cluster_call") ->
--- a/changes/ce/feat-11390.en.md
+++ b/changes/ce/feat-11390.en.md
@ -0,0 +1,3 @@
+Add `node.broker_pool_size`, `node.generic_pool_size`, and `node.channel_cleanup_batch_size` options to the EMQX configuration.
+
+Tuning these options can significantly improve performance if cluster interconnect network latency is high.
--- a/rel/i18n/emqx_conf_schema.hocon
+++ b/rel/i18n/emqx_conf_schema.hocon
@ -776,4 +776,32 @@ the default is to use the value set in <code>db.default_shard_transport</code>."
 db_shard_transports.label:
 """Shard Transports"""

+node_broker_pool_size.desc:
+"""The number of workers in the emqx_broker pool. Increasing this value may improve performance
+by enhancing parallelism, especially when EMQX cluster interconnect network latency is high.
+Defaults to the number of Erlang schedulers (CPU cores) * 2.
+"""
+
+node_broker_pool_size.label:
+"""Node Broker Pool Size"""
+
+node_generic_pool_size.desc:
+"""The number of workers in emqx_pool. Increasing this value may improve performance
+by enhancing parallelism, especially when EMQX cluster interconnect network latency is high.
+Defaults to the number of Erlang schedulers (CPU cores).
+"""
+
+node_generic_pool_size.label:
+"""Node Generic Pool Size"""
+
+node_channel_cleanup_batch_size.desc:
+"""The size of the channel cleanup batch. If EMQX cluster interconnect network latency is high,
+reducing this value together with increasing node.generic_pool_size may improve performance
+during an abrupt disconnect of a large number of clients.
+Defaults to 100000.
+"""
+
+node_channel_cleanup_batch_size.label:
+"""Node Channel Cleanup Batch Size"""
+
 }