From 8feda315f6b61dc5c9567b4b1f161e2b977bcc64 Mon Sep 17 00:00:00 2001
From: Andrew Mayorov <encube.ul@gmail.com>
Date: Tue, 18 Jul 2023 23:11:04 +0200
Subject: [PATCH 01/12] feat(index): add topic index facility

Somewhat similar to `emqx_trie` in design and logic, yet built on
top of a single, potentially pre-existing table.
---
 apps/emqx/src/emqx_topic_index.erl        | 156 ++++++++++++++++++++++
 apps/emqx/test/emqx_topic_index_SUITE.erl | 143 ++++++++++++++++++++
 2 files changed, 299 insertions(+)
 create mode 100644 apps/emqx/src/emqx_topic_index.erl
 create mode 100644 apps/emqx/test/emqx_topic_index_SUITE.erl

diff --git a/apps/emqx/src/emqx_topic_index.erl b/apps/emqx/src/emqx_topic_index.erl
new file mode 100644
index 000000000..9f0b5fba1
--- /dev/null
+++ b/apps/emqx/src/emqx_topic_index.erl
@@ -0,0 +1,156 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+%% @doc Topic index for matching topics to topic filters.
+%%
+%% Works on top of ETS ordered_set table. Keys are parsed topic filters
+%% with record ID appended to the end, wrapped in a tuple to disambiguate from
+%% topic filter words. Existing table may be used if existing keys will not
+%% collide with index keys.
+%%
+%% Designed to effectively answer questions like:
+%% 1. Does any topic filter match given topic?
+%% 2. Which records are associated with topic filters matching given topic?
+%%
+%% Questions like these are _only slightly_ less effective:
+%% 1. Which topic filters match given topic?
+%% 2. Which record IDs are associated with topic filters matching given topic?
+
+-module(emqx_topic_index).
+
+-export([new/0]).
+-export([insert/4]).
+-export([delete/3]).
+-export([match/2]).
+-export([matches/2]).
+
+-export([get_id/1]).
+-export([get_topic/1]).
+-export([get_record/2]).
+
+-type key(ID) :: [binary() | '+' | '#' | {ID}].
+-type match(ID) :: key(ID).
+
+new() ->
+    ets:new(?MODULE, [public, ordered_set, {write_concurrency, true}]).
+
+insert(Filter, ID, Record, Tab) ->
+    ets:insert(Tab, {emqx_topic:words(Filter) ++ [{ID}], Record}).
+
+delete(Filter, ID, Tab) ->
+    ets:delete(Tab, emqx_topic:words(Filter) ++ [{ID}]).
+
+-spec match(emqx_types:topic(), ets:table()) -> match(_ID) | false.
+match(Topic, Tab) ->
+    {Words, RPrefix} = match_init(Topic),
+    match(Words, RPrefix, Tab).
+
+match(Words, RPrefix, Tab) ->
+    Prefix = lists:reverse(RPrefix),
+    K = ets:next(Tab, Prefix),
+    case match_filter(Prefix, K, Words =/= []) of
+        true ->
+            K;
+        stop ->
+            false;
+        Matched ->
+            match_rest(Matched, Words, RPrefix, Tab)
+    end.
+
+match_rest(false, [W | Rest], RPrefix, Tab) ->
+    match(Rest, [W | RPrefix], Tab);
+match_rest(plus, [W | Rest], RPrefix, Tab) ->
+    case match(Rest, ['+' | RPrefix], Tab) of
+        Match when is_list(Match) ->
+            Match;
+        false ->
+            match(Rest, [W | RPrefix], Tab)
+    end;
+match_rest(_, [], _RPrefix, _Tab) ->
+    false.
+
+-spec matches(emqx_types:topic(), ets:table()) -> [match(_ID)].
+matches(Topic, Tab) ->
+    {Words, RPrefix} = match_init(Topic),
+    matches(Words, RPrefix, Tab).
+
+matches(Words, RPrefix, Tab) ->
+    Prefix = lists:reverse(RPrefix),
+    matches(ets:next(Tab, Prefix), Prefix, Words, RPrefix, Tab).
+
+matches(K, Prefix, Words, RPrefix, Tab) ->
+    case match_filter(Prefix, K, Words =/= []) of
+        true ->
+            [K | matches(ets:next(Tab, K), Prefix, Words, RPrefix, Tab)];
+        stop ->
+            [];
+        Matched ->
+            matches_rest(Matched, Words, RPrefix, Tab)
+    end.
+
+matches_rest(false, [W | Rest], RPrefix, Tab) ->
+    matches(Rest, [W | RPrefix], Tab);
+matches_rest(plus, [W | Rest], RPrefix, Tab) ->
+    matches(Rest, ['+' | RPrefix], Tab) ++ matches(Rest, [W | RPrefix], Tab);
+matches_rest(_, [], _RPrefix, _Tab) ->
+    [].
+
+match_filter([], [{_ID}], _IsPrefix = false) ->
+    % NOTE: exact match is `true` only if we match whole topic, not prefix
+    true;
+match_filter([], ['#', {_ID}], _IsPrefix) ->
+    % NOTE: naturally, '#' < '+', so this is already optimal for `match/2`
+    true;
+match_filter([], ['+' | _], _) ->
+    plus;
+match_filter([], [_H | _], _) ->
+    false;
+match_filter([H | T1], [H | T2], IsPrefix) ->
+    match_filter(T1, T2, IsPrefix);
+match_filter([H1 | _], [H2 | _], _) when H2 > H1 ->
+    % NOTE: we're strictly past the prefix, no need to continue
+    stop;
+match_filter(_, '$end_of_table', _) ->
+    stop.
+
+match_init(Topic) ->
+    case emqx_topic:words(Topic) of
+        [W = <<"$", _/bytes>> | Rest] ->
+            % NOTE
+            % This will effectively skip attempts to match special topics to `#` or `+/...`.
+            {Rest, [W]};
+        Words ->
+            {Words, []}
+    end.
+
+-spec get_id(match(ID)) -> ID.
+get_id([{ID}]) ->
+    ID;
+get_id([_ | Rest]) ->
+    get_id(Rest).
+
+-spec get_topic(match(_ID)) -> emqx_types:topic().
+get_topic(K) ->
+    emqx_topic:join(cut_topic(K)).
+
+cut_topic([{_ID}]) ->
+    [];
+cut_topic([W | Rest]) ->
+    [W | cut_topic(Rest)].
+
+-spec get_record(match(_ID), ets:table()) -> _Record.
+get_record(K, Tab) ->
+    ets:lookup_element(Tab, K, 2).
diff --git a/apps/emqx/test/emqx_topic_index_SUITE.erl b/apps/emqx/test/emqx_topic_index_SUITE.erl
new file mode 100644
index 000000000..98bfe48a1
--- /dev/null
+++ b/apps/emqx/test/emqx_topic_index_SUITE.erl
@@ -0,0 +1,143 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_topic_index_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("eunit/include/eunit.hrl").
+
+all() ->
+    emqx_common_test_helpers:all(?MODULE).
+
+t_insert(_) ->
+    Tab = emqx_topic_index:new(),
+    true = emqx_topic_index:insert(<<"sensor/1/metric/2">>, t_insert_1, <<>>, Tab),
+    true = emqx_topic_index:insert(<<"sensor/+/#">>, t_insert_2, <<>>, Tab),
+    true = emqx_topic_index:insert(<<"sensor/#">>, t_insert_3, <<>>, Tab),
+    ?assertEqual(<<"sensor/#">>, topic(match(<<"sensor">>, Tab))),
+    ?assertEqual(t_insert_3, id(match(<<"sensor">>, Tab))).
+
+t_match(_) ->
+    Tab = emqx_topic_index:new(),
+    true = emqx_topic_index:insert(<<"sensor/1/metric/2">>, t_match_1, <<>>, Tab),
+    true = emqx_topic_index:insert(<<"sensor/+/#">>, t_match_2, <<>>, Tab),
+    true = emqx_topic_index:insert(<<"sensor/#">>, t_match_3, <<>>, Tab),
+    ?assertMatch(
+        [<<"sensor/#">>, <<"sensor/+/#">>],
+        [topic(M) || M <- matches(<<"sensor/1">>, Tab)]
+    ).
+
+t_match2(_) ->
+    Tab = emqx_topic_index:new(),
+    true = emqx_topic_index:insert(<<"#">>, t_match2_1, <<>>, Tab),
+    true = emqx_topic_index:insert(<<"+/#">>, t_match2_2, <<>>, Tab),
+    true = emqx_topic_index:insert(<<"+/+/#">>, t_match2_3, <<>>, Tab),
+    ?assertEqual(
+        [<<"#">>, <<"+/#">>, <<"+/+/#">>],
+        [topic(M) || M <- matches(<<"a/b/c">>, Tab)]
+    ),
+    ?assertEqual(
+        false,
+        emqx_topic_index:match(<<"$SYS/broker/zenmq">>, Tab)
+    ).
+
+t_match3(_) ->
+    Tab = emqx_topic_index:new(),
+    Records = [
+        {<<"d/#">>, t_match3_1},
+        {<<"a/b/+">>, t_match3_2},
+        {<<"a/#">>, t_match3_3},
+        {<<"#">>, t_match3_4},
+        {<<"$SYS/#">>, t_match3_sys}
+    ],
+    lists:foreach(
+        fun({Topic, ID}) -> emqx_topic_index:insert(Topic, ID, <<>>, Tab) end,
+        Records
+    ),
+    Matched = matches(<<"a/b/c">>, Tab),
+    case length(Matched) of
+        3 -> ok;
+        _ -> error({unexpected, Matched})
+    end,
+    ?assertEqual(
+        t_match3_sys,
+        id(match(<<"$SYS/a/b/c">>, Tab))
+    ).
+
+t_match4(_) ->
+    Tab = emqx_topic_index:new(),
+    Records = [{<<"/#">>, t_match4_1}, {<<"/+">>, t_match4_2}, {<<"/+/a/b/c">>, t_match4_3}],
+    lists:foreach(
+        fun({Topic, ID}) -> emqx_topic_index:insert(Topic, ID, <<>>, Tab) end,
+        Records
+    ),
+    ?assertEqual(
+        [<<"/#">>, <<"/+">>],
+        [topic(M) || M <- matches(<<"/">>, Tab)]
+    ),
+    ?assertEqual(
+        [<<"/#">>, <<"/+/a/b/c">>],
+        [topic(M) || M <- matches(<<"/0/a/b/c">>, Tab)]
+    ).
+
+t_match5(_) ->
+    Tab = emqx_topic_index:new(),
+    T = <<"a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z">>,
+    Records = [
+        {<<"#">>, t_match5_1},
+        {<<T/binary, "/#">>, t_match5_2},
+        {<<T/binary, "/+">>, t_match5_3}
+    ],
+    lists:foreach(
+        fun({Topic, ID}) -> emqx_topic_index:insert(Topic, ID, <<>>, Tab) end,
+        Records
+    ),
+    ?assertEqual(
+        [<<"#">>, <<T/binary, "/#">>],
+        [topic(M) || M <- matches(T, Tab)]
+    ),
+    ?assertEqual(
+        [<<"#">>, <<T/binary, "/#">>, <<T/binary, "/+">>],
+        [topic(M) || M <- matches(<<T/binary, "/1">>, Tab)]
+    ).
+
+t_match6(_) ->
+    Tab = emqx_topic_index:new(),
+    T = <<"a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z">>,
+    W = <<"+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/#">>,
+    emqx_topic_index:insert(W, ID = t_match6, <<>>, Tab),
+    ?assertEqual(ID, id(match(T, Tab))).
+
+t_match7(_) ->
+    Tab = emqx_topic_index:new(),
+    T = <<"a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z">>,
+    W = <<"a/+/c/+/e/+/g/+/i/+/k/+/m/+/o/+/q/+/s/+/u/+/w/+/y/+/#">>,
+    emqx_topic_index:insert(W, t_match7, <<>>, Tab),
+    ?assertEqual(W, topic(match(T, Tab))).
+
+match(T, Tab) ->
+    emqx_topic_index:match(T, Tab).
+
+matches(T, Tab) ->
+    lists:sort(emqx_topic_index:matches(T, Tab)).
+
+id(Match) ->
+    emqx_topic_index:get_id(Match).
+
+topic(Match) ->
+    emqx_topic_index:get_topic(Match).

From 28bcb394d150d066e30bb78d7be3ecdf375b6681 Mon Sep 17 00:00:00 2001
From: Andrew Mayorov <encube.ul@gmail.com>
Date: Fri, 21 Jul 2023 20:06:46 +0200
Subject: [PATCH 02/12] fix(topicidx): allow to return matches unique by record
 id

---
 apps/emqx/src/emqx_topic_index.erl        | 132 ++++++++++++----------
 apps/emqx/test/emqx_topic_index_SUITE.erl |  16 ++-
 2 files changed, 88 insertions(+), 60 deletions(-)

diff --git a/apps/emqx/src/emqx_topic_index.erl b/apps/emqx/src/emqx_topic_index.erl
index 9f0b5fba1..ac4901a09 100644
--- a/apps/emqx/src/emqx_topic_index.erl
+++ b/apps/emqx/src/emqx_topic_index.erl
@@ -16,18 +16,16 @@
 
 %% @doc Topic index for matching topics to topic filters.
 %%
-%% Works on top of ETS ordered_set table. Keys are parsed topic filters
-%% with record ID appended to the end, wrapped in a tuple to disambiguate from
-%% topic filter words. Existing table may be used if existing keys will not
-%% collide with index keys.
+%% Works on top of ETS ordered_set table. Keys are tuples constructed from
+%% parsed topic filters and record IDs, wrapped in a tuple to order them
+%% strictly greater than unit tuple (`{}`). Existing table may be used if
+%% existing keys will not collide with index keys.
 %%
 %% Designed to effectively answer questions like:
 %% 1. Does any topic filter match given topic?
 %% 2. Which records are associated with topic filters matching given topic?
-%%
-%% Questions like these are _only slightly_ less effective:
-%% 1. Which topic filters match given topic?
-%% 2. Which record IDs are associated with topic filters matching given topic?
+%% 3. Which topic filters match given topic?
+%% 4. Which record IDs are associated with topic filters matching given topic?
 
 -module(emqx_topic_index).
 
@@ -35,23 +33,23 @@
 -export([insert/4]).
 -export([delete/3]).
 -export([match/2]).
--export([matches/2]).
+-export([matches/3]).
 
 -export([get_id/1]).
 -export([get_topic/1]).
 -export([get_record/2]).
 
--type key(ID) :: [binary() | '+' | '#' | {ID}].
+-type key(ID) :: {[binary() | '+' | '#'], {ID}}.
 -type match(ID) :: key(ID).
 
 new() ->
     ets:new(?MODULE, [public, ordered_set, {write_concurrency, true}]).
 
 insert(Filter, ID, Record, Tab) ->
-    ets:insert(Tab, {emqx_topic:words(Filter) ++ [{ID}], Record}).
+    ets:insert(Tab, {{emqx_topic:words(Filter), {ID}}, Record}).
 
 delete(Filter, ID, Tab) ->
-    ets:delete(Tab, emqx_topic:words(Filter) ++ [{ID}]).
+    ets:delete(Tab, {emqx_topic:words(Filter), {ID}}).
 
 -spec match(emqx_types:topic(), ets:table()) -> match(_ID) | false.
 match(Topic, Tab) ->
@@ -60,8 +58,8 @@ match(Topic, Tab) ->
 
 match(Words, RPrefix, Tab) ->
     Prefix = lists:reverse(RPrefix),
-    K = ets:next(Tab, Prefix),
-    case match_filter(Prefix, K, Words =/= []) of
+    K = ets:next(Tab, {Prefix, {}}),
+    case match_filter(Prefix, K, Words == []) of
         true ->
             K;
         stop ->
@@ -74,7 +72,7 @@ match_rest(false, [W | Rest], RPrefix, Tab) ->
     match(Rest, [W | RPrefix], Tab);
 match_rest(plus, [W | Rest], RPrefix, Tab) ->
     case match(Rest, ['+' | RPrefix], Tab) of
-        Match when is_list(Match) ->
+        Match = {_, _} ->
             Match;
         false ->
             match(Rest, [W | RPrefix], Tab)
@@ -82,48 +80,71 @@ match_rest(plus, [W | Rest], RPrefix, Tab) ->
 match_rest(_, [], _RPrefix, _Tab) ->
     false.
 
--spec matches(emqx_types:topic(), ets:table()) -> [match(_ID)].
-matches(Topic, Tab) ->
+-spec matches(emqx_types:topic(), ets:table(), _Opts :: [unique]) -> [match(_ID)].
+matches(Topic, Tab, Opts) ->
     {Words, RPrefix} = match_init(Topic),
-    matches(Words, RPrefix, Tab).
-
-matches(Words, RPrefix, Tab) ->
-    Prefix = lists:reverse(RPrefix),
-    matches(ets:next(Tab, Prefix), Prefix, Words, RPrefix, Tab).
-
-matches(K, Prefix, Words, RPrefix, Tab) ->
-    case match_filter(Prefix, K, Words =/= []) of
-        true ->
-            [K | matches(ets:next(Tab, K), Prefix, Words, RPrefix, Tab)];
-        stop ->
-            [];
-        Matched ->
-            matches_rest(Matched, Words, RPrefix, Tab)
+    AccIn =
+        case Opts of
+            [unique | _] -> #{};
+            [] -> []
+        end,
+    Matches = matches(Words, RPrefix, AccIn, Tab),
+    case Matches of
+        #{} -> maps:values(Matches);
+        _ -> Matches
     end.
 
-matches_rest(false, [W | Rest], RPrefix, Tab) ->
-    matches(Rest, [W | RPrefix], Tab);
-matches_rest(plus, [W | Rest], RPrefix, Tab) ->
-    matches(Rest, ['+' | RPrefix], Tab) ++ matches(Rest, [W | RPrefix], Tab);
-matches_rest(_, [], _RPrefix, _Tab) ->
-    [].
+matches(Words, RPrefix, Acc, Tab) ->
+    Prefix = lists:reverse(RPrefix),
+    matches(ets:next(Tab, {Prefix, {}}), Prefix, Words, RPrefix, Acc, Tab).
 
-match_filter([], [{_ID}], _IsPrefix = false) ->
-    % NOTE: exact match is `true` only if we match whole topic, not prefix
-    true;
-match_filter([], ['#', {_ID}], _IsPrefix) ->
+matches(K, Prefix, Words, RPrefix, Acc, Tab) ->
+    case match_filter(Prefix, K, Words == []) of
+        true ->
+            matches(ets:next(Tab, K), Prefix, Words, RPrefix, match_add(K, Acc), Tab);
+        stop ->
+            Acc;
+        Matched ->
+            matches_rest(Matched, Words, RPrefix, Acc, Tab)
+    end.
+
+matches_rest(false, [W | Rest], RPrefix, Acc, Tab) ->
+    matches(Rest, [W | RPrefix], Acc, Tab);
+matches_rest(plus, [W | Rest], RPrefix, Acc, Tab) ->
+    NAcc = matches(Rest, ['+' | RPrefix], Acc, Tab),
+    matches(Rest, [W | RPrefix], NAcc, Tab);
+matches_rest(_, [], _RPrefix, Acc, _Tab) ->
+    Acc.
+
+match_add(K = {_Filter, ID}, Acc = #{}) ->
+    Acc#{ID => K};
+match_add(K, Acc) ->
+    [K | Acc].
+
+match_filter(Prefix, {Filter, _ID}, NotPrefix) ->
+    case match_filter(Prefix, Filter) of
+        exact ->
+            % NOTE: exact match is `true` only if we match whole topic, not prefix
+            NotPrefix;
+        Match ->
+            Match
+    end;
+match_filter(_, '$end_of_table', _) ->
+    stop.
+
+match_filter([], []) ->
+    exact;
+match_filter([], ['#']) ->
     % NOTE: naturally, '#' < '+', so this is already optimal for `match/2`
     true;
-match_filter([], ['+' | _], _) ->
+match_filter([], ['+' | _]) ->
     plus;
-match_filter([], [_H | _], _) ->
+match_filter([], [_H | _]) ->
     false;
-match_filter([H | T1], [H | T2], IsPrefix) ->
-    match_filter(T1, T2, IsPrefix);
-match_filter([H1 | _], [H2 | _], _) when H2 > H1 ->
+match_filter([H | T1], [H | T2]) ->
+    match_filter(T1, T2);
+match_filter([H1 | _], [H2 | _]) when H2 > H1 ->
     % NOTE: we're strictly past the prefix, no need to continue
-    stop;
-match_filter(_, '$end_of_table', _) ->
     stop.
 
 match_init(Topic) ->
@@ -137,19 +158,12 @@ match_init(Topic) ->
     end.
 
 -spec get_id(match(ID)) -> ID.
-get_id([{ID}]) ->
-    ID;
-get_id([_ | Rest]) ->
-    get_id(Rest).
+get_id({_Filter, {ID}}) ->
+    ID.
 
 -spec get_topic(match(_ID)) -> emqx_types:topic().
-get_topic(K) ->
-    emqx_topic:join(cut_topic(K)).
-
-cut_topic([{_ID}]) ->
-    [];
-cut_topic([W | Rest]) ->
-    [W | cut_topic(Rest)].
+get_topic({Filter, _ID}) ->
+    emqx_topic:join(Filter).
 
 -spec get_record(match(_ID), ets:table()) -> _Record.
 get_record(K, Tab) ->
diff --git a/apps/emqx/test/emqx_topic_index_SUITE.erl b/apps/emqx/test/emqx_topic_index_SUITE.erl
index 98bfe48a1..b5faba4d9 100644
--- a/apps/emqx/test/emqx_topic_index_SUITE.erl
+++ b/apps/emqx/test/emqx_topic_index_SUITE.erl
@@ -130,11 +130,25 @@ t_match7(_) ->
     emqx_topic_index:insert(W, t_match7, <<>>, Tab),
     ?assertEqual(W, topic(match(T, Tab))).
 
+t_match_unique(_) ->
+    Tab = emqx_topic_index:new(),
+    emqx_topic_index:insert(<<"a/b/c">>, t_match_id1, <<>>, Tab),
+    emqx_topic_index:insert(<<"a/b/+">>, t_match_id1, <<>>, Tab),
+    emqx_topic_index:insert(<<"a/b/c/+">>, t_match_id2, <<>>, Tab),
+    ?assertEqual(
+        [t_match_id1, t_match_id1],
+        [id(M) || M <- emqx_topic_index:matches(<<"a/b/c">>, Tab, [])]
+    ),
+    ?assertEqual(
+        [t_match_id1],
+        [id(M) || M <- emqx_topic_index:matches(<<"a/b/c">>, Tab, [unique])]
+    ).
+
 match(T, Tab) ->
     emqx_topic_index:match(T, Tab).
 
 matches(T, Tab) ->
-    lists:sort(emqx_topic_index:matches(T, Tab)).
+    lists:sort(emqx_topic_index:matches(T, Tab, [])).
 
 id(Match) ->
     emqx_topic_index:get_id(Match).

From 6a1340636369100a29188f9a63a0cb49a5b0b3dc Mon Sep 17 00:00:00 2001
From: Andrew Mayorov <encube.ul@gmail.com>
Date: Fri, 4 Aug 2023 16:18:07 +0400
Subject: [PATCH 03/12] fix(topicidx): use custom topic words to keep required
 ordering

Otherwise, topic with empty tokens (e.g. `a/b///c`) would have
some of their tokens ordered before `#` / `+`, because empty
token was represented as empty atom (`''`).
---
 apps/emqx/src/emqx_topic_index.erl | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/apps/emqx/src/emqx_topic_index.erl b/apps/emqx/src/emqx_topic_index.erl
index ac4901a09..48c685f04 100644
--- a/apps/emqx/src/emqx_topic_index.erl
+++ b/apps/emqx/src/emqx_topic_index.erl
@@ -39,17 +39,18 @@
 -export([get_topic/1]).
 -export([get_record/2]).
 
--type key(ID) :: {[binary() | '+' | '#'], {ID}}.
+-type word() :: binary() | '+' | '#'.
+-type key(ID) :: {[word()], {ID}}.
 -type match(ID) :: key(ID).
 
 new() ->
-    ets:new(?MODULE, [public, ordered_set, {write_concurrency, true}]).
+    ets:new(?MODULE, [public, ordered_set, {read_concurrency, true}]).
 
 insert(Filter, ID, Record, Tab) ->
-    ets:insert(Tab, {{emqx_topic:words(Filter), {ID}}, Record}).
+    ets:insert(Tab, {{words(Filter), {ID}}, Record}).
 
 delete(Filter, ID, Tab) ->
-    ets:delete(Tab, {emqx_topic:words(Filter), {ID}}).
+    ets:delete(Tab, {words(Filter), {ID}}).
 
 -spec match(emqx_types:topic(), ets:table()) -> match(_ID) | false.
 match(Topic, Tab) ->
@@ -148,7 +149,7 @@ match_filter([H1 | _], [H2 | _]) when H2 > H1 ->
     stop.
 
 match_init(Topic) ->
-    case emqx_topic:words(Topic) of
+    case words(Topic) of
         [W = <<"$", _/bytes>> | Rest] ->
             % NOTE
             % This will effectively skip attempts to match special topics to `#` or `+/...`.
@@ -168,3 +169,14 @@ get_topic({Filter, _ID}) ->
 -spec get_record(match(_ID), ets:table()) -> _Record.
 get_record(K, Tab) ->
     ets:lookup_element(Tab, K, 2).
+
+%%
+
+-spec words(emqx_types:topic()) -> [word()].
+words(Topic) when is_binary(Topic) ->
+    [word(W) || W <- emqx_topic:tokens(Topic)].
+
+-spec word(binary()) -> word().
+word(<<"+">>) -> '+';
+word(<<"#">>) -> '#';
+word(Bin) -> Bin.

From 48a50c9137a89fbe6fd55a94dec8987ff5822b97 Mon Sep 17 00:00:00 2001
From: Andrew Mayorov <encube.ul@gmail.com>
Date: Fri, 4 Aug 2023 16:24:13 +0400
Subject: [PATCH 04/12] fix(topicidx): fix missing matches when 'a/b' and
 'a/b/#' both exist

Thanks to @HJianBo for spotting this issue. The approach to fix it
is different though: we try to keep the "recurrency branch factor"
to a minimum, instead introducing new condition for the case when
filter does not match, but iteration with `ets:next/2` is not yet
finished for the prefix.

Co-Authored-By: JianBo He <heeejianbo@gmail.com>
---
 apps/emqx/src/emqx_topic_index.erl        | 43 ++++++++++++-----------
 apps/emqx/test/emqx_topic_index_SUITE.erl | 18 ++++++++++
 2 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/apps/emqx/src/emqx_topic_index.erl b/apps/emqx/src/emqx_topic_index.erl
index 48c685f04..29d8694af 100644
--- a/apps/emqx/src/emqx_topic_index.erl
+++ b/apps/emqx/src/emqx_topic_index.erl
@@ -59,10 +59,14 @@ match(Topic, Tab) ->
 
 match(Words, RPrefix, Tab) ->
     Prefix = lists:reverse(RPrefix),
-    K = ets:next(Tab, {Prefix, {}}),
-    case match_filter(Prefix, K, Words == []) of
+    match(ets:next(Tab, {Prefix, {}}), Prefix, Words, RPrefix, Tab).
+
+match(K, Prefix, Words, RPrefix, Tab) ->
+    case match_next(Prefix, K, Words) of
         true ->
             K;
+        skip ->
+            match(ets:next(Tab, K), Prefix, Words, RPrefix, Tab);
         stop ->
             false;
         Matched ->
@@ -100,9 +104,11 @@ matches(Words, RPrefix, Acc, Tab) ->
     matches(ets:next(Tab, {Prefix, {}}), Prefix, Words, RPrefix, Acc, Tab).
 
 matches(K, Prefix, Words, RPrefix, Acc, Tab) ->
-    case match_filter(Prefix, K, Words == []) of
+    case match_next(Prefix, K, Words) of
         true ->
             matches(ets:next(Tab, K), Prefix, Words, RPrefix, match_add(K, Acc), Tab);
+        skip ->
+            matches(ets:next(Tab, K), Prefix, Words, RPrefix, Acc, Tab);
         stop ->
             Acc;
         Matched ->
@@ -122,29 +128,26 @@ match_add(K = {_Filter, ID}, Acc = #{}) ->
 match_add(K, Acc) ->
     [K | Acc].
 
-match_filter(Prefix, {Filter, _ID}, NotPrefix) ->
-    case match_filter(Prefix, Filter) of
-        exact ->
-            % NOTE: exact match is `true` only if we match whole topic, not prefix
-            NotPrefix;
-        Match ->
-            Match
-    end;
-match_filter(_, '$end_of_table', _) ->
+match_next(Prefix, {Filter, _ID}, Suffix) ->
+    match_filter(Prefix, Filter, Suffix);
+match_next(_, '$end_of_table', _) ->
     stop.
 
-match_filter([], []) ->
-    exact;
-match_filter([], ['#']) ->
+match_filter([], [], []) ->
+    true;
+match_filter([], [], _Suffix) ->
+    % NOTE: we matched the prefix, but there may be more matches next
+    skip;
+match_filter([], ['#'], _Suffix) ->
     % NOTE: naturally, '#' < '+', so this is already optimal for `match/2`
     true;
-match_filter([], ['+' | _]) ->
+match_filter([], ['+' | _], _Suffix) ->
     plus;
-match_filter([], [_H | _]) ->
+match_filter([], [_H | _], _Suffix) ->
     false;
-match_filter([H | T1], [H | T2]) ->
-    match_filter(T1, T2);
-match_filter([H1 | _], [H2 | _]) when H2 > H1 ->
+match_filter([H | T1], [H | T2], Suffix) ->
+    match_filter(T1, T2, Suffix);
+match_filter([H1 | _], [H2 | _], _Suffix) when H2 > H1 ->
     % NOTE: we're strictly past the prefix, no need to continue
     stop.
 
diff --git a/apps/emqx/test/emqx_topic_index_SUITE.erl b/apps/emqx/test/emqx_topic_index_SUITE.erl
index b5faba4d9..75e1adaf6 100644
--- a/apps/emqx/test/emqx_topic_index_SUITE.erl
+++ b/apps/emqx/test/emqx_topic_index_SUITE.erl
@@ -144,6 +144,24 @@ t_match_unique(_) ->
         [id(M) || M <- emqx_topic_index:matches(<<"a/b/c">>, Tab, [unique])]
     ).
 
+t_match_wildcards(_) ->
+    Tab = emqx_topic_index:new(),
+    emqx_topic_index:insert(<<"a/b">>, id1, <<>>, Tab),
+    emqx_topic_index:insert(<<"a/b/#">>, id2, <<>>, Tab),
+    emqx_topic_index:insert(<<"a/b/#">>, id3, <<>>, Tab),
+    emqx_topic_index:insert(<<"a/b/c">>, id4, <<>>, Tab),
+    emqx_topic_index:insert(<<"a/b/+">>, id5, <<>>, Tab),
+    emqx_topic_index:insert(<<"a/b/d">>, id6, <<>>, Tab),
+    emqx_topic_index:insert(<<"a/+/+">>, id7, <<>>, Tab),
+    emqx_topic_index:insert(<<"a/+/#">>, id8, <<>>, Tab),
+
+    Records = [id(M) || M <- matches(<<"a/b/c">>, Tab)],
+    ?assertEqual([id2, id3, id4, id5, id7, id8], lists:sort(Records)),
+
+    Records1 = [id(M) || M <- matches(<<"a/b">>, Tab)],
+    ?assertEqual([id1, id2, id3, id8], lists:sort(Records1)),
+    ok.
+
 match(T, Tab) ->
     emqx_topic_index:match(T, Tab).
 

From 0c7bdbdab45ec2f9ecb5dc3f4f390afc5eae60f4 Mon Sep 17 00:00:00 2001
From: Andrew Mayorov <encube.ul@gmail.com>
Date: Fri, 4 Aug 2023 18:49:07 +0400
Subject: [PATCH 05/12] test(topicidx): add property test

Co-Authored-By: JianBo He <heeejianbo@gmail.com>
---
 apps/emqx/test/emqx_topic_index_SUITE.erl | 174 ++++++++++++++++++++--
 1 file changed, 159 insertions(+), 15 deletions(-)

diff --git a/apps/emqx/test/emqx_topic_index_SUITE.erl b/apps/emqx/test/emqx_topic_index_SUITE.erl
index 75e1adaf6..f2b263a5a 100644
--- a/apps/emqx/test/emqx_topic_index_SUITE.erl
+++ b/apps/emqx/test/emqx_topic_index_SUITE.erl
@@ -19,8 +19,11 @@
 -compile(export_all).
 -compile(nowarn_export_all).
 
+-include_lib("proper/include/proper.hrl").
 -include_lib("eunit/include/eunit.hrl").
 
+-import(emqx_proper_types, [scaled/2]).
+
 all() ->
     emqx_common_test_helpers:all(?MODULE).
 
@@ -144,23 +147,122 @@ t_match_unique(_) ->
         [id(M) || M <- emqx_topic_index:matches(<<"a/b/c">>, Tab, [unique])]
     ).
 
-t_match_wildcards(_) ->
-    Tab = emqx_topic_index:new(),
-    emqx_topic_index:insert(<<"a/b">>, id1, <<>>, Tab),
-    emqx_topic_index:insert(<<"a/b/#">>, id2, <<>>, Tab),
-    emqx_topic_index:insert(<<"a/b/#">>, id3, <<>>, Tab),
-    emqx_topic_index:insert(<<"a/b/c">>, id4, <<>>, Tab),
-    emqx_topic_index:insert(<<"a/b/+">>, id5, <<>>, Tab),
-    emqx_topic_index:insert(<<"a/b/d">>, id6, <<>>, Tab),
-    emqx_topic_index:insert(<<"a/+/+">>, id7, <<>>, Tab),
-    emqx_topic_index:insert(<<"a/+/#">>, id8, <<>>, Tab),
+t_match_wildcard_edge_cases(_) ->
+    CommonTopics = [
+        <<"a/b">>,
+        <<"a/b/#">>,
+        <<"a/b/#">>,
+        <<"a/b/c">>,
+        <<"a/b/+">>,
+        <<"a/b/d">>,
+        <<"a/+/+">>,
+        <<"a/+/#">>
+    ],
+    Datasets =
+        [
+            %% Topics, TopicName, Results
+            {CommonTopics, <<"a/b/c">>, [2, 3, 4, 5, 7, 8]},
+            {CommonTopics, <<"a/b">>, [1, 2, 3, 8]},
+            {[<<"+/b/c">>, <<"/">>], <<"a/b/c">>, [1]},
+            {[<<"#">>, <<"/">>], <<"a">>, [1]},
+            {[<<"/">>, <<"+">>], <<"a">>, [2]}
+        ],
+    F = fun({Topics, TopicName, Expected}) ->
+        Tab = emqx_topic_index:new(),
+        _ = [emqx_topic_index:insert(T, N, <<>>, Tab) || {N, T} <- lists:enumerate(Topics)],
+        Results = [id(M) || M <- emqx_topic_index:matches(TopicName, Tab, [unique])],
+        ?assertEqual(
+            Expected,
+            Results,
+            #{
+                "Base topics" => Topics,
+                "Topic name" => TopicName
+            }
+        )
+    end,
+    lists:foreach(F, Datasets).
 
-    Records = [id(M) || M <- matches(<<"a/b/c">>, Tab)],
-    ?assertEqual([id2, id3, id4, id5, id7, id8], lists:sort(Records)),
+t_prop_matches(_) ->
+    ?assert(
+        proper:quickcheck(
+            topic_matches_prop(),
+            [{max_size, 100}, {numtests, 100}]
+        )
+    ),
+    Statistics = [{C, account(C)} || C <- [filters, topics, matches, maxhits]],
+    ct:pal("Statistics: ~p", [maps:from_list(Statistics)]).
 
-    Records1 = [id(M) || M <- matches(<<"a/b">>, Tab)],
-    ?assertEqual([id1, id2, id3, id8], lists:sort(Records1)),
-    ok.
+topic_matches_prop() ->
+    ?FORALL(
+        % Generate a longer list of topics and a shorter list of topic filter patterns.
+        #{
+            topics := TTopics,
+            patterns := Pats
+        },
+        emqx_proper_types:fixedmap(#{
+            % NOTE
+            % Beware adding non-empty contraint, proper will have a hard time with `topic_t/1`
+            % for some reason.
+            topics => scaled(4, list(topic_t([1, 2, 3, 4]))),
+            patterns => list(topic_filter_pattern_t())
+        }),
+        begin
+            Tab = emqx_topic_index:new(),
+            Topics = [emqx_topic:join(T) || T <- TTopics],
+            % Produce topic filters from generated topics and patterns.
+            % Number of filters is equal to the number of patterns, most of the time.
+            Filters = lists:enumerate(mk_filters(Pats, TTopics)),
+            _ = [emqx_topic_index:insert(F, N, <<>>, Tab) || {N, F} <- Filters],
+            % Gather some basic statistics
+            _ = account(filters, length(Filters)),
+            _ = account(topics, NTopics = length(Topics)),
+            _ = account(maxhits, NTopics * NTopics),
+            % Verify that matching each topic against index returns the same results as
+            % matching it against the list of filters one by one.
+            lists:all(
+                fun(Topic) ->
+                    Ids1 = [id(M) || M <- emqx_topic_index:matches(Topic, Tab, [unique])],
+                    Ids2 = lists:filtermap(
+                        fun({N, F}) ->
+                            case emqx_topic:match(Topic, F) of
+                                true -> {true, N};
+                                false -> false
+                            end
+                        end,
+                        Filters
+                    ),
+                    % Account a number of matches to compute hitrate later
+                    _ = account(matches, length(Ids1)),
+                    case (Ids2 -- Ids1) ++ (Ids2 -- Ids1) of
+                        [] ->
+                            true;
+                        [_ | _] = _Differences ->
+                            ct:pal(
+                                "Topic name: ~p~n"
+                                "Index results: ~p~n"
+                                "Topic match results:: ~p~n",
+                                [Topic, Ids1, Ids2]
+                            ),
+                            false
+                    end
+                end,
+                Topics
+            )
+        end
+    ).
+
+mk_filters([Pat | PRest], [Topic | TRest]) ->
+    [emqx_topic:join(mk_topic_filter(Pat, Topic)) | mk_filters(PRest, TRest)];
+mk_filters(_, _) ->
+    [].
+
+account(Counter, N) ->
+    put({?MODULE, Counter}, account(Counter) + N).
+
+account(Counter) ->
+    emqx_maybe:define(get({?MODULE, Counter}), 0).
+
+%%
 
 match(T, Tab) ->
     emqx_topic_index:match(T, Tab).
@@ -173,3 +275,45 @@ id(Match) ->
 
 topic(Match) ->
     emqx_topic_index:get_topic(Match).
+
+%%
+
+topic_t(EntropyWeights) ->
+    EWLast = lists:last(EntropyWeights),
+    ?LET(L, scaled(1 / 4, list(EWLast)), begin
+        EWs = lists:sublist(EntropyWeights ++ L, length(L)),
+        ?SIZED(S, [oneof([topic_level_t(S * EW), topic_level_fixed_t()]) || EW <- EWs])
+    end).
+
+topic_level_t(Entropy) ->
+    S = floor(1 + math:log2(Entropy) / 4),
+    ?LET(I, range(1, Entropy), iolist_to_binary(io_lib:format("~*.16.0B", [S, I]))).
+
+topic_level_fixed_t() ->
+    oneof([
+        <<"foo">>,
+        <<"bar">>,
+        <<"baz">>,
+        <<"xyzzy">>
+    ]).
+
+topic_filter_pattern_t() ->
+    list(topic_level_pattern_t()).
+
+topic_level_pattern_t() ->
+    frequency([
+        {5, level},
+        {2, '+'},
+        {1, '#'}
+    ]).
+
+mk_topic_filter([], _) ->
+    [];
+mk_topic_filter(_, []) ->
+    [];
+mk_topic_filter(['#' | _], _) ->
+    ['#'];
+mk_topic_filter(['+' | Rest], [_ | Levels]) ->
+    ['+' | mk_topic_filter(Rest, Levels)];
+mk_topic_filter([level | Rest], [L | Levels]) ->
+    [L | mk_topic_filter(Rest, Levels)].

From fd0986071c082170fa154ce6d7695805ad513515 Mon Sep 17 00:00:00 2001
From: Andrew Mayorov <encube.ul@gmail.com>
Date: Fri, 4 Aug 2023 19:27:43 +0400
Subject: [PATCH 06/12] perf(topicidx): implement fast-forwarding prefixes

This should give less `ets:next/2` calls in general and much less
when index has relatively small number of long non-wildcard topics.
---
 apps/emqx/src/emqx_topic_index.erl        | 20 ++++++++++++++++----
 apps/emqx/test/emqx_topic_index_SUITE.erl | 10 ++++++++++
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/apps/emqx/src/emqx_topic_index.erl b/apps/emqx/src/emqx_topic_index.erl
index 29d8694af..44e88e659 100644
--- a/apps/emqx/src/emqx_topic_index.erl
+++ b/apps/emqx/src/emqx_topic_index.erl
@@ -73,7 +73,13 @@ match(K, Prefix, Words, RPrefix, Tab) ->
             match_rest(Matched, Words, RPrefix, Tab)
     end.
 
-match_rest(false, [W | Rest], RPrefix, Tab) ->
+match_rest([W1 | [W2 | _] = SLast], [W1 | [W2 | _] = Rest], RPrefix, Tab) ->
+    % NOTE
+    % Fast-forward through identical words in the topic and the last key suffixes.
+    % This should save us a few redundant `ets:next` calls at the cost of slightly
+    % more complex match patterns.
+    match_rest(SLast, Rest, [W1 | RPrefix], Tab);
+match_rest(SLast, [W | Rest], RPrefix, Tab) when is_list(SLast) ->
     match(Rest, [W | RPrefix], Tab);
 match_rest(plus, [W | Rest], RPrefix, Tab) ->
     case match(Rest, ['+' | RPrefix], Tab) of
@@ -115,7 +121,13 @@ matches(K, Prefix, Words, RPrefix, Acc, Tab) ->
             matches_rest(Matched, Words, RPrefix, Acc, Tab)
     end.
 
-matches_rest(false, [W | Rest], RPrefix, Acc, Tab) ->
+matches_rest([W1 | [W2 | _] = SLast], [W1 | [W2 | _] = Rest], RPrefix, Acc, Tab) ->
+    % NOTE
+    % Fast-forward through identical words in the topic and the last key suffixes.
+    % This should save us a few redundant `ets:next` calls at the cost of slightly
+    % more complex match patterns.
+    matches_rest(SLast, Rest, [W1 | RPrefix], Acc, Tab);
+matches_rest(SLast, [W | Rest], RPrefix, Acc, Tab) when is_list(SLast) ->
     matches(Rest, [W | RPrefix], Acc, Tab);
 matches_rest(plus, [W | Rest], RPrefix, Acc, Tab) ->
     NAcc = matches(Rest, ['+' | RPrefix], Acc, Tab),
@@ -143,8 +155,8 @@ match_filter([], ['#'], _Suffix) ->
     true;
 match_filter([], ['+' | _], _Suffix) ->
     plus;
-match_filter([], [_H | _], _Suffix) ->
-    false;
+match_filter([], [_H | _] = Rest, _Suffix) ->
+    Rest;
 match_filter([H | T1], [H | T2], Suffix) ->
     match_filter(T1, T2, Suffix);
 match_filter([H1 | _], [H2 | _], _Suffix) when H2 > H1 ->
diff --git a/apps/emqx/test/emqx_topic_index_SUITE.erl b/apps/emqx/test/emqx_topic_index_SUITE.erl
index f2b263a5a..80ca536b4 100644
--- a/apps/emqx/test/emqx_topic_index_SUITE.erl
+++ b/apps/emqx/test/emqx_topic_index_SUITE.erl
@@ -133,6 +133,16 @@ t_match7(_) ->
     emqx_topic_index:insert(W, t_match7, <<>>, Tab),
     ?assertEqual(W, topic(match(T, Tab))).
 
+t_match_fast_forward(_) ->
+    Tab = emqx_topic_index:new(),
+    emqx_topic_index:insert(<<"a/b/1/2/3/4/5/6/7/8/9/#">>, id1, <<>>, Tab),
+    emqx_topic_index:insert(<<"z/y/x/+/+">>, id2, <<>>, Tab),
+    emqx_topic_index:insert(<<"a/b/c/+">>, id3, <<>>, Tab),
+    % dbg:tracer(),
+    % dbg:p(all, c),
+    % dbg:tpl({ets, next, '_'}, x),
+    ?assertEqual([id1], [id(M) || M <- matches(<<"a/b/1/2/3/4/5/6/7/8/9/0">>, Tab)]).
+
 t_match_unique(_) ->
     Tab = emqx_topic_index:new(),
     emqx_topic_index:insert(<<"a/b/c">>, t_match_id1, <<>>, Tab),

From 9a249e4b01fc9ab717a1ab02c0a5db255fa87989 Mon Sep 17 00:00:00 2001
From: Andrew Mayorov <encube.ul@gmail.com>
Date: Mon, 14 Aug 2023 13:11:25 +0400
Subject: [PATCH 07/12] test(topicidx): increase test coverage

---
 apps/emqx/test/emqx_topic_index_SUITE.erl | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/apps/emqx/test/emqx_topic_index_SUITE.erl b/apps/emqx/test/emqx_topic_index_SUITE.erl
index 80ca536b4..ade98acec 100644
--- a/apps/emqx/test/emqx_topic_index_SUITE.erl
+++ b/apps/emqx/test/emqx_topic_index_SUITE.erl
@@ -141,6 +141,7 @@ t_match_fast_forward(_) ->
     % dbg:tracer(),
     % dbg:p(all, c),
     % dbg:tpl({ets, next, '_'}, x),
+    ?assertEqual(id1, id(match(<<"a/b/1/2/3/4/5/6/7/8/9/0">>, Tab))),
     ?assertEqual([id1], [id(M) || M <- matches(<<"a/b/1/2/3/4/5/6/7/8/9/0">>, Tab)]).
 
 t_match_unique(_) ->
@@ -180,14 +181,15 @@ t_match_wildcard_edge_cases(_) ->
     F = fun({Topics, TopicName, Expected}) ->
         Tab = emqx_topic_index:new(),
         _ = [emqx_topic_index:insert(T, N, <<>>, Tab) || {N, T} <- lists:enumerate(Topics)],
-        Results = [id(M) || M <- emqx_topic_index:matches(TopicName, Tab, [unique])],
+        ?assertEqual(
+            lists:last(Expected),
+            id(emqx_topic_index:match(TopicName, Tab)),
+            #{"Base topics" => Topics, "Topic name" => TopicName}
+        ),
         ?assertEqual(
             Expected,
-            Results,
-            #{
-                "Base topics" => Topics,
-                "Topic name" => TopicName
-            }
+            [id(M) || M <- emqx_topic_index:matches(TopicName, Tab, [unique])],
+            #{"Base topics" => Topics, "Topic name" => TopicName}
         )
     end,
     lists:foreach(F, Datasets).

From 47dfba4341ea312967e33dd5b21dd549b7319ea8 Mon Sep 17 00:00:00 2001
From: Andrew Mayorov <encube.ul@gmail.com>
Date: Thu, 20 Jul 2023 14:49:10 +0200
Subject: [PATCH 08/12] perf(ruleeng): employ `emqx_topic_index` to speed up
 topic matching

---
 apps/emqx_rule_engine/include/rule_engine.hrl |  1 +
 .../emqx_rule_engine/src/emqx_rule_engine.erl | 68 +++++++++++++------
 .../src/emqx_rule_engine_app.erl              |  1 +
 3 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/apps/emqx_rule_engine/include/rule_engine.hrl b/apps/emqx_rule_engine/include/rule_engine.hrl
index b2a6a549e..7df5d9941 100644
--- a/apps/emqx_rule_engine/include/rule_engine.hrl
+++ b/apps/emqx_rule_engine/include/rule_engine.hrl
@@ -109,6 +109,7 @@
 
 %% Tables
 -define(RULE_TAB, emqx_rule_engine).
+-define(RULE_TOPIC_INDEX, emqx_rule_engine_topic_index).
 
 %% Allowed sql function provider modules
 -define(DEFAULT_SQL_FUNC_PROVIDER, emqx_rule_funcs).
diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine.erl b/apps/emqx_rule_engine/src/emqx_rule_engine.erl
index 66c82d3a1..41d1ed433 100644
--- a/apps/emqx_rule_engine/src/emqx_rule_engine.erl
+++ b/apps/emqx_rule_engine/src/emqx_rule_engine.erl
@@ -176,7 +176,7 @@ create_rule(Params) ->
 
 create_rule(Params = #{id := RuleId}, CreatedAt) when is_binary(RuleId) ->
     case get_rule(RuleId) of
-        not_found -> parse_and_insert(Params, CreatedAt);
+        not_found -> with_parsed_rule(Params, CreatedAt, fun insert_rule/1);
         {ok, _} -> {error, already_exists}
     end.
 
@@ -185,18 +185,27 @@ update_rule(Params = #{id := RuleId}) when is_binary(RuleId) ->
     case get_rule(RuleId) of
         not_found ->
             {error, not_found};
-        {ok, #{created_at := CreatedAt}} ->
-            parse_and_insert(Params, CreatedAt)
+        {ok, RulePrev = #{created_at := CreatedAt}} ->
+            with_parsed_rule(Params, CreatedAt, fun(Rule) -> update_rule(Rule, RulePrev) end)
     end.
 
 -spec delete_rule(RuleId :: rule_id()) -> ok.
 delete_rule(RuleId) when is_binary(RuleId) ->
-    gen_server:call(?RULE_ENGINE, {delete_rule, RuleId}, ?T_CALL).
+    case get_rule(RuleId) of
+        not_found ->
+            ok;
+        {ok, Rule} ->
+            gen_server:call(?RULE_ENGINE, {delete_rule, Rule}, ?T_CALL)
+    end.
 
 -spec insert_rule(Rule :: rule()) -> ok.
 insert_rule(Rule) ->
     gen_server:call(?RULE_ENGINE, {insert_rule, Rule}, ?T_CALL).
 
+-spec update_rule(Rule :: rule(), RulePrev :: rule()) -> ok.
+update_rule(Rule, RulePrev) ->
+    gen_server:call(?RULE_ENGINE, {update_rule, Rule, RulePrev}, ?T_CALL).
+
 %%----------------------------------------------------------------------------------------
 %% Rule Management
 %%----------------------------------------------------------------------------------------
@@ -216,9 +225,8 @@ get_rules_ordered_by_ts() ->
 -spec get_rules_for_topic(Topic :: binary()) -> [rule()].
 get_rules_for_topic(Topic) ->
     [
-        Rule
-     || Rule = #{from := From} <- get_rules(),
-        emqx_topic:match_any(Topic, From)
+        emqx_topic_index:get_record(M, ?RULE_TOPIC_INDEX)
+     || M <- emqx_topic_index:matches(Topic, ?RULE_TOPIC_INDEX, [unique])
     ].
 
 -spec get_rules_with_same_event(Topic :: binary()) -> [rule()].
@@ -411,10 +419,17 @@ init([]) ->
     {ok, #{}}.
 
 handle_call({insert_rule, Rule}, _From, State) ->
-    do_insert_rule(Rule),
+    ok = do_insert_rule(Rule),
+    ok = do_update_rule_index(Rule),
+    {reply, ok, State};
+handle_call({update_rule, Rule, RulePrev}, _From, State) ->
+    ok = do_delete_rule_index(RulePrev),
+    ok = do_insert_rule(Rule),
+    ok = do_update_rule_index(Rule),
     {reply, ok, State};
 handle_call({delete_rule, Rule}, _From, State) ->
-    do_delete_rule(Rule),
+    ok = do_delete_rule_index(Rule),
+    ok = do_delete_rule(Rule),
     {reply, ok, State};
 handle_call(Req, _From, State) ->
     ?SLOG(error, #{msg => "unexpected_call", request => Req}),
@@ -438,7 +453,7 @@ code_change(_OldVsn, State, _Extra) ->
 %% Internal Functions
 %%----------------------------------------------------------------------------------------
 
-parse_and_insert(Params = #{id := RuleId, sql := Sql, actions := Actions}, CreatedAt) ->
+with_parsed_rule(Params = #{id := RuleId, sql := Sql, actions := Actions}, CreatedAt, Fun) ->
     case emqx_rule_sqlparser:parse(Sql) of
         {ok, Select} ->
             Rule = #{
@@ -459,7 +474,7 @@ parse_and_insert(Params = #{id := RuleId, sql := Sql, actions := Actions}, Creat
                 conditions => emqx_rule_sqlparser:select_where(Select)
                 %% -- calculated fields end
             },
-            ok = insert_rule(Rule),
+            ok = Fun(Rule),
             {ok, Rule};
         {error, Reason} ->
             {error, Reason}
@@ -471,16 +486,27 @@ do_insert_rule(#{id := Id} = Rule) ->
     true = ets:insert(?RULE_TAB, {Id, maps:remove(id, Rule)}),
     ok.
 
-do_delete_rule(RuleId) ->
-    case get_rule(RuleId) of
-        {ok, Rule} ->
-            ok = unload_hooks_for_rule(Rule),
-            ok = clear_metrics_for_rule(RuleId),
-            true = ets:delete(?RULE_TAB, RuleId),
-            ok;
-        not_found ->
-            ok
-    end.
+do_delete_rule(#{id := Id} = Rule) ->
+    ok = unload_hooks_for_rule(Rule),
+    ok = clear_metrics_for_rule(Id),
+    true = ets:delete(?RULE_TAB, Id),
+    ok.
+
+do_update_rule_index(#{id := Id, from := From} = Rule) ->
+    ok = lists:foreach(
+        fun(Topic) ->
+            true = emqx_topic_index:insert(Topic, Id, Rule, ?RULE_TOPIC_INDEX)
+        end,
+        From
+    ).
+
+do_delete_rule_index(#{id := Id, from := From}) ->
+    ok = lists:foreach(
+        fun(Topic) ->
+            true = emqx_topic_index:delete(Topic, Id, ?RULE_TOPIC_INDEX)
+        end,
+        From
+    ).
 
 parse_actions(Actions) ->
     [do_parse_action(Act) || Act <- Actions].
diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine_app.erl b/apps/emqx_rule_engine/src/emqx_rule_engine_app.erl
index d8b031bdd..28515cb1a 100644
--- a/apps/emqx_rule_engine/src/emqx_rule_engine_app.erl
+++ b/apps/emqx_rule_engine/src/emqx_rule_engine_app.erl
@@ -26,6 +26,7 @@
 
 start(_Type, _Args) ->
     _ = ets:new(?RULE_TAB, [named_table, public, ordered_set, {read_concurrency, true}]),
+    _ = ets:new(?RULE_TOPIC_INDEX, [named_table, public, ordered_set, {read_concurrency, true}]),
     ok = emqx_rule_events:reload(),
     SupRet = emqx_rule_engine_sup:start_link(),
     ok = emqx_rule_engine:load_rules(),

From fe9477f92ea972b6d5b003658c2d4a957679f907 Mon Sep 17 00:00:00 2001
From: Andrew Mayorov <encube.ul@gmail.com>
Date: Mon, 14 Aug 2023 14:00:23 +0400
Subject: [PATCH 09/12] chore: bump applications versions

* emqx_rule_engine 5.0.23
---
 apps/emqx_rule_engine/src/emqx_rule_engine.app.src | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine.app.src b/apps/emqx_rule_engine/src/emqx_rule_engine.app.src
index 09d57a4f9..e6d00bcae 100644
--- a/apps/emqx_rule_engine/src/emqx_rule_engine.app.src
+++ b/apps/emqx_rule_engine/src/emqx_rule_engine.app.src
@@ -2,7 +2,7 @@
 {application, emqx_rule_engine, [
     {description, "EMQX Rule Engine"},
     % strict semver, bump manually!
-    {vsn, "5.0.22"},
+    {vsn, "5.0.23"},
     {modules, []},
     {registered, [emqx_rule_engine_sup, emqx_rule_engine]},
     {applications, [kernel, stdlib, rulesql, getopt, emqx_ctl, uuid]},

From d302aaae4cde883a290a4875e21b5e7891a611a3 Mon Sep 17 00:00:00 2001
From: Andrew Mayorov <encube.ul@gmail.com>
Date: Mon, 14 Aug 2023 15:24:24 +0400
Subject: [PATCH 10/12] chore: add changelog entry

---
 changes/ce/perf-11396.en.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changes/ce/perf-11396.en.md

diff --git a/changes/ce/perf-11396.en.md b/changes/ce/perf-11396.en.md
new file mode 100644
index 000000000..fd8df9a9d
--- /dev/null
+++ b/changes/ce/perf-11396.en.md
@@ -0,0 +1 @@
+Introduce topic index for the rule engine runtime that significantly improves the performance of EMQX with a non-trivial number of rules consuming messages matching different topic filters.

From e39bbf4c495c2a16485a1e9e646a2845a2651603 Mon Sep 17 00:00:00 2001
From: Andrew Mayorov <encube.ul@gmail.com>
Date: Tue, 15 Aug 2023 16:55:48 +0400
Subject: [PATCH 11/12] chore(topicidx): add more descriptive comments and
 specs

To (hopefully) better illustrate what is happening there.
---
 apps/emqx/src/emqx_topic_index.erl | 33 ++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/apps/emqx/src/emqx_topic_index.erl b/apps/emqx/src/emqx_topic_index.erl
index 44e88e659..09e19a9f7 100644
--- a/apps/emqx/src/emqx_topic_index.erl
+++ b/apps/emqx/src/emqx_topic_index.erl
@@ -43,15 +43,27 @@
 -type key(ID) :: {[word()], {ID}}.
 -type match(ID) :: key(ID).
 
+%% @doc Create a new ETS table suitable for topic index.
+%% Usable mostly for testing purposes.
+-spec new() -> ets:table().
 new() ->
     ets:new(?MODULE, [public, ordered_set, {read_concurrency, true}]).
 
+%% @doc Insert a new entry into the index that associates given topic filter to given
+%% record ID, and attaches arbitrary record to the entry. This allows users to choose
+%% between regular and "materialized" indexes, for example.
+-spec insert(emqx_types:topic(), _ID, _Record, ets:table()) -> true.
 insert(Filter, ID, Record, Tab) ->
     ets:insert(Tab, {{words(Filter), {ID}}, Record}).
 
+%% @doc Delete an entry from the index that associates given topic filter to given
+%% record ID. Deleting non-existing entry is not an error.
+-spec delete(emqx_types:topic(), _ID, ets:table()) -> true.
 delete(Filter, ID, Tab) ->
     ets:delete(Tab, {words(Filter), {ID}}).
 
+%% @doc Match given topic against the index and return the first match, or `false` if
+%% no match is found.
 -spec match(emqx_types:topic(), ets:table()) -> match(_ID) | false.
 match(Topic, Tab) ->
     {Words, RPrefix} = match_init(Topic),
@@ -82,6 +94,10 @@ match_rest([W1 | [W2 | _] = SLast], [W1 | [W2 | _] = Rest], RPrefix, Tab) ->
 match_rest(SLast, [W | Rest], RPrefix, Tab) when is_list(SLast) ->
     match(Rest, [W | RPrefix], Tab);
 match_rest(plus, [W | Rest], RPrefix, Tab) ->
+    % NOTE
+    % There's '+' in the key suffix, meaning we should consider 2 alternatives:
+    % 1. Match the rest of the topic as if there was '+' in the current position.
+    % 2. Skip this key and try to match the topic as it is.
     case match(Rest, ['+' | RPrefix], Tab) of
         Match = {_, _} ->
             Match;
@@ -91,6 +107,8 @@ match_rest(plus, [W | Rest], RPrefix, Tab) ->
 match_rest(_, [], _RPrefix, _Tab) ->
     false.
 
+%% @doc Match given topic against the index and return _all_ matches.
+%% If `unique` option is given, return only unique matches by record ID.
 -spec matches(emqx_types:topic(), ets:table(), _Opts :: [unique]) -> [match(_ID)].
 matches(Topic, Tab, Opts) ->
     {Words, RPrefix} = match_init(Topic),
@@ -130,12 +148,18 @@ matches_rest([W1 | [W2 | _] = SLast], [W1 | [W2 | _] = Rest], RPrefix, Acc, Tab)
 matches_rest(SLast, [W | Rest], RPrefix, Acc, Tab) when is_list(SLast) ->
     matches(Rest, [W | RPrefix], Acc, Tab);
 matches_rest(plus, [W | Rest], RPrefix, Acc, Tab) ->
+    % NOTE
+    % There's '+' in the key suffix, meaning we should accumulate all matches from
+    % each of 2 branches:
+    % 1. Match the rest of the topic as if there was '+' in the current position.
+    % 2. Skip this key and try to match the topic as it is.
     NAcc = matches(Rest, ['+' | RPrefix], Acc, Tab),
     matches(Rest, [W | RPrefix], NAcc, Tab);
 matches_rest(_, [], _RPrefix, Acc, _Tab) ->
     Acc.
 
 match_add(K = {_Filter, ID}, Acc = #{}) ->
+    % NOTE: ensuring uniqueness by record ID
     Acc#{ID => K};
 match_add(K, Acc) ->
     [K | Acc].
@@ -146,6 +170,7 @@ match_next(_, '$end_of_table', _) ->
     stop.
 
 match_filter([], [], []) ->
+    % NOTE: we matched the topic exactly
     true;
 match_filter([], [], _Suffix) ->
     % NOTE: we matched the prefix, but there may be more matches next
@@ -173,14 +198,18 @@ match_init(Topic) ->
             {Words, []}
     end.
 
+%% @doc Extract record ID from the match.
 -spec get_id(match(ID)) -> ID.
 get_id({_Filter, {ID}}) ->
     ID.
 
+%% @doc Extract topic (or topic filter) from the match.
 -spec get_topic(match(_ID)) -> emqx_types:topic().
 get_topic({Filter, _ID}) ->
     emqx_topic:join(Filter).
 
+%% @doc Fetch the record associated with the match.
+%% NOTE: Only really useful for ETS tables where the record ID is the first element.
 -spec get_record(match(_ID), ets:table()) -> _Record.
 get_record(K, Tab) ->
     ets:lookup_element(Tab, K, 2).
@@ -189,6 +218,10 @@ get_record(K, Tab) ->
 
 -spec words(emqx_types:topic()) -> [word()].
 words(Topic) when is_binary(Topic) ->
+    % NOTE
+    % This is almost identical to `emqx_topic:words/1`, but it doesn't convert empty
+    % tokens to ''. This is needed to keep ordering of words consistent with what
+    % `match_filter/3` expects.
     [word(W) || W <- emqx_topic:tokens(Topic)].
 
 -spec word(binary()) -> word().

From 5d79823891f32ec2fc6333bfdbc9e091e218c769 Mon Sep 17 00:00:00 2001
From: Andrew Mayorov <encube.ul@gmail.com>
Date: Thu, 17 Aug 2023 01:13:19 +0400
Subject: [PATCH 12/12] perf(topicidx): preserve next key on the stack to reuse
 later

This should further optimize the number of `ets:next/2` calls
required for a single match query.
---
 apps/emqx/src/emqx_topic_index.erl | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/apps/emqx/src/emqx_topic_index.erl b/apps/emqx/src/emqx_topic_index.erl
index 09e19a9f7..a6f662f74 100644
--- a/apps/emqx/src/emqx_topic_index.erl
+++ b/apps/emqx/src/emqx_topic_index.erl
@@ -127,6 +127,17 @@ matches(Words, RPrefix, Acc, Tab) ->
     Prefix = lists:reverse(RPrefix),
     matches(ets:next(Tab, {Prefix, {}}), Prefix, Words, RPrefix, Acc, Tab).
 
+matches(Words, RPrefix, K = {Filter, _}, Acc, Tab) ->
+    Prefix = lists:reverse(RPrefix),
+    case Prefix > Filter of
+        true ->
+            % NOTE: Prefix already greater than the last key seen, need to `ets:next/2`.
+            matches(ets:next(Tab, {Prefix, {}}), Prefix, Words, RPrefix, Acc, Tab);
+        false ->
+            % NOTE: Prefix is still less than or equal to the last key seen, reuse it.
+            matches(K, Prefix, Words, RPrefix, Acc, Tab)
+    end.
+
 matches(K, Prefix, Words, RPrefix, Acc, Tab) ->
     case match_next(Prefix, K, Words) of
         true ->
@@ -136,26 +147,27 @@ matches(K, Prefix, Words, RPrefix, Acc, Tab) ->
         stop ->
             Acc;
         Matched ->
-            matches_rest(Matched, Words, RPrefix, Acc, Tab)
+            % NOTE: Prserve next key on the stack to save on `ets:next/2` calls.
+            matches_rest(Matched, Words, RPrefix, K, Acc, Tab)
     end.
 
-matches_rest([W1 | [W2 | _] = SLast], [W1 | [W2 | _] = Rest], RPrefix, Acc, Tab) ->
+matches_rest([W1 | [W2 | _] = SLast], [W1 | [W2 | _] = Rest], RPrefix, K, Acc, Tab) ->
     % NOTE
     % Fast-forward through identical words in the topic and the last key suffixes.
     % This should save us a few redundant `ets:next` calls at the cost of slightly
     % more complex match patterns.
-    matches_rest(SLast, Rest, [W1 | RPrefix], Acc, Tab);
-matches_rest(SLast, [W | Rest], RPrefix, Acc, Tab) when is_list(SLast) ->
-    matches(Rest, [W | RPrefix], Acc, Tab);
-matches_rest(plus, [W | Rest], RPrefix, Acc, Tab) ->
+    matches_rest(SLast, Rest, [W1 | RPrefix], K, Acc, Tab);
+matches_rest(SLast, [W | Rest], RPrefix, K, Acc, Tab) when is_list(SLast) ->
+    matches(Rest, [W | RPrefix], K, Acc, Tab);
+matches_rest(plus, [W | Rest], RPrefix, K, Acc, Tab) ->
     % NOTE
     % There's '+' in the key suffix, meaning we should accumulate all matches from
     % each of 2 branches:
     % 1. Match the rest of the topic as if there was '+' in the current position.
     % 2. Skip this key and try to match the topic as it is.
-    NAcc = matches(Rest, ['+' | RPrefix], Acc, Tab),
-    matches(Rest, [W | RPrefix], NAcc, Tab);
-matches_rest(_, [], _RPrefix, Acc, _Tab) ->
+    NAcc = matches(Rest, ['+' | RPrefix], K, Acc, Tab),
+    matches(Rest, [W | RPrefix], K, NAcc, Tab);
+matches_rest(_, [], _RPrefix, _K, Acc, _Tab) ->
     Acc.
 
 match_add(K = {_Filter, ID}, Acc = #{}) ->