From ae094e363caf2a02edd586b0168a1006c81d7289 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Tue, 15 Aug 2023 16:27:19 +0200 Subject: [PATCH 01/12] chore: fix a typo in log message --- apps/emqx_conf/src/emqx_cluster_rpc.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/emqx_conf/src/emqx_cluster_rpc.erl b/apps/emqx_conf/src/emqx_cluster_rpc.erl index bb154f8b5..934d7ef7a 100644 --- a/apps/emqx_conf/src/emqx_cluster_rpc.erl +++ b/apps/emqx_conf/src/emqx_cluster_rpc.erl @@ -649,7 +649,7 @@ do_wait_for_emqx_ready(N) -> ok -> ok; timeout -> - ?SLOG(warning, #{msg => "stil_waiting_for_emqx_app_to_be_ready"}), + ?SLOG(warning, #{msg => "still_waiting_for_emqx_app_to_be_ready"}), do_wait_for_emqx_ready(N - 1) end. From 6b152b3cb70af21558be7bd8be050d73d33c51af Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Sun, 20 Aug 2023 20:21:45 +0200 Subject: [PATCH 02/12] test: add more debug output --- apps/emqx/test/emqx_topic_index_SUITE.erl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/apps/emqx/test/emqx_topic_index_SUITE.erl b/apps/emqx/test/emqx_topic_index_SUITE.erl index ade98acec..9058f3597 100644 --- a/apps/emqx/test/emqx_topic_index_SUITE.erl +++ b/apps/emqx/test/emqx_topic_index_SUITE.erl @@ -252,8 +252,9 @@ topic_matches_prop() -> ct:pal( "Topic name: ~p~n" "Index results: ~p~n" - "Topic match results:: ~p~n", - [Topic, Ids1, Ids2] + "Topic match results: ~p~n" + "Filters: ~p~n", + [Topic, Ids1, Ids2, Filters] ), false end From f4c8c6be5573804203dcd5d7e66ede487cfd19bf Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Thu, 17 Aug 2023 10:13:22 +0200 Subject: [PATCH 03/12] refactor(topic_index): optimize trie-search performance --- apps/emqx/src/emqx_topic_gbt.erl | 116 ++++++++ apps/emqx/src/emqx_topic_index.erl | 191 ++---------- apps/emqx/src/emqx_trie_search.erl | 344 ++++++++++++++++++++++ apps/emqx/test/emqx_topic_index_SUITE.erl | 190 +++++++----- 4 files changed, 593 insertions(+), 248 deletions(-) create mode 100644 apps/emqx/src/emqx_topic_gbt.erl create mode 100644 apps/emqx/src/emqx_trie_search.erl diff --git a/apps/emqx/src/emqx_topic_gbt.erl b/apps/emqx/src/emqx_topic_gbt.erl new file mode 100644 index 000000000..da6320bc7 --- /dev/null +++ b/apps/emqx/src/emqx_topic_gbt.erl @@ -0,0 +1,116 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +%% @doc Topic index implemetation with gb_trees stored in persistent_term. +%% This is only suitable for a static set of topic or topic-filters. + +-module(emqx_topic_gbt). + +-export([new/0, new/1]). +-export([insert/4]). +-export([delete/3]). +-export([match/2]). +-export([matches/3]). + +-export([get_id/1]). +-export([get_topic/1]). +-export([get_record/2]). + +-type word() :: binary() | '+' | '#'. +-type key(ID) :: {[word()], {ID}}. 
+-type match(ID) :: key(ID). +-type name() :: any(). + +%% @private Only for testing. +-spec new() -> name(). +new() -> + new(test). + +%% @doc Create a new gb_tree and store it in the persitent_term with the +%% given name. +-spec new(name()) -> name(). +new(Name) -> + T = gb_trees:from_orddict([]), + true = gbt_update(Name, T), + Name. + +%% @doc Insert a new entry into the index that associates given topic filter to given +%% record ID, and attaches arbitrary record to the entry. This allows users to choose +%% between regular and "materialized" indexes, for example. +-spec insert(emqx_types:topic(), _ID, _Record, name()) -> true. +insert(Filter, ID, Record, Name) -> + Tree = gbt(Name), + Key = key(Filter, ID), + NewTree = gb_trees:enter(Key, Record, Tree), + true = gbt_update(Name, NewTree). + +%% @doc Delete an entry from the index that associates given topic filter to given +%% record ID. Deleting non-existing entry is not an error. +-spec delete(emqx_types:topic(), _ID, name()) -> true. +delete(Filter, ID, Name) -> + Tree = gbt(Name), + Key = key(Filter, ID), + NewTree = gb_trees:delete_any(Key, Tree), + true = gbt_update(Name, NewTree). + +%% @doc Match given topic against the index and return the first match, or `false` if +%% no match is found. +-spec match(emqx_types:topic(), name()) -> match(_ID) | false. +match(Topic, Name) -> + emqx_trie_search:match(Topic, make_nextf(Name)). + +%% @doc Match given topic against the index and return _all_ matches. +%% If `unique` option is given, return only unique matches by record ID. +matches(Topic, Name, Opts) -> + emqx_trie_search:matches(Topic, make_nextf(Name), Opts). + +%% @doc Extract record ID from the match. +-spec get_id(match(ID)) -> ID. +get_id(Key) -> + emqx_trie_search:get_id(Key). + +%% @doc Extract topic (or topic filter) from the match. +-spec get_topic(match(_ID)) -> emqx_types:topic(). +get_topic(Key) -> + emqx_trie_search:get_topic(Key). + +%% @doc Fetch the record associated with the match. +-spec get_record(match(_ID), name()) -> _Record. +get_record(Key, Name) -> + Gbt = gbt(Name), + gb_trees:get(Key, Gbt). + +key(TopicOrFilter, ID) -> + emqx_trie_search:make_key(TopicOrFilter, ID). + +gbt(Name) -> + persistent_term:get({?MODULE, Name}). + +gbt_update(Name, Tree) -> + persistent_term:put({?MODULE, Name}, Tree), + true. + +gbt_next(nil, _Input) -> + emqx_trie_search:ceiling(); +gbt_next({P, _V, _Smaller, Bigger}, K) when K >= P -> + gbt_next(Bigger, K); +gbt_next({P, _V, Smaller, _Bigger}, K) -> + NextKey = gbt_next(Smaller, K), + min(P, NextKey). + +make_nextf(Name) -> + {_SizeWeDontCare, TheTree} = gbt(Name), + fun(Key) -> gbt_next(TheTree, Key) end. diff --git a/apps/emqx/src/emqx_topic_index.erl b/apps/emqx/src/emqx_topic_index.erl index a6f662f74..e1da5b3b7 100644 --- a/apps/emqx/src/emqx_topic_index.erl +++ b/apps/emqx/src/emqx_topic_index.erl @@ -14,18 +14,7 @@ %% limitations under the License. %%-------------------------------------------------------------------- -%% @doc Topic index for matching topics to topic filters. -%% -%% Works on top of ETS ordered_set table. Keys are tuples constructed from -%% parsed topic filters and record IDs, wrapped in a tuple to order them -%% strictly greater than unit tuple (`{}`). Existing table may be used if -%% existing keys will not collide with index keys. -%% -%% Designed to effectively answer questions like: -%% 1. Does any topic filter match given topic? -%% 2. Which records are associated with topic filters matching given topic? -%% 3. 
Which topic filters match given topic? -%% 4. Which record IDs are associated with topic filters matching given topic? +%% @doc Topic index implemetation with ETS table as ordered-set storage. -module(emqx_topic_index). @@ -39,186 +28,51 @@ -export([get_topic/1]). -export([get_record/2]). --type word() :: binary() | '+' | '#'. --type key(ID) :: {[word()], {ID}}. +-type key(ID) :: emqx_trie_search:key(ID). -type match(ID) :: key(ID). %% @doc Create a new ETS table suitable for topic index. %% Usable mostly for testing purposes. -spec new() -> ets:table(). new() -> - ets:new(?MODULE, [public, ordered_set, {read_concurrency, true}]). + T = ets:new(?MODULE, [public, ordered_set, {read_concurrency, true}]), + ets:insert(T, {emqx_trie_search:ceiling(), []}), + T. %% @doc Insert a new entry into the index that associates given topic filter to given %% record ID, and attaches arbitrary record to the entry. This allows users to choose %% between regular and "materialized" indexes, for example. -spec insert(emqx_types:topic(), _ID, _Record, ets:table()) -> true. insert(Filter, ID, Record, Tab) -> - ets:insert(Tab, {{words(Filter), {ID}}, Record}). + Key = key(Filter, ID), + true = ets:insert(Tab, {Key, Record}). %% @doc Delete an entry from the index that associates given topic filter to given %% record ID. Deleting non-existing entry is not an error. -spec delete(emqx_types:topic(), _ID, ets:table()) -> true. delete(Filter, ID, Tab) -> - ets:delete(Tab, {words(Filter), {ID}}). + true = ets:delete(Tab, key(Filter, ID)). %% @doc Match given topic against the index and return the first match, or `false` if %% no match is found. -spec match(emqx_types:topic(), ets:table()) -> match(_ID) | false. match(Topic, Tab) -> - {Words, RPrefix} = match_init(Topic), - match(Words, RPrefix, Tab). - -match(Words, RPrefix, Tab) -> - Prefix = lists:reverse(RPrefix), - match(ets:next(Tab, {Prefix, {}}), Prefix, Words, RPrefix, Tab). - -match(K, Prefix, Words, RPrefix, Tab) -> - case match_next(Prefix, K, Words) of - true -> - K; - skip -> - match(ets:next(Tab, K), Prefix, Words, RPrefix, Tab); - stop -> - false; - Matched -> - match_rest(Matched, Words, RPrefix, Tab) - end. - -match_rest([W1 | [W2 | _] = SLast], [W1 | [W2 | _] = Rest], RPrefix, Tab) -> - % NOTE - % Fast-forward through identical words in the topic and the last key suffixes. - % This should save us a few redundant `ets:next` calls at the cost of slightly - % more complex match patterns. - match_rest(SLast, Rest, [W1 | RPrefix], Tab); -match_rest(SLast, [W | Rest], RPrefix, Tab) when is_list(SLast) -> - match(Rest, [W | RPrefix], Tab); -match_rest(plus, [W | Rest], RPrefix, Tab) -> - % NOTE - % There's '+' in the key suffix, meaning we should consider 2 alternatives: - % 1. Match the rest of the topic as if there was '+' in the current position. - % 2. Skip this key and try to match the topic as it is. - case match(Rest, ['+' | RPrefix], Tab) of - Match = {_, _} -> - Match; - false -> - match(Rest, [W | RPrefix], Tab) - end; -match_rest(_, [], _RPrefix, _Tab) -> - false. + emqx_trie_search:match(Topic, make_nextf(Tab)). %% @doc Match given topic against the index and return _all_ matches. %% If `unique` option is given, return only unique matches by record ID. --spec matches(emqx_types:topic(), ets:table(), _Opts :: [unique]) -> [match(_ID)]. 
matches(Topic, Tab, Opts) -> - {Words, RPrefix} = match_init(Topic), - AccIn = - case Opts of - [unique | _] -> #{}; - [] -> [] - end, - Matches = matches(Words, RPrefix, AccIn, Tab), - case Matches of - #{} -> maps:values(Matches); - _ -> Matches - end. - -matches(Words, RPrefix, Acc, Tab) -> - Prefix = lists:reverse(RPrefix), - matches(ets:next(Tab, {Prefix, {}}), Prefix, Words, RPrefix, Acc, Tab). - -matches(Words, RPrefix, K = {Filter, _}, Acc, Tab) -> - Prefix = lists:reverse(RPrefix), - case Prefix > Filter of - true -> - % NOTE: Prefix already greater than the last key seen, need to `ets:next/2`. - matches(ets:next(Tab, {Prefix, {}}), Prefix, Words, RPrefix, Acc, Tab); - false -> - % NOTE: Prefix is still less than or equal to the last key seen, reuse it. - matches(K, Prefix, Words, RPrefix, Acc, Tab) - end. - -matches(K, Prefix, Words, RPrefix, Acc, Tab) -> - case match_next(Prefix, K, Words) of - true -> - matches(ets:next(Tab, K), Prefix, Words, RPrefix, match_add(K, Acc), Tab); - skip -> - matches(ets:next(Tab, K), Prefix, Words, RPrefix, Acc, Tab); - stop -> - Acc; - Matched -> - % NOTE: Prserve next key on the stack to save on `ets:next/2` calls. - matches_rest(Matched, Words, RPrefix, K, Acc, Tab) - end. - -matches_rest([W1 | [W2 | _] = SLast], [W1 | [W2 | _] = Rest], RPrefix, K, Acc, Tab) -> - % NOTE - % Fast-forward through identical words in the topic and the last key suffixes. - % This should save us a few redundant `ets:next` calls at the cost of slightly - % more complex match patterns. - matches_rest(SLast, Rest, [W1 | RPrefix], K, Acc, Tab); -matches_rest(SLast, [W | Rest], RPrefix, K, Acc, Tab) when is_list(SLast) -> - matches(Rest, [W | RPrefix], K, Acc, Tab); -matches_rest(plus, [W | Rest], RPrefix, K, Acc, Tab) -> - % NOTE - % There's '+' in the key suffix, meaning we should accumulate all matches from - % each of 2 branches: - % 1. Match the rest of the topic as if there was '+' in the current position. - % 2. Skip this key and try to match the topic as it is. - NAcc = matches(Rest, ['+' | RPrefix], K, Acc, Tab), - matches(Rest, [W | RPrefix], K, NAcc, Tab); -matches_rest(_, [], _RPrefix, _K, Acc, _Tab) -> - Acc. - -match_add(K = {_Filter, ID}, Acc = #{}) -> - % NOTE: ensuring uniqueness by record ID - Acc#{ID => K}; -match_add(K, Acc) -> - [K | Acc]. - -match_next(Prefix, {Filter, _ID}, Suffix) -> - match_filter(Prefix, Filter, Suffix); -match_next(_, '$end_of_table', _) -> - stop. - -match_filter([], [], []) -> - % NOTE: we matched the topic exactly - true; -match_filter([], [], _Suffix) -> - % NOTE: we matched the prefix, but there may be more matches next - skip; -match_filter([], ['#'], _Suffix) -> - % NOTE: naturally, '#' < '+', so this is already optimal for `match/2` - true; -match_filter([], ['+' | _], _Suffix) -> - plus; -match_filter([], [_H | _] = Rest, _Suffix) -> - Rest; -match_filter([H | T1], [H | T2], Suffix) -> - match_filter(T1, T2, Suffix); -match_filter([H1 | _], [H2 | _], _Suffix) when H2 > H1 -> - % NOTE: we're strictly past the prefix, no need to continue - stop. - -match_init(Topic) -> - case words(Topic) of - [W = <<"$", _/bytes>> | Rest] -> - % NOTE - % This will effectively skip attempts to match special topics to `#` or `+/...`. - {Rest, [W]}; - Words -> - {Words, []} - end. + emqx_trie_search:matches(Topic, make_nextf(Tab), Opts). %% @doc Extract record ID from the match. -spec get_id(match(ID)) -> ID. -get_id({_Filter, {ID}}) -> - ID. +get_id(Key) -> + emqx_trie_search:get_id(Key). 
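%% A minimal usage sketch of the API this module keeps after the refactoring
%% (the topic, filter and ID below are made-up examples):
%%
%%   Tab = emqx_topic_index:new(),
%%   true = emqx_topic_index:insert(<<"sensor/+/#">>, id1, <<>>, Tab),
%%   Key = emqx_topic_index:match(<<"sensor/1/temp">>, Tab),
%%   id1 = emqx_topic_index:get_id(Key),
%%   <<"sensor/+/#">> = emqx_topic_index:get_topic(Key).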
%% @doc Extract topic (or topic filter) from the match. -spec get_topic(match(_ID)) -> emqx_types:topic(). -get_topic({Filter, _ID}) -> - emqx_topic:join(Filter). +get_topic(Key) -> + emqx_trie_search:get_topic(Key). %% @doc Fetch the record associated with the match. %% NOTE: Only really useful for ETS tables where the record ID is the first element. @@ -226,17 +80,8 @@ get_topic({Filter, _ID}) -> get_record(K, Tab) -> ets:lookup_element(Tab, K, 2). -%% +key(TopicOrFilter, ID) -> + emqx_trie_search:make_key(TopicOrFilter, ID). --spec words(emqx_types:topic()) -> [word()]. -words(Topic) when is_binary(Topic) -> - % NOTE - % This is almost identical to `emqx_topic:words/1`, but it doesn't convert empty - % tokens to ''. This is needed to keep ordering of words consistent with what - % `match_filter/3` expects. - [word(W) || W <- emqx_topic:tokens(Topic)]. - --spec word(binary()) -> word(). -word(<<"+">>) -> '+'; -word(<<"#">>) -> '#'; -word(Bin) -> Bin. +make_nextf(Tab) -> + fun(Key) -> ets:next(Tab, Key) end. diff --git a/apps/emqx/src/emqx_trie_search.erl b/apps/emqx/src/emqx_trie_search.erl new file mode 100644 index 000000000..8d9463c26 --- /dev/null +++ b/apps/emqx/src/emqx_trie_search.erl @@ -0,0 +1,344 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +%% @doc Topic index for matching topics to topic filters. +%% +%% Works on top of ETS ordered_set table. Keys are tuples constructed from +%% parsed topic filters and record IDs, wrapped in a tuple to order them +%% strictly greater than unit tuple (`{}`). Existing table may be used if +%% existing keys will not collide with index keys. +%% +%% Designed to effectively answer questions like: +%% 1. Does any topic filter match given topic? +%% 2. Which records are associated with topic filters matching given topic? +%% 3. Which topic filters match given topic? +%% 4. Which record IDs are associated with topic filters matching given topic? +%% +%% Trie-search algorithm: +%% +%% Given a 3-level topic (e.g. a/b/c), if we leave out '#' for now, +%% all possible subscriptions of a/b/c can be enumerated as below: +%% +%% a/b/c +%% a/b/+ +%% a/+/c <--- subscribed +%% a/+/+ +%% +/b/c <--- subscribed +%% +/b/+ +%% +/+/c +%% +/+/+ <--- start searching upward from here +%% +%% Let's name this search space "Space1". +%% If we brute-force it, the scope would be 8 (2^3). +%% Meaning this has O(2^N) complexity (N being the level of topics). +%% +%% This clearly isn't going to work. +%% Should we then try to enumerate all subscribers instead? +%% If there are also other subscriptions, e.g. "+/x/y" and "+/b/0" +%% +%% a/+/c <--- match of a/b/c +%% +/x/n +%% ... +%% +/x/2 +%% +/x/1 +%% +/b/c <--- match of a/b/c +%% +/b/1 +%% +/b/0 +%% +%% Let's name it "Space2". 
+%% +%% This has O(M * L) complexity (M being the total number of subscriptions, +%% and L being the number of topic levels). +%% This is usually a lot smaller than "Space1", but still not very effective +%% if the collection size is e.g. 1 million. +%% +%% To make it more effective, we'll need to combine the two algorithms: +%% Use the ordered subscription topics' prefixes as starting points to make +%% guesses about whether or not the next word can be a '+', and skip-over +%% to the next possible match. +%% +%% NOTE: A prerequisite of the ordered collection is, it should be able +%% to find the *immediate-next* topic/filter with a given prefix. +%% +%% In the above example, we start from "+/b/0". When comparing "+/b/0" +%% with "a/b/c", we know the matching prefix is "+/b", meaning we can +%% start guessing if the next word is '+' or 'c': +%% * It can't be '+' because '+' < '0' +%% * It might be 'c' because 'c' > '0' +%% +%% So, we try to jump to the next topic which has a prefix of "+/b/c" +%% (this effectively means skipping over "+/b/1"). +%% +%% After "+/b/c" is found to be a matching filter, we move up: +%% * The next possible match is "a/+/+" according to Space1 +%% * The next subscription is "+/x/1" according to Space2 +%% +%% "a/+/+" is lexicographically greater than "+/x/+", so let's jump to +%% the immediate-next of 'a/+/+', which is "a/+/c", allowing us to skip +%% over all the ones starting with "+/x". +%% +%% If we take '#' into consideration, it's only one extra comparison to see +%% if a filter ends with '#'. +%% +%% In summary, the complexity of this algorithm is O(N * L) +%% N being the number of total matches, and L being the level of the topic. + +-module(emqx_trie_search). + +-export([ceiling/0, make_key/2]). +-export([match/2, matches/3, get_id/1, get_topic/1]). +-export_type([key/1, word/0, nextf/0, opts/0]). + +-type word() :: binary() | '+' | '#'. +-type base_key() :: {binary() | [word()], {}}. +-type key(ID) :: {binary() | [word()], {ID}}. +-type nextf() :: fun((key(_) | base_key()) -> key(_)). +-type opts() :: [unique | return_first]. + +%% Holds the constant values of each search. +-record(ctx, { + %% A function which can quickly find the immediate-next record of the given prefix + nextf :: nextf(), + %% The initial prefix to start searching from + %% if the input topic starts with a dollar-word, it's the first word like [<<"$SYS">>] + %% otherwise it's a [] + prefix0 :: [word()], + %% The initial words of a topic + words0 :: [word()], + %% Return as soon as there is one match found + return_first :: boolean() +}). + +%% Holds the variable parts of each search. +-record(acc, { + %% The current searching target topic/filter + target, + %% The number of moves. + %% This is used to check if the target has been moved + %% after attempting to append '+' to the searching prefix + moves = 0, + %% Search result accumulation + matches = [] +}). + +%% All valid utf8 bytes are less than 255. +-define(CEILING_TOPIC, <<255>>). +-define(CEILING, {?CEILING_TOPIC, {1}}). + +%% @doc Return a key which is greater than all other valid keys. +ceiling() -> + ?CEILING. + +%% @doc Make a search-key for the given topic. +-spec make_key(emqx_types:topic(), ID) -> key(ID). +make_key(Topic, ID) when is_binary(Topic) -> + Words = words(Topic), + Key = + case lists:any(fun erlang:is_atom/1, Words) of + true -> + %% it's a wildcard + {Words, {ID}}; + false -> + %% Not a wildcard. We do not split the topic + %% because they can be found with direct lookups. 
+ %% it is also more compact in memory. + {Topic, {ID}} + end, + Key > ceiling() andalso throw({invalid_topic, Topic}), + Key. + +%% @doc Extract record ID from the match. +-spec get_id(key(ID)) -> ID. +get_id({_Filter, {ID}}) -> + ID. + +%% @doc Extract topic (or topic filter) from the match. +-spec get_topic(key(_ID)) -> emqx_types:topic(). +get_topic({Filter, _ID}) when is_list(Filter) -> + emqx_topic:join(Filter); +get_topic({Topic, _ID}) -> + Topic. + +%% Make the base-key which can be used to locate the desired search target. +base(Prefix) -> + {Prefix, {}}. + +%% Move the search target to the key next to the given Base. +move_up(#ctx{nextf = NextF}, #acc{moves = Moves} = Acc, Base) -> + Acc#acc{target = NextF(Base), moves = Moves + 1}. + +%% The current target key is a match, add it to the accumulation. +add(C, #acc{target = Key} = Acc) -> + add(C, Acc, Key). + +%% Add the given key to the accumulation. +add(#ctx{return_first = true}, _Acc, Key) -> + throw({return_first, Key}); +add(_C, #acc{matches = Matches} = Acc, Key) -> + Acc#acc{matches = match_add(Key, Matches)}. + +%% @doc Match given topic against the index and return the first match, or `false` if +%% no match is found. +-spec match(emqx_types:topic(), nextf()) -> false | key(_). +match(Topic, NextF) -> + try search(Topic, NextF, [return_first]) of + [] -> + false + catch + throw:{return_first, Res} -> + Res + end. + +%% @doc Match given topic against the index and return _all_ matches. +%% If `unique` option is given, return only unique matches by record ID. +-spec matches(emqx_types:topic(), nextf(), opts()) -> [key(_)]. +matches(Topic, NextF, Opts) -> + search(Topic, NextF, Opts). + +%% @doc Entrypoint of the search for a given topic. +search(Topic, NextF, Opts) -> + {Words, Prefix} = match_init(Topic), + Context = #ctx{ + nextf = NextF, + prefix0 = Prefix, + words0 = Words, + return_first = proplists:get_bool(return_first, Opts) + }, + Matches0 = + case proplists:get_bool(unique, Opts) of + true -> + #{}; + false -> + [] + end, + Acc = search_new(Context, base(Prefix), #acc{matches = Matches0}), + #acc{matches = Matches} = match_non_wildcards(Context, base(Topic), Acc), + case is_map(Matches) of + true -> + maps:values(Matches); + false -> + Matches + end. + +%% The recursive entrypoint of the trie-search algorithm. +%% Always start from the initial prefix and words. +search_new(#ctx{prefix0 = Prefix, words0 = Words0} = C, NewBase, Acc0) -> + #acc{target = {Filter, _}} = Acc = move_up(C, Acc0, NewBase), + case Prefix of + [] -> + %% This is not a '$' topic, start from '+' + search_plus(C, Words0, Filter, [], Acc); + [DollarWord] -> + %% Start from the '$' word + search_up(C, DollarWord, Words0, Filter, [], Acc) + end. + +%% Search to the bigger end of ordered collection of topics and topic-filters. +search_up(C, Word, Words, Filter, RPrefix, #acc{target = Base} = Acc) -> + case compare(Word, Filter, Words) of + {match, full} -> + search_new(C, Base, add(C, Acc)); + {match, prefix} -> + search_new(C, Base, Acc); + lower -> + Acc; + higher -> + NewBase = base(lists:reverse([Word | RPrefix])), + search_new(C, NewBase, Acc); + shorter -> + search_plus(C, Words, tl(Filter), [Word | RPrefix], Acc) + end. + +%% Try to use '+' as the next word in the prefix. +search_plus(C, [W, X | Words], [W, X | Filter], RPrefix, Acc) -> + %% Directly append the current word to the matching prefix (RPrefix). + %% Micro optimization: try not to call the next clause because + %% it is not a continuation. 
+ search_plus(C, [X | Words], [X | Filter], [W | RPrefix], Acc); +search_plus(C, [W | Words], Filter, RPrefix, Acc) -> + M = Acc#acc.moves, + case search_up(C, '+', Words, Filter, RPrefix, Acc) of + #acc{moves = M1} = Acc1 when M1 =:= M -> + %% Keep searching for one which has W as the next word + search_up(C, W, Words, Filter, RPrefix, Acc1); + Acc1 -> + %% Already searched + Acc1 + end. + +%% Compare prefix word then the next words in suffix against the search-target +%% topic or topic-filter. +compare(_, NotFilter, _) when is_binary(NotFilter) -> + lower; +compare(H, [H | Filter], Words) -> + compare(Filter, Words); +compare(_, ['#'], _Words) -> + {match, full}; +compare(H1, [H2 | _T2], _Words) when H1 < H2 -> + lower; +compare(_H, [_ | _], _Words) -> + higher. + +%% Now compare the filter suffix and the topic suffix. +compare([], []) -> + {match, full}; +compare([], _Words) -> + {match, prefix}; +compare(['#'], _Words) -> + {match, full}; +compare([_ | _], []) -> + lower; +compare([_ | _], _Words) -> + %% cannot know if it's a match, lower, or higher, + %% must search with a longer prefix. + shorter. + +match_add(K = {_Filter, ID}, Acc = #{}) -> + % NOTE: ensuring uniqueness by record ID + Acc#{ID => K}; +match_add(K, Acc) -> + [K | Acc]. + +match_init(Topic) -> + case words(Topic) of + [W = <<"$", _/bytes>> | Rest] -> + % NOTE + % This will effectively skip attempts to match special topics to `#` or `+/...`. + {Rest, [W]}; + Words -> + {Words, []} + end. + +-spec words(emqx_types:topic()) -> [word()]. +words(Topic) when is_binary(Topic) -> + % NOTE + % This is almost identical to `emqx_topic:words/1`, but it doesn't convert empty + % tokens to ''. This is needed to keep ordering of words consistent with what + % `match_filter/3` expects. + [word(W) || W <- emqx_topic:tokens(Topic)]. + +-spec word(binary()) -> word(). +word(<<"+">>) -> '+'; +word(<<"#">>) -> '#'; +word(Bin) -> Bin. + +match_non_wildcards(#ctx{nextf = NextF} = C, {Topic, _} = Base, Acc) -> + case NextF(Base) of + {Topic, _ID} = Key -> + match_non_wildcards(C, Key, add(C, Acc, Key)); + _Other -> + Acc + end. diff --git a/apps/emqx/test/emqx_topic_index_SUITE.erl b/apps/emqx/test/emqx_topic_index_SUITE.erl index 9058f3597..dd1fc07f3 100644 --- a/apps/emqx/test/emqx_topic_index_SUITE.erl +++ b/apps/emqx/test/emqx_topic_index_SUITE.erl @@ -25,42 +25,71 @@ -import(emqx_proper_types, [scaled/2]). all() -> - emqx_common_test_helpers:all(?MODULE). + [ + {group, ets}, + {group, gb_tree} + ]. -t_insert(_) -> - Tab = emqx_topic_index:new(), - true = emqx_topic_index:insert(<<"sensor/1/metric/2">>, t_insert_1, <<>>, Tab), - true = emqx_topic_index:insert(<<"sensor/+/#">>, t_insert_2, <<>>, Tab), - true = emqx_topic_index:insert(<<"sensor/#">>, t_insert_3, <<>>, Tab), - ?assertEqual(<<"sensor/#">>, topic(match(<<"sensor">>, Tab))), - ?assertEqual(t_insert_3, id(match(<<"sensor">>, Tab))). +groups() -> + All = emqx_common_test_helpers:all(?MODULE), + [ + {ets, All}, + {gb_tree, All} + ]. -t_match(_) -> - Tab = emqx_topic_index:new(), - true = emqx_topic_index:insert(<<"sensor/1/metric/2">>, t_match_1, <<>>, Tab), - true = emqx_topic_index:insert(<<"sensor/+/#">>, t_match_2, <<>>, Tab), - true = emqx_topic_index:insert(<<"sensor/#">>, t_match_3, <<>>, Tab), +init_per_group(ets, Config) -> + [{index_module, emqx_topic_index} | Config]; +init_per_group(gb_tree, Config) -> + [{index_module, emqx_topic_gbt} | Config]. + +end_per_group(_Group, _Config) -> + ok. + +get_module(Config) -> + proplists:get_value(index_module, Config). 
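%% For illustration (IDs made up): the keys both backends store are the ones
%% produced by emqx_trie_search:make_key/2, i.e. wildcard filters split into
%% words, plain topics kept as whole binaries:
%%
%%   {[<<"sensor">>, '+', '#'], {id1}}  %% from filter <<"sensor/+/#">>
%%   {<<"sensor/1/temp">>, {id2}}       %% from the plain topic <<"sensor/1/temp">>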
+ +t_insert(Config) -> + M = get_module(Config), + Tab = M:new(), + true = M:insert(<<"sensor/1/metric/2">>, t_insert_1, <<>>, Tab), + true = M:insert(<<"sensor/+/#">>, t_insert_2, <<>>, Tab), + true = M:insert(<<"sensor/#">>, t_insert_3, <<>>, Tab), + ?assertEqual(<<"sensor/#">>, topic(match(M, <<"sensor">>, Tab))), + ?assertEqual(t_insert_3, id(match(M, <<"sensor">>, Tab))). + +t_match(Config) -> + M = get_module(Config), + Tab = M:new(), + true = M:insert(<<"sensor/1/metric/2">>, t_match_1, <<>>, Tab), + true = M:insert(<<"sensor/+/#">>, t_match_2, <<>>, Tab), + true = M:insert(<<"sensor/#">>, t_match_3, <<>>, Tab), ?assertMatch( [<<"sensor/#">>, <<"sensor/+/#">>], - [topic(M) || M <- matches(<<"sensor/1">>, Tab)] + [topic(X) || X <- matches(M, <<"sensor/1">>, Tab)] ). -t_match2(_) -> - Tab = emqx_topic_index:new(), - true = emqx_topic_index:insert(<<"#">>, t_match2_1, <<>>, Tab), - true = emqx_topic_index:insert(<<"+/#">>, t_match2_2, <<>>, Tab), - true = emqx_topic_index:insert(<<"+/+/#">>, t_match2_3, <<>>, Tab), +t_match2(Config) -> + M = get_module(Config), + Tab = M:new(), + true = M:insert(<<"#">>, t_match2_1, <<>>, Tab), + true = M:insert(<<"+/#">>, t_match2_2, <<>>, Tab), + true = M:insert(<<"+/+/#">>, t_match2_3, <<>>, Tab), ?assertEqual( [<<"#">>, <<"+/#">>, <<"+/+/#">>], - [topic(M) || M <- matches(<<"a/b/c">>, Tab)] + [topic(X) || X <- matches(M, <<"a/b/c">>, Tab)] ), ?assertEqual( false, - emqx_topic_index:match(<<"$SYS/broker/zenmq">>, Tab) + M:match(<<"$SYS/broker/zenmq">>, Tab) + ), + ?assertEqual( + [], + matches(M, <<"$SYS/broker/zenmq">>, Tab) ). -t_match3(_) -> - Tab = emqx_topic_index:new(), +t_match3(Config) -> + M = get_module(Config), + Tab = M:new(), Records = [ {<<"d/#">>, t_match3_1}, {<<"a/b/+">>, t_match3_2}, @@ -69,37 +98,39 @@ t_match3(_) -> {<<"$SYS/#">>, t_match3_sys} ], lists:foreach( - fun({Topic, ID}) -> emqx_topic_index:insert(Topic, ID, <<>>, Tab) end, + fun({Topic, ID}) -> M:insert(Topic, ID, <<>>, Tab) end, Records ), - Matched = matches(<<"a/b/c">>, Tab), + Matched = matches(M, <<"a/b/c">>, Tab), case length(Matched) of 3 -> ok; _ -> error({unexpected, Matched}) end, ?assertEqual( t_match3_sys, - id(match(<<"$SYS/a/b/c">>, Tab)) + id(match(M, <<"$SYS/a/b/c">>, Tab)) ). -t_match4(_) -> - Tab = emqx_topic_index:new(), +t_match4(Config) -> + M = get_module(Config), + Tab = M:new(), Records = [{<<"/#">>, t_match4_1}, {<<"/+">>, t_match4_2}, {<<"/+/a/b/c">>, t_match4_3}], lists:foreach( - fun({Topic, ID}) -> emqx_topic_index:insert(Topic, ID, <<>>, Tab) end, + fun({Topic, ID}) -> M:insert(Topic, ID, <<>>, Tab) end, Records ), ?assertEqual( [<<"/#">>, <<"/+">>], - [topic(M) || M <- matches(<<"/">>, Tab)] + [topic(X) || X <- matches(M, <<"/">>, Tab)] ), ?assertEqual( [<<"/#">>, <<"/+/a/b/c">>], - [topic(M) || M <- matches(<<"/0/a/b/c">>, Tab)] + [topic(X) || X <- matches(M, <<"/0/a/b/c">>, Tab)] ). -t_match5(_) -> - Tab = emqx_topic_index:new(), +t_match5(Config) -> + M = get_module(Config), + Tab = M:new(), T = <<"a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z">>, Records = [ {<<"#">>, t_match5_1}, @@ -107,58 +138,63 @@ t_match5(_) -> {<>, t_match5_3} ], lists:foreach( - fun({Topic, ID}) -> emqx_topic_index:insert(Topic, ID, <<>>, Tab) end, + fun({Topic, ID}) -> M:insert(Topic, ID, <<>>, Tab) end, Records ), ?assertEqual( [<<"#">>, <>], - [topic(M) || M <- matches(T, Tab)] + [topic(X) || X <- matches(M, T, Tab)] ), ?assertEqual( [<<"#">>, <>, <>], - [topic(M) || M <- matches(<>, Tab)] + [topic(X) || X <- matches(M, <>, Tab)] ). 
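%% The deep-topic cases around here exercise the O(N * L) bound from the
%% module doc with 26-level topics; a comparable all-plus filter can be built
%% ad hoc (sketch) as:
%%   AllPlus = iolist_to_binary(lists:join("/", lists:duplicate(26, "+") ++ ["#"])).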
-t_match6(_) -> - Tab = emqx_topic_index:new(), +t_match6(Config) -> + M = get_module(Config), + Tab = M:new(), T = <<"a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z">>, W = <<"+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/+/#">>, - emqx_topic_index:insert(W, ID = t_match6, <<>>, Tab), - ?assertEqual(ID, id(match(T, Tab))). + M:insert(W, ID = t_match6, <<>>, Tab), + ?assertEqual(ID, id(match(M, T, Tab))). -t_match7(_) -> - Tab = emqx_topic_index:new(), +t_match7(Config) -> + M = get_module(Config), + Tab = M:new(), T = <<"a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z">>, W = <<"a/+/c/+/e/+/g/+/i/+/k/+/m/+/o/+/q/+/s/+/u/+/w/+/y/+/#">>, - emqx_topic_index:insert(W, t_match7, <<>>, Tab), - ?assertEqual(W, topic(match(T, Tab))). + M:insert(W, t_match7, <<>>, Tab), + ?assertEqual(W, topic(match(M, T, Tab))). -t_match_fast_forward(_) -> - Tab = emqx_topic_index:new(), - emqx_topic_index:insert(<<"a/b/1/2/3/4/5/6/7/8/9/#">>, id1, <<>>, Tab), - emqx_topic_index:insert(<<"z/y/x/+/+">>, id2, <<>>, Tab), - emqx_topic_index:insert(<<"a/b/c/+">>, id3, <<>>, Tab), +t_match_fast_forward(Config) -> + M = get_module(Config), + Tab = M:new(), + M:insert(<<"a/b/1/2/3/4/5/6/7/8/9/#">>, id1, <<>>, Tab), + M:insert(<<"z/y/x/+/+">>, id2, <<>>, Tab), + M:insert(<<"a/b/c/+">>, id3, <<>>, Tab), % dbg:tracer(), % dbg:p(all, c), % dbg:tpl({ets, next, '_'}, x), - ?assertEqual(id1, id(match(<<"a/b/1/2/3/4/5/6/7/8/9/0">>, Tab))), - ?assertEqual([id1], [id(M) || M <- matches(<<"a/b/1/2/3/4/5/6/7/8/9/0">>, Tab)]). + ?assertEqual(id1, id(match(M, <<"a/b/1/2/3/4/5/6/7/8/9/0">>, Tab))), + ?assertEqual([id1], [id(X) || X <- matches(M, <<"a/b/1/2/3/4/5/6/7/8/9/0">>, Tab)]). -t_match_unique(_) -> - Tab = emqx_topic_index:new(), - emqx_topic_index:insert(<<"a/b/c">>, t_match_id1, <<>>, Tab), - emqx_topic_index:insert(<<"a/b/+">>, t_match_id1, <<>>, Tab), - emqx_topic_index:insert(<<"a/b/c/+">>, t_match_id2, <<>>, Tab), +t_match_unique(Config) -> + M = get_module(Config), + Tab = M:new(), + M:insert(<<"a/b/c">>, t_match_id1, <<>>, Tab), + M:insert(<<"a/b/+">>, t_match_id1, <<>>, Tab), + M:insert(<<"a/b/c/+">>, t_match_id2, <<>>, Tab), ?assertEqual( [t_match_id1, t_match_id1], - [id(M) || M <- emqx_topic_index:matches(<<"a/b/c">>, Tab, [])] + [id(X) || X <- matches(M, <<"a/b/c">>, Tab, [])] ), ?assertEqual( [t_match_id1], - [id(M) || M <- emqx_topic_index:matches(<<"a/b/c">>, Tab, [unique])] + [id(X) || X <- matches(M, <<"a/b/c">>, Tab, [unique])] ). -t_match_wildcard_edge_cases(_) -> +t_match_wildcard_edge_cases(Config) -> + M = get_module(Config), CommonTopics = [ <<"a/b">>, <<"a/b/#">>, @@ -179,32 +215,33 @@ t_match_wildcard_edge_cases(_) -> {[<<"/">>, <<"+">>], <<"a">>, [2]} ], F = fun({Topics, TopicName, Expected}) -> - Tab = emqx_topic_index:new(), - _ = [emqx_topic_index:insert(T, N, <<>>, Tab) || {N, T} <- lists:enumerate(Topics)], + Tab = M:new(), + _ = [M:insert(T, N, <<>>, Tab) || {N, T} <- lists:enumerate(Topics)], ?assertEqual( lists:last(Expected), - id(emqx_topic_index:match(TopicName, Tab)), + id(M:match(TopicName, Tab)), #{"Base topics" => Topics, "Topic name" => TopicName} ), ?assertEqual( Expected, - [id(M) || M <- emqx_topic_index:matches(TopicName, Tab, [unique])], + [id(X) || X <- matches(M, TopicName, Tab, [unique])], #{"Base topics" => Topics, "Topic name" => TopicName} ) end, lists:foreach(F, Datasets). 
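%% The property below cross-checks the index against a brute-force reference
%% matcher; conceptually (sketch):
%%
%%   Reference = fun(Topic, Filters) ->
%%       [N || {N, F} <- Filters, emqx_topic:match(Topic, F)]
%%   end.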
-t_prop_matches(_) -> +t_prop_matches(Config) -> + M = get_module(Config), ?assert( proper:quickcheck( - topic_matches_prop(), + topic_matches_prop(M), [{max_size, 100}, {numtests, 100}] ) ), Statistics = [{C, account(C)} || C <- [filters, topics, matches, maxhits]], ct:pal("Statistics: ~p", [maps:from_list(Statistics)]). -topic_matches_prop() -> +topic_matches_prop(M) -> ?FORALL( % Generate a longer list of topics and a shorter list of topic filter patterns. #{ @@ -219,12 +256,12 @@ topic_matches_prop() -> patterns => list(topic_filter_pattern_t()) }), begin - Tab = emqx_topic_index:new(), + Tab = M:new(), Topics = [emqx_topic:join(T) || T <- TTopics], % Produce topic filters from generated topics and patterns. % Number of filters is equal to the number of patterns, most of the time. Filters = lists:enumerate(mk_filters(Pats, TTopics)), - _ = [emqx_topic_index:insert(F, N, <<>>, Tab) || {N, F} <- Filters], + _ = [M:insert(F, N, <<>>, Tab) || {N, F} <- Filters], % Gather some basic statistics _ = account(filters, length(Filters)), _ = account(topics, NTopics = length(Topics)), @@ -233,7 +270,7 @@ topic_matches_prop() -> % matching it against the list of filters one by one. lists:all( fun(Topic) -> - Ids1 = [id(M) || M <- emqx_topic_index:matches(Topic, Tab, [unique])], + Ids1 = [id(X) || X <- matches(M, Topic, Tab, [unique])], Ids2 = lists:filtermap( fun({N, F}) -> case emqx_topic:match(Topic, F) of @@ -277,17 +314,20 @@ account(Counter) -> %% -match(T, Tab) -> - emqx_topic_index:match(T, Tab). +match(M, T, Tab) -> + M:match(T, Tab). -matches(T, Tab) -> - lists:sort(emqx_topic_index:matches(T, Tab, [])). +matches(M, T, Tab) -> + lists:sort(M:matches(T, Tab, [])). + +matches(M, T, Tab, Opts) -> + M:matches(T, Tab, Opts). id(Match) -> - emqx_topic_index:get_id(Match). + emqx_trie_search:get_id(Match). topic(Match) -> - emqx_topic_index:get_topic(Match). + emqx_trie_search:get_topic(Match). %% From a1e66356147245a57ba1fc58d888927cbfdd3093 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Wed, 23 Aug 2023 10:12:46 +0200 Subject: [PATCH 04/12] refactor(topic_index): no forced ceiling entry in index table --- apps/emqx/src/emqx_topic_gbt.erl | 10 +++++-- apps/emqx/src/emqx_topic_index.erl | 4 +-- apps/emqx/src/emqx_trie_search.erl | 45 ++++++++++++------------------ 3 files changed, 26 insertions(+), 33 deletions(-) diff --git a/apps/emqx/src/emqx_topic_gbt.erl b/apps/emqx/src/emqx_topic_gbt.erl index da6320bc7..dade70cee 100644 --- a/apps/emqx/src/emqx_topic_gbt.erl +++ b/apps/emqx/src/emqx_topic_gbt.erl @@ -104,12 +104,16 @@ gbt_update(Name, Tree) -> true. gbt_next(nil, _Input) -> - emqx_trie_search:ceiling(); + '$end_of_table'; gbt_next({P, _V, _Smaller, Bigger}, K) when K >= P -> gbt_next(Bigger, K); gbt_next({P, _V, Smaller, _Bigger}, K) -> - NextKey = gbt_next(Smaller, K), - min(P, NextKey). + case gbt_next(Smaller, K) of + '$end_of_table' -> + P; + NextKey -> + NextKey + end. make_nextf(Name) -> {_SizeWeDontCare, TheTree} = gbt(Name), diff --git a/apps/emqx/src/emqx_topic_index.erl b/apps/emqx/src/emqx_topic_index.erl index e1da5b3b7..acbf36627 100644 --- a/apps/emqx/src/emqx_topic_index.erl +++ b/apps/emqx/src/emqx_topic_index.erl @@ -35,9 +35,7 @@ %% Usable mostly for testing purposes. -spec new() -> ets:table(). new() -> - T = ets:new(?MODULE, [public, ordered_set, {read_concurrency, true}]), - ets:insert(T, {emqx_trie_search:ceiling(), []}), - T. + ets:new(?MODULE, [public, ordered_set, {read_concurrency, true}]). 
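%% Illustrative consequence of dropping the ceiling entry: a fresh table holds
%% no keys at all, so iteration simply runs into the native end marker, e.g.
%%   Tab = emqx_topic_index:new(),
%%   '$end_of_table' = ets:next(Tab, {[], {}}).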
%% @doc Insert a new entry into the index that associates given topic filter to given %% record ID, and attaches arbitrary record to the entry. This allows users to choose diff --git a/apps/emqx/src/emqx_trie_search.erl b/apps/emqx/src/emqx_trie_search.erl index 8d9463c26..a25f3c9f4 100644 --- a/apps/emqx/src/emqx_trie_search.erl +++ b/apps/emqx/src/emqx_trie_search.erl @@ -98,14 +98,14 @@ -module(emqx_trie_search). --export([ceiling/0, make_key/2]). +-export([make_key/2]). -export([match/2, matches/3, get_id/1, get_topic/1]). -export_type([key/1, word/0, nextf/0, opts/0]). -type word() :: binary() | '+' | '#'. -type base_key() :: {binary() | [word()], {}}. -type key(ID) :: {binary() | [word()], {ID}}. --type nextf() :: fun((key(_) | base_key()) -> key(_)). +-type nextf() :: fun((key(_) | base_key()) -> '$end_of_table' | key(_)). -type opts() :: [unique | return_first]. %% Holds the constant values of each search. @@ -134,31 +134,20 @@ matches = [] }). -%% All valid utf8 bytes are less than 255. --define(CEILING_TOPIC, <<255>>). --define(CEILING, {?CEILING_TOPIC, {1}}). - -%% @doc Return a key which is greater than all other valid keys. -ceiling() -> - ?CEILING. - %% @doc Make a search-key for the given topic. -spec make_key(emqx_types:topic(), ID) -> key(ID). make_key(Topic, ID) when is_binary(Topic) -> Words = words(Topic), - Key = - case lists:any(fun erlang:is_atom/1, Words) of - true -> - %% it's a wildcard - {Words, {ID}}; - false -> - %% Not a wildcard. We do not split the topic - %% because they can be found with direct lookups. - %% it is also more compact in memory. - {Topic, {ID}} - end, - Key > ceiling() andalso throw({invalid_topic, Topic}), - Key. + case lists:any(fun erlang:is_atom/1, Words) of + true -> + %% it's a wildcard + {Words, {ID}}; + false -> + %% Not a wildcard. We do not split the topic + %% because they can be found with direct lookups. + %% it is also more compact in memory. + {Topic, {ID}} + end. %% @doc Extract record ID from the match. -spec get_id(key(ID)) -> ID. @@ -236,12 +225,14 @@ search(Topic, NextF, Opts) -> %% The recursive entrypoint of the trie-search algorithm. %% Always start from the initial prefix and words. search_new(#ctx{prefix0 = Prefix, words0 = Words0} = C, NewBase, Acc0) -> - #acc{target = {Filter, _}} = Acc = move_up(C, Acc0, NewBase), - case Prefix of - [] -> + case move_up(C, Acc0, NewBase) of + #acc{target = '$end_of_table'} = Acc -> + Acc; + #acc{target = {Filter, _}} = Acc when Prefix =:= [] -> %% This is not a '$' topic, start from '+' search_plus(C, Words0, Filter, [], Acc); - [DollarWord] -> + #acc{target = {Filter, _}} = Acc -> + [DollarWord] = Prefix, %% Start from the '$' word search_up(C, DollarWord, Words0, Filter, [], Acc) end. From a30d87e14f3ab6ce7635159c47df1275861d9598 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Thu, 24 Aug 2023 11:51:34 +0200 Subject: [PATCH 05/12] refactor(topic_index): remove more unnecessary next calls also avoid using records (setelement) for recursive return values --- apps/emqx/src/emqx_trie_search.erl | 119 ++++++++++------------ apps/emqx/test/emqx_topic_index_SUITE.erl | 15 +++ 2 files changed, 71 insertions(+), 63 deletions(-) diff --git a/apps/emqx/src/emqx_trie_search.erl b/apps/emqx/src/emqx_trie_search.erl index a25f3c9f4..36ccd656b 100644 --- a/apps/emqx/src/emqx_trie_search.erl +++ b/apps/emqx/src/emqx_trie_search.erl @@ -102,12 +102,15 @@ -export([match/2, matches/3, get_id/1, get_topic/1]). -export_type([key/1, word/0, nextf/0, opts/0]). +-define(END, '$end_of_table'). 
+ -type word() :: binary() | '+' | '#'. -type base_key() :: {binary() | [word()], {}}. -type key(ID) :: {binary() | [word()], {ID}}. --type nextf() :: fun((key(_) | base_key()) -> '$end_of_table' | key(_)). +-type nextf() :: fun((key(_) | base_key()) -> ?END | key(_)). -type opts() :: [unique | return_first]. + %% Holds the constant values of each search. -record(ctx, { %% A function which can quickly find the immediate-next record of the given prefix @@ -122,18 +125,6 @@ return_first :: boolean() }). -%% Holds the variable parts of each search. --record(acc, { - %% The current searching target topic/filter - target, - %% The number of moves. - %% This is used to check if the target has been moved - %% after attempting to append '+' to the searching prefix - moves = 0, - %% Search result accumulation - matches = [] -}). - %% @doc Make a search-key for the given topic. -spec make_key(emqx_types:topic(), ID) -> key(ID). make_key(Topic, ID) when is_binary(Topic) -> @@ -166,18 +157,14 @@ base(Prefix) -> {Prefix, {}}. %% Move the search target to the key next to the given Base. -move_up(#ctx{nextf = NextF}, #acc{moves = Moves} = Acc, Base) -> - Acc#acc{target = NextF(Base), moves = Moves + 1}. - -%% The current target key is a match, add it to the accumulation. -add(C, #acc{target = Key} = Acc) -> - add(C, Acc, Key). +move_up(#ctx{nextf = NextF}, Base) -> + NextF(Base). %% Add the given key to the accumulation. add(#ctx{return_first = true}, _Acc, Key) -> throw({return_first, Key}); -add(_C, #acc{matches = Matches} = Acc, Key) -> - Acc#acc{matches = match_add(Key, Matches)}. +add(_C, Acc, Key) -> + match_add(Key, Acc). %% @doc Match given topic against the index and return the first match, or `false` if %% no match is found. @@ -206,69 +193,69 @@ search(Topic, NextF, Opts) -> words0 = Words, return_first = proplists:get_bool(return_first, Opts) }, - Matches0 = + Acc0 = case proplists:get_bool(unique, Opts) of true -> #{}; false -> [] end, - Acc = search_new(Context, base(Prefix), #acc{matches = Matches0}), - #acc{matches = Matches} = match_non_wildcards(Context, base(Topic), Acc), - case is_map(Matches) of + {MaybeEnd, Acc1} = search_new(Context, base(Prefix), Acc0), + Acc = match_topics(Context, Topic, MaybeEnd, Acc1), + case is_map(Acc) of true -> - maps:values(Matches); + maps:values(Acc); false -> - Matches + Acc end. %% The recursive entrypoint of the trie-search algorithm. %% Always start from the initial prefix and words. -search_new(#ctx{prefix0 = Prefix, words0 = Words0} = C, NewBase, Acc0) -> - case move_up(C, Acc0, NewBase) of - #acc{target = '$end_of_table'} = Acc -> - Acc; - #acc{target = {Filter, _}} = Acc when Prefix =:= [] -> - %% This is not a '$' topic, start from '+' - search_plus(C, Words0, Filter, [], Acc); - #acc{target = {Filter, _}} = Acc -> - [DollarWord] = Prefix, - %% Start from the '$' word - search_up(C, DollarWord, Words0, Filter, [], Acc) - end. +search_new(C, NewBase, Acc) -> + search_moved(C, move_up(C, NewBase), Acc). + +search_moved(_, ?END, Acc) -> + {?END, Acc}; +search_moved(#ctx{prefix0 = [], words0 = Words0} = C, {Filter, _} = T, Acc) -> + %% This is not a '$' topic, start from '+' + search_plus(C, Words0, Filter, [], T, Acc); +search_moved(#ctx{prefix0 = Prefix, words0 = Words0} = C, {Filter, _} = T, Acc) -> + [DollarWord] = Prefix, + %% Start from the '$' word + search_up(C, DollarWord, Words0, Filter, [], T, Acc). %% Search to the bigger end of ordered collection of topics and topic-filters. 
-search_up(C, Word, Words, Filter, RPrefix, #acc{target = Base} = Acc) -> +search_up(C, Word, Words, Filter, RPrefix, T, Acc) -> case compare(Word, Filter, Words) of {match, full} -> - search_new(C, Base, add(C, Acc)); + search_new(C, T, add(C, Acc, T)); {match, prefix} -> - search_new(C, Base, Acc); + search_new(C, T, Acc); lower -> - Acc; + {T, Acc}; higher -> NewBase = base(lists:reverse([Word | RPrefix])), search_new(C, NewBase, Acc); shorter -> - search_plus(C, Words, tl(Filter), [Word | RPrefix], Acc) + search_plus(C, Words, tl(Filter), [Word | RPrefix], T, Acc) end. %% Try to use '+' as the next word in the prefix. -search_plus(C, [W, X | Words], [W, X | Filter], RPrefix, Acc) -> +search_plus(C, [W, X | Words], [W, X | Filter], RPrefix, T, Acc) -> %% Directly append the current word to the matching prefix (RPrefix). %% Micro optimization: try not to call the next clause because %% it is not a continuation. - search_plus(C, [X | Words], [X | Filter], [W | RPrefix], Acc); -search_plus(C, [W | Words], Filter, RPrefix, Acc) -> - M = Acc#acc.moves, - case search_up(C, '+', Words, Filter, RPrefix, Acc) of - #acc{moves = M1} = Acc1 when M1 =:= M -> - %% Keep searching for one which has W as the next word - search_up(C, W, Words, Filter, RPrefix, Acc1); - Acc1 -> - %% Already searched - Acc1 - end. + search_plus(C, [X | Words], [X | Filter], [W | RPrefix], T, Acc); +search_plus(C, [W | Words], ['+' | _] = Filter, RPrefix, T, Acc) -> + case search_up(C, '+', Words, Filter, RPrefix, T, Acc) of + {T, Acc} -> + search_up(C, W, Words, Filter, RPrefix, T, Acc); + TargetMoved -> + TargetMoved + end; +search_plus(C, [W | Words], Filter, RPrefix, T, Acc) -> + %% not a plus + search_up(C, W, Words, Filter, RPrefix, T, Acc). %% Compare prefix word then the next words in suffix against the search-target %% topic or topic-filter. @@ -326,10 +313,16 @@ word(<<"+">>) -> '+'; word(<<"#">>) -> '#'; word(Bin) -> Bin. -match_non_wildcards(#ctx{nextf = NextF} = C, {Topic, _} = Base, Acc) -> - case NextF(Base) of - {Topic, _ID} = Key -> - match_non_wildcards(C, Key, add(C, Acc, Key)); - _Other -> - Acc - end. +%% match non-wildcard topics +match_topics(#ctx{nextf = NextF} = C, Topic, {Topic, _} = Key, Acc) -> + %% found a topic match + match_topics(C, Topic, NextF(Key), add(C, Acc, Key)); +match_topics(#ctx{nextf = NextF} = C, Topic, {F, _}, Acc) when F < Topic -> + %% the last key is a filter, try jump to the topic + match_topics(C, Topic, NextF(base(Topic)), Acc); +match_topics(#ctx{nextf = NextF} = C, Topic, continue, Acc) -> + %% the last key is a '+/...' wildcard + match_topics(C, Topic, NextF(base(Topic)), Acc); +match_topics(_C, _Topic, _Key, Acc) -> + %% gone pass the topic + Acc. diff --git a/apps/emqx/test/emqx_topic_index_SUITE.erl b/apps/emqx/test/emqx_topic_index_SUITE.erl index dd1fc07f3..eb0ac0e40 100644 --- a/apps/emqx/test/emqx_topic_index_SUITE.erl +++ b/apps/emqx/test/emqx_topic_index_SUITE.erl @@ -166,6 +166,21 @@ t_match7(Config) -> M:insert(W, t_match7, <<>>, Tab), ?assertEqual(W, topic(match(M, T, Tab))). 
+t_match8(Config) -> + M = get_module(Config), + Tab = M:new(), + Filters = [<<"+">>, <<"dev/global/sensor">>, <<"dev/+/sensor/#">>], + IDs = [1,2,3], + Keys = [{F, ID} || F <- Filters, ID <- IDs], + lists:foreach(fun({F, ID}) -> + M:insert(F, ID, <<>>, Tab) + end, Keys), + Topic = <<"dev/global/sensor">>, + Matches = lists:sort(matches(M, Topic, Tab)), + ?assertEqual([<<"dev/+/sensor/#">>, <<"dev/+/sensor/#">>, <<"dev/+/sensor/#">>, + <<"dev/global/sensor">>, <<"dev/global/sensor">>, <<"dev/global/sensor">>], + [emqx_topic_index:get_topic(Match) || Match <- Matches]). + t_match_fast_forward(Config) -> M = get_module(Config), Tab = M:new(), From 62423b0b12deac9641152a5ff01236660fbf3746 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Thu, 24 Aug 2023 13:08:24 +0200 Subject: [PATCH 06/12] refactor(topic_index): less special handling for leading $ words --- apps/emqx/src/emqx_trie_search.erl | 92 ++++++++++------------- apps/emqx/test/emqx_topic_index_SUITE.erl | 25 ++++-- 2 files changed, 56 insertions(+), 61 deletions(-) diff --git a/apps/emqx/src/emqx_trie_search.erl b/apps/emqx/src/emqx_trie_search.erl index 36ccd656b..71bc8d8a0 100644 --- a/apps/emqx/src/emqx_trie_search.erl +++ b/apps/emqx/src/emqx_trie_search.erl @@ -16,10 +16,10 @@ %% @doc Topic index for matching topics to topic filters. %% -%% Works on top of ETS ordered_set table. Keys are tuples constructed from -%% parsed topic filters and record IDs, wrapped in a tuple to order them -%% strictly greater than unit tuple (`{}`). Existing table may be used if -%% existing keys will not collide with index keys. +%% Works on top of a ordered collection data set, such as ETS ordered_set table. +%% Keys are tuples constructed from parsed topic filters and record IDs, +%% wrapped in a tuple to order them strictly greater than unit tuple (`{}`). +%% Existing table may be used if existing keys will not collide with index keys. %% %% Designed to effectively answer questions like: %% 1. Does any topic filter match given topic? @@ -110,15 +110,10 @@ -type nextf() :: fun((key(_) | base_key()) -> ?END | key(_)). -type opts() :: [unique | return_first]. - %% Holds the constant values of each search. -record(ctx, { %% A function which can quickly find the immediate-next record of the given prefix nextf :: nextf(), - %% The initial prefix to start searching from - %% if the input topic starts with a dollar-word, it's the first word like [<<"$SYS">>] - %% otherwise it's a [] - prefix0 :: [word()], %% The initial words of a topic words0 :: [word()], %% Return as soon as there is one match found @@ -129,7 +124,7 @@ -spec make_key(emqx_types:topic(), ID) -> key(ID). make_key(Topic, ID) when is_binary(Topic) -> Words = words(Topic), - case lists:any(fun erlang:is_atom/1, Words) of + case emqx_topic:wildcard(Words) of true -> %% it's a wildcard {Words, {ID}}; @@ -186,10 +181,9 @@ matches(Topic, NextF, Opts) -> %% @doc Entrypoint of the search for a given topic. 
search(Topic, NextF, Opts) -> - {Words, Prefix} = match_init(Topic), + Words = words(Topic), Context = #ctx{ nextf = NextF, - prefix0 = Prefix, words0 = Words, return_first = proplists:get_bool(return_first, Opts) }, @@ -200,7 +194,15 @@ search(Topic, NextF, Opts) -> false -> [] end, - {MaybeEnd, Acc1} = search_new(Context, base(Prefix), Acc0), + Base = + case hd(Words) of + <<$$, _/binary>> -> + %% skip all filters starts with # or + + base([hd(Words)]); + _ -> + base([]) + end, + {MaybeEnd, Acc1} = search_new(Context, Base, Acc0), Acc = match_topics(Context, Topic, MaybeEnd, Acc1), case is_map(Acc) of true -> @@ -211,18 +213,30 @@ search(Topic, NextF, Opts) -> %% The recursive entrypoint of the trie-search algorithm. %% Always start from the initial prefix and words. -search_new(C, NewBase, Acc) -> - search_moved(C, move_up(C, NewBase), Acc). +search_new(#ctx{words0 = Words} = C, NewBase, Acc) -> + case move_up(C, NewBase) of + ?END -> + {?END, Acc}; + {Filter, _} = T -> + search_plus(C, Words, Filter, [], T, Acc) + end. -search_moved(_, ?END, Acc) -> - {?END, Acc}; -search_moved(#ctx{prefix0 = [], words0 = Words0} = C, {Filter, _} = T, Acc) -> - %% This is not a '$' topic, start from '+' - search_plus(C, Words0, Filter, [], T, Acc); -search_moved(#ctx{prefix0 = Prefix, words0 = Words0} = C, {Filter, _} = T, Acc) -> - [DollarWord] = Prefix, - %% Start from the '$' word - search_up(C, DollarWord, Words0, Filter, [], T, Acc). +%% Try to use '+' as the next word in the prefix. +search_plus(C, [W, X | Words], [W, X | Filter], RPrefix, T, Acc) -> + %% Directly append the current word to the matching prefix (RPrefix). + %% Micro optimization: try not to call the next clause because + %% it is not a continuation. + search_plus(C, [X | Words], [X | Filter], [W | RPrefix], T, Acc); +search_plus(C, [W | Words], ['+' | _] = Filter, RPrefix, T, Acc) -> + case search_up(C, '+', Words, Filter, RPrefix, T, Acc) of + {T, Acc1} -> + search_up(C, W, Words, Filter, RPrefix, T, Acc1); + TargetMoved -> + TargetMoved + end; +search_plus(C, [W | Words], Filter, RPrefix, T, Acc) -> + %% not a plus + search_up(C, W, Words, Filter, RPrefix, T, Acc). %% Search to the bigger end of ordered collection of topics and topic-filters. search_up(C, Word, Words, Filter, RPrefix, T, Acc) -> @@ -240,23 +254,6 @@ search_up(C, Word, Words, Filter, RPrefix, T, Acc) -> search_plus(C, Words, tl(Filter), [Word | RPrefix], T, Acc) end. -%% Try to use '+' as the next word in the prefix. -search_plus(C, [W, X | Words], [W, X | Filter], RPrefix, T, Acc) -> - %% Directly append the current word to the matching prefix (RPrefix). - %% Micro optimization: try not to call the next clause because - %% it is not a continuation. - search_plus(C, [X | Words], [X | Filter], [W | RPrefix], T, Acc); -search_plus(C, [W | Words], ['+' | _] = Filter, RPrefix, T, Acc) -> - case search_up(C, '+', Words, Filter, RPrefix, T, Acc) of - {T, Acc} -> - search_up(C, W, Words, Filter, RPrefix, T, Acc); - TargetMoved -> - TargetMoved - end; -search_plus(C, [W | Words], Filter, RPrefix, T, Acc) -> - %% not a plus - search_up(C, W, Words, Filter, RPrefix, T, Acc). - %% Compare prefix word then the next words in suffix against the search-target %% topic or topic-filter. compare(_, NotFilter, _) when is_binary(NotFilter) -> @@ -290,16 +287,6 @@ match_add(K = {_Filter, ID}, Acc = #{}) -> match_add(K, Acc) -> [K | Acc]. 
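%% Worked example of the '$'-topic handling above (the ID is made up): for
%% Topic = <<"$SYS/broker/x">> the scan starts from base([<<"$SYS">>]), i.e.
%% {[<<"$SYS">>], {}}. Since atoms sort before binaries in Erlang term order,
%% keys rooted at '#' or '+' are never visited:
%%   true = ({['#'], {id}} < {[<<"$SYS">>], {}}),
%% while e.g. {[<<"$SYS">>, '#'], {id}} still sorts after that base and is found.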
-match_init(Topic) -> - case words(Topic) of - [W = <<"$", _/bytes>> | Rest] -> - % NOTE - % This will effectively skip attempts to match special topics to `#` or `+/...`. - {Rest, [W]}; - Words -> - {Words, []} - end. - -spec words(emqx_types:topic()) -> [word()]. words(Topic) when is_binary(Topic) -> % NOTE @@ -320,9 +307,6 @@ match_topics(#ctx{nextf = NextF} = C, Topic, {Topic, _} = Key, Acc) -> match_topics(#ctx{nextf = NextF} = C, Topic, {F, _}, Acc) when F < Topic -> %% the last key is a filter, try jump to the topic match_topics(C, Topic, NextF(base(Topic)), Acc); -match_topics(#ctx{nextf = NextF} = C, Topic, continue, Acc) -> - %% the last key is a '+/...' wildcard - match_topics(C, Topic, NextF(base(Topic)), Acc); match_topics(_C, _Topic, _Key, Acc) -> %% gone pass the topic Acc. diff --git a/apps/emqx/test/emqx_topic_index_SUITE.erl b/apps/emqx/test/emqx_topic_index_SUITE.erl index eb0ac0e40..551cc2438 100644 --- a/apps/emqx/test/emqx_topic_index_SUITE.erl +++ b/apps/emqx/test/emqx_topic_index_SUITE.erl @@ -170,16 +170,27 @@ t_match8(Config) -> M = get_module(Config), Tab = M:new(), Filters = [<<"+">>, <<"dev/global/sensor">>, <<"dev/+/sensor/#">>], - IDs = [1,2,3], + IDs = [1, 2, 3], Keys = [{F, ID} || F <- Filters, ID <- IDs], - lists:foreach(fun({F, ID}) -> - M:insert(F, ID, <<>>, Tab) - end, Keys), + lists:foreach( + fun({F, ID}) -> + M:insert(F, ID, <<>>, Tab) + end, + Keys + ), Topic = <<"dev/global/sensor">>, Matches = lists:sort(matches(M, Topic, Tab)), - ?assertEqual([<<"dev/+/sensor/#">>, <<"dev/+/sensor/#">>, <<"dev/+/sensor/#">>, - <<"dev/global/sensor">>, <<"dev/global/sensor">>, <<"dev/global/sensor">>], - [emqx_topic_index:get_topic(Match) || Match <- Matches]). + ?assertEqual( + [ + <<"dev/+/sensor/#">>, + <<"dev/+/sensor/#">>, + <<"dev/+/sensor/#">>, + <<"dev/global/sensor">>, + <<"dev/global/sensor">>, + <<"dev/global/sensor">> + ], + [emqx_topic_index:get_topic(Match) || Match <- Matches] + ). t_match_fast_forward(Config) -> M = get_module(Config), From cf45e80c71c008e604a5f95072015d80fae86eeb Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Fri, 25 Aug 2023 01:47:11 +0400 Subject: [PATCH 07/12] feat(topicidx): iterate on trie search implementation This improves matching performance and decreases GC pressure on synthetic workloads. --- apps/emqx/src/emqx_trie_search.erl | 247 +++++++++++++--------- apps/emqx/test/emqx_topic_index_SUITE.erl | 13 ++ rebar.config.erl | 3 +- 3 files changed, 158 insertions(+), 105 deletions(-) diff --git a/apps/emqx/src/emqx_trie_search.erl b/apps/emqx/src/emqx_trie_search.erl index 71bc8d8a0..95174f292 100644 --- a/apps/emqx/src/emqx_trie_search.erl +++ b/apps/emqx/src/emqx_trie_search.erl @@ -110,16 +110,6 @@ -type nextf() :: fun((key(_) | base_key()) -> ?END | key(_)). -type opts() :: [unique | return_first]. -%% Holds the constant values of each search. --record(ctx, { - %% A function which can quickly find the immediate-next record of the given prefix - nextf :: nextf(), - %% The initial words of a topic - words0 :: [word()], - %% Return as soon as there is one match found - return_first :: boolean() -}). - %% @doc Make a search-key for the given topic. -spec make_key(emqx_types:topic(), ID) -> key(ID). make_key(Topic, ID) when is_binary(Topic) -> @@ -147,29 +137,29 @@ get_topic({Filter, _ID}) when is_list(Filter) -> get_topic({Topic, _ID}) -> Topic. +-compile({inline, [base/1, move_up/2, match_add/2, compare/3]}). + %% Make the base-key which can be used to locate the desired search target. base(Prefix) -> {Prefix, {}}. 
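%% For illustration (filter and ID made up): a base key orders strictly before
%% every stored key sharing its prefix, which is what lets nextf() land on the
%% first candidate under that prefix:
%%   true = ({[<<"a">>, '+'], {}} < {[<<"a">>, '+', <<"c">>], {id}}).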
-%% Move the search target to the key next to the given Base. -move_up(#ctx{nextf = NextF}, Base) -> - NextF(Base). +base_init([W = <<"$", _/bytes>> | _]) -> + base([W]); +base_init(_) -> + base([]). -%% Add the given key to the accumulation. -add(#ctx{return_first = true}, _Acc, Key) -> - throw({return_first, Key}); -add(_C, Acc, Key) -> - match_add(Key, Acc). +%% Move the search target to the key next to the given Base. +move_up(NextF, Base) -> + NextF(Base). %% @doc Match given topic against the index and return the first match, or `false` if %% no match is found. -spec match(emqx_types:topic(), nextf()) -> false | key(_). match(Topic, NextF) -> try search(Topic, NextF, [return_first]) of - [] -> - false + _ -> false catch - throw:{return_first, Res} -> + throw:{first, Res} -> Res end. @@ -182,110 +172,159 @@ matches(Topic, NextF, Opts) -> %% @doc Entrypoint of the search for a given topic. search(Topic, NextF, Opts) -> Words = words(Topic), - Context = #ctx{ - nextf = NextF, - words0 = Words, - return_first = proplists:get_bool(return_first, Opts) - }, + Base = base_init(Words), + ORetFirst = proplists:get_bool(return_first, Opts), + OUnique = proplists:get_bool(unique, Opts), Acc0 = - case proplists:get_bool(unique, Opts) of + case ORetFirst of true -> + first; + false when OUnique -> #{}; false -> [] end, - Base = - case hd(Words) of - <<$$, _/binary>> -> - %% skip all filters starts with # or + - base([hd(Words)]); - _ -> - base([]) + Matches = + case search_new(Words, Base, NextF, Acc0) of + {Cursor, Acc} -> + match_topics(Topic, Cursor, NextF, Acc); + Acc -> + Acc end, - {MaybeEnd, Acc1} = search_new(Context, Base, Acc0), - Acc = match_topics(Context, Topic, MaybeEnd, Acc1), - case is_map(Acc) of + case is_map(Matches) of true -> - maps:values(Acc); + maps:values(Matches); false -> - Acc + Matches end. %% The recursive entrypoint of the trie-search algorithm. %% Always start from the initial prefix and words. -search_new(#ctx{words0 = Words} = C, NewBase, Acc) -> - case move_up(C, NewBase) of +search_new(Words0, NewBase, NextF, Acc) -> + case move_up(NextF, NewBase) of ?END -> - {?END, Acc}; - {Filter, _} = T -> - search_plus(C, Words, Filter, [], T, Acc) + Acc; + Cursor -> + search_up(Words0, Cursor, NextF, Acc) end. -%% Try to use '+' as the next word in the prefix. -search_plus(C, [W, X | Words], [W, X | Filter], RPrefix, T, Acc) -> - %% Directly append the current word to the matching prefix (RPrefix). - %% Micro optimization: try not to call the next clause because - %% it is not a continuation. - search_plus(C, [X | Words], [X | Filter], [W | RPrefix], T, Acc); -search_plus(C, [W | Words], ['+' | _] = Filter, RPrefix, T, Acc) -> - case search_up(C, '+', Words, Filter, RPrefix, T, Acc) of - {T, Acc1} -> - search_up(C, W, Words, Filter, RPrefix, T, Acc1); - TargetMoved -> - TargetMoved - end; -search_plus(C, [W | Words], Filter, RPrefix, T, Acc) -> - %% not a plus - search_up(C, W, Words, Filter, RPrefix, T, Acc). - %% Search to the bigger end of ordered collection of topics and topic-filters. 
-search_up(C, Word, Words, Filter, RPrefix, T, Acc) -> - case compare(Word, Filter, Words) of - {match, full} -> - search_new(C, T, add(C, Acc, T)); - {match, prefix} -> - search_new(C, T, Acc); +search_up(Words, {Filter, _} = Cursor, NextF, Acc) -> + case compare(Filter, Words, false) of + match_full -> + search_new(Words, Cursor, NextF, match_add(Cursor, Acc)); + match_prefix -> + search_new(Words, Cursor, NextF, Acc); lower -> - {T, Acc}; - higher -> - NewBase = base(lists:reverse([Word | RPrefix])), - search_new(C, NewBase, Acc); - shorter -> - search_plus(C, Words, tl(Filter), [Word | RPrefix], T, Acc) + {Cursor, Acc}; + [SeekWord | FilterTail] -> + % NOTE + % This is a seek instruction. + % If we visualize the `Filter` as `FilterHead ++ [_] ++ FilterTail`, we need to + % seek to `FilterHead ++ [SeekWord]`. It carries the `FilterTail` because it's + % much cheaper to return it from `compare/3` than anything more usable. + NewBase = base(seek(SeekWord, Filter, FilterTail)), + search_new(Words, NewBase, NextF, Acc) end. -%% Compare prefix word then the next words in suffix against the search-target -%% topic or topic-filter. -compare(_, NotFilter, _) when is_binary(NotFilter) -> - lower; -compare(H, [H | Filter], Words) -> - compare(Filter, Words); -compare(_, ['#'], _Words) -> - {match, full}; -compare(H1, [H2 | _T2], _Words) when H1 < H2 -> - lower; -compare(_H, [_ | _], _Words) -> - higher. +seek(SeekWord, [_ | FilterTail], FilterTail) -> + [SeekWord]; +seek(SeekWord, [FilterWord | Rest], FilterTail) -> + [FilterWord | seek(SeekWord, Rest, FilterTail)]. -%% Now compare the filter suffix and the topic suffix. -compare([], []) -> - {match, full}; -compare([], _Words) -> - {match, prefix}; -compare(['#'], _Words) -> - {match, full}; -compare([_ | _], []) -> +compare(NotFilter, _, _) when is_binary(NotFilter) -> lower; -compare([_ | _], _Words) -> - %% cannot know if it's a match, lower, or higher, - %% must search with a longer prefix. - shorter. +compare([], [], _) -> + % NOTE + % Topic: a/b/c/d + % Filter: a/+/+/d + % We matched the topic to a topic filter exactly (possibly with pluses). + % We include it in the result set, and now need to try next entry in the table. + % Closest possible next entries that we must not miss: + % * a/+/+/d (same topic but a different ID) + % * a/+/+/d/# (also a match) + match_full; +compare([], _Words, _) -> + % NOTE + % Topic: a/b/c/d + % Filter: a/+/c + % We found out that a topic filter is a prefix of the topic (possibly with pluses). + % We discard it, and now need to try next entry in the table. + % Closest possible next entries that we must not miss: + % * a/+/c/# (which is a match) + % * a/+/c/+ (also a match) + % + % TODO + % We might probably instead seek to a/+/c/# right away. + match_prefix; +compare(['#'], _Words, _) -> + % NOTE + % Topic: a/b/c/d + % Filter: a/+/+/d/# + % We matched the topic to a topic filter with wildcard (possibly with pluses). + % We include it in the result set, and now need to try next entry in the table. + % Closest possible next entries that we must not miss: + % * a/+/+/d/# (same topic but a different ID) + match_full; +compare(['+' | TF], [HW | TW], _PrevBacktrack) -> + % NOTE + % We need to keep backtrack point each time we encounter a plus. To safely skip over + % parts of the search space, we may need last backtrack point when recursion terminates. + % See next clauses for examples. 
+ compare(TF, TW, [HW | TF]); +compare([HW | TF], [HW | TW], Backtrack) -> + % NOTE + % Skip over the same word in both topic and filter, keeping the last backtrack point. + compare(TF, TW, Backtrack); +compare([HF | _], [HW | _], false) when HF > HW -> + % NOTE + % Topic: a/b/c/d + % Filter: a/b/c/e/1 + % The topic is lower than a topic filter. There's nowhere to backtrackto, we're out of + % the search space. We should stop the search. + lower; +compare([HF | _], [HW | _], Backtrack) when HF > HW -> + % NOTE + % Topic: a/b/c/d + % Filter: a/+/+/e/1 + % The topic is lower than a topic filter. There was a plus, last time at the 3rd level, + % we have a backtrack point to seek to: + % Seek: [c | e/1] + % We need to skip over part of search space, and seek to the next possible match: + % Next: a/+/c + Backtrack; +compare([_ | _], [], false) -> + % NOTE + % Topic: a/b/c/d + % Filter: a/b/c/d/1 + % The topic is lower than a topic filter. (since it's shorter). There's nowhere to + % backtrack to, we're out of the search space. We should stop the search. + lower; +compare([_ | _], [], Backtrack) -> + % NOTE + % Topic: a/b/c/d + % Filter: a/+/c/d/1 + % The topic is lower than a topic filter. There was a plus, last and only time at the + % 3rd level, we have a backtrack point: + % Seek: [b | c/d/1] + % Next: a/b + Backtrack; +compare([_HF | TF], [HW | _], _) -> + % NOTE + % Topic: a/b/c/d + % Filter: a/+/+/0/1/2 + % Topic is higher than the filter, we need to skip over to the next possible filter. + % Seek: [d | 0/1/2] + % Next: a/+/+/d + [HW | TF]. match_add(K = {_Filter, ID}, Acc = #{}) -> % NOTE: ensuring uniqueness by record ID Acc#{ID => K}; -match_add(K, Acc) -> - [K | Acc]. +match_add(K, Acc) when is_list(Acc) -> + [K | Acc]; +match_add(K, first) -> + throw({first, K}). -spec words(emqx_types:topic()) -> [word()]. words(Topic) when is_binary(Topic) -> @@ -301,12 +340,12 @@ word(<<"#">>) -> '#'; word(Bin) -> Bin. %% match non-wildcard topics -match_topics(#ctx{nextf = NextF} = C, Topic, {Topic, _} = Key, Acc) -> +match_topics(Topic, {Topic, _} = Key, NextF, Acc) -> %% found a topic match - match_topics(C, Topic, NextF(Key), add(C, Acc, Key)); -match_topics(#ctx{nextf = NextF} = C, Topic, {F, _}, Acc) when F < Topic -> + match_topics(Topic, NextF(Key), NextF, match_add(Key, Acc)); +match_topics(Topic, {F, _}, NextF, Acc) when F < Topic -> %% the last key is a filter, try jump to the topic - match_topics(C, Topic, NextF(base(Topic)), Acc); -match_topics(_C, _Topic, _Key, Acc) -> + match_topics(Topic, NextF(base(Topic)), NextF, Acc); +match_topics(_Topic, _Key, _NextF, Acc) -> %% gone pass the topic Acc. diff --git a/apps/emqx/test/emqx_topic_index_SUITE.erl b/apps/emqx/test/emqx_topic_index_SUITE.erl index 551cc2438..08056a16f 100644 --- a/apps/emqx/test/emqx_topic_index_SUITE.erl +++ b/apps/emqx/test/emqx_topic_index_SUITE.erl @@ -256,6 +256,19 @@ t_match_wildcard_edge_cases(Config) -> end, lists:foreach(F, Datasets). +t_prop_edgecase(Config) -> + M = get_module(Config), + Tab = M:new(), + Topic = <<"01/01">>, + Filters = [ + {1, <<>>}, + {2, <<"+/01">>}, + {3, <<>>}, + {4, <<"+/+/01">>} + ], + _ = [M:insert(F, N, <<>>, Tab) || {N, F} <- Filters], + ?assertMatch([2], [id(X) || X <- matches(M, Topic, Tab, [unique])]). 
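A note on why this particular input exercises the backtracking logic (a reading of the test data, not an account of the exact failure the property test caught): in term order the three-level filter +/+/01 sorts before the two-level +/01, because '+' is an atom and atoms sort before binary words. The cursor therefore reaches the longer, non-matching filter first and has to back out of it without seeking past the shorter filter that does match. Illustrative check (shell expression):

%% '+' sorts before any binary word, so +/+/01 precedes +/01 in the key order.
true = emqx_trie_search:make_key(<<"+/+/01">>, 4) < emqx_trie_search:make_key(<<"+/01">>, 2).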
+ t_prop_matches(Config) -> M = get_module(Config), ?assert( diff --git a/rebar.config.erl b/rebar.config.erl index 02e946e15..8f26d11d8 100644 --- a/rebar.config.erl +++ b/rebar.config.erl @@ -190,7 +190,8 @@ test_deps() -> {meck, "0.9.2"}, {proper, "1.4.0"}, {er_coap_client, {git, "https://github.com/emqx/er_coap_client", {tag, "v1.0.5"}}}, - {erl_csv, "0.2.0"} + {erl_csv, "0.2.0"}, + {eministat, "0.10.1"} ]. common_compile_opts() -> From fc37d235c7cfbfbfbfc45957f98a4bd227475213 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Fri, 25 Aug 2023 15:24:06 +0400 Subject: [PATCH 08/12] refactor(topicidx): simplify seek instructions Which also avoids comparing filter tail repeatedly when evaluating it. --- apps/emqx/src/emqx_trie_search.erl | 56 ++++++++++++++---------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/apps/emqx/src/emqx_trie_search.erl b/apps/emqx/src/emqx_trie_search.erl index 95174f292..445425cb8 100644 --- a/apps/emqx/src/emqx_trie_search.erl +++ b/apps/emqx/src/emqx_trie_search.erl @@ -137,7 +137,7 @@ get_topic({Filter, _ID}) when is_list(Filter) -> get_topic({Topic, _ID}) -> Topic. --compile({inline, [base/1, move_up/2, match_add/2, compare/3]}). +-compile({inline, [base/1, move_up/2, match_add/2, compare/4]}). %% Make the base-key which can be used to locate the desired search target. base(Prefix) -> @@ -210,31 +210,29 @@ search_new(Words0, NewBase, NextF, Acc) -> %% Search to the bigger end of ordered collection of topics and topic-filters. search_up(Words, {Filter, _} = Cursor, NextF, Acc) -> - case compare(Filter, Words, false) of + case compare(Filter, Words, 0, false) of match_full -> search_new(Words, Cursor, NextF, match_add(Cursor, Acc)); match_prefix -> search_new(Words, Cursor, NextF, Acc); lower -> {Cursor, Acc}; - [SeekWord | FilterTail] -> + {Pos, SeekWord} -> % NOTE - % This is a seek instruction. - % If we visualize the `Filter` as `FilterHead ++ [_] ++ FilterTail`, we need to - % seek to `FilterHead ++ [SeekWord]`. It carries the `FilterTail` because it's - % much cheaper to return it from `compare/3` than anything more usable. - NewBase = base(seek(SeekWord, Filter, FilterTail)), + % This is a seek instruction. It means we need to take `Pos` words + % from the current topic filter and attach `SeekWord` to the end of it. + NewBase = base(seek(Pos, SeekWord, Filter)), search_new(Words, NewBase, NextF, Acc) end. -seek(SeekWord, [_ | FilterTail], FilterTail) -> +seek(_Pos = 0, SeekWord, _FilterTail) -> [SeekWord]; -seek(SeekWord, [FilterWord | Rest], FilterTail) -> - [FilterWord | seek(SeekWord, Rest, FilterTail)]. +seek(Pos, SeekWord, [FilterWord | Rest]) -> + [FilterWord | seek(Pos - 1, SeekWord, Rest)]. -compare(NotFilter, _, _) when is_binary(NotFilter) -> +compare(NotFilter, _, _, _) when is_binary(NotFilter) -> lower; -compare([], [], _) -> +compare([], [], _, _) -> % NOTE % Topic: a/b/c/d % Filter: a/+/+/d @@ -244,7 +242,7 @@ compare([], [], _) -> % * a/+/+/d (same topic but a different ID) % * a/+/+/d/# (also a match) match_full; -compare([], _Words, _) -> +compare([], _Words, _, _) -> % NOTE % Topic: a/b/c/d % Filter: a/+/c @@ -257,7 +255,7 @@ compare([], _Words, _) -> % TODO % We might probably instead seek to a/+/c/# right away. 
match_prefix; -compare(['#'], _Words, _) -> +compare(['#'], _Words, _, _) -> % NOTE % Topic: a/b/c/d % Filter: a/+/+/d/# @@ -266,57 +264,57 @@ compare(['#'], _Words, _) -> % Closest possible next entries that we must not miss: % * a/+/+/d/# (same topic but a different ID) match_full; -compare(['+' | TF], [HW | TW], _PrevBacktrack) -> +compare(['+' | TF], [HW | TW], Pos, _PrevBacktrack) -> % NOTE % We need to keep backtrack point each time we encounter a plus. To safely skip over % parts of the search space, we may need last backtrack point when recursion terminates. % See next clauses for examples. - compare(TF, TW, [HW | TF]); -compare([HW | TF], [HW | TW], Backtrack) -> + compare(TF, TW, Pos + 1, {Pos, HW}); +compare([HW | TF], [HW | TW], Pos, Backtrack) -> % NOTE % Skip over the same word in both topic and filter, keeping the last backtrack point. - compare(TF, TW, Backtrack); -compare([HF | _], [HW | _], false) when HF > HW -> + compare(TF, TW, Pos + 1, Backtrack); +compare([HF | _], [HW | _], _, false) when HF > HW -> % NOTE % Topic: a/b/c/d % Filter: a/b/c/e/1 % The topic is lower than a topic filter. There's nowhere to backtrackto, we're out of % the search space. We should stop the search. lower; -compare([HF | _], [HW | _], Backtrack) when HF > HW -> +compare([HF | _], [HW | _], _, Backtrack) when HF > HW -> % NOTE % Topic: a/b/c/d % Filter: a/+/+/e/1 % The topic is lower than a topic filter. There was a plus, last time at the 3rd level, % we have a backtrack point to seek to: - % Seek: [c | e/1] + % Seek: {2, c} % We need to skip over part of search space, and seek to the next possible match: % Next: a/+/c Backtrack; -compare([_ | _], [], false) -> +compare([_ | _], [], _, false) -> % NOTE % Topic: a/b/c/d % Filter: a/b/c/d/1 - % The topic is lower than a topic filter. (since it's shorter). There's nowhere to + % The topic is lower than a topic filter (since it's shorter). There's nowhere to % backtrack to, we're out of the search space. We should stop the search. lower; -compare([_ | _], [], Backtrack) -> +compare([_ | _], [], _, Backtrack) -> % NOTE % Topic: a/b/c/d % Filter: a/+/c/d/1 % The topic is lower than a topic filter. There was a plus, last and only time at the % 3rd level, we have a backtrack point: - % Seek: [b | c/d/1] + % Seek: {1, b} % Next: a/b Backtrack; -compare([_HF | TF], [HW | _], _) -> +compare([_ | _], [HW | _], Pos, _) -> % NOTE % Topic: a/b/c/d % Filter: a/+/+/0/1/2 % Topic is higher than the filter, we need to skip over to the next possible filter. - % Seek: [d | 0/1/2] + % Seek: {3, d} % Next: a/+/+/d - [HW | TF]. + {Pos, HW}. match_add(K = {_Filter, ID}, Acc = #{}) -> % NOTE: ensuring uniqueness by record ID From 33ed53bb6aeab659c3d075838d24b7a04fcdfee7 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Fri, 25 Aug 2023 16:26:39 +0400 Subject: [PATCH 09/12] refactor(topicidx): simplify `compare/3` further It's possible to emit seek instruction just once, on the way back out of the `compare/3` stack. --- apps/emqx/src/emqx_trie_search.erl | 80 ++++++++++++++---------------- 1 file changed, 38 insertions(+), 42 deletions(-) diff --git a/apps/emqx/src/emqx_trie_search.erl b/apps/emqx/src/emqx_trie_search.erl index 445425cb8..4acee979f 100644 --- a/apps/emqx/src/emqx_trie_search.erl +++ b/apps/emqx/src/emqx_trie_search.erl @@ -137,7 +137,7 @@ get_topic({Filter, _ID}) when is_list(Filter) -> get_topic({Topic, _ID}) -> Topic. --compile({inline, [base/1, move_up/2, match_add/2, compare/4]}). +-compile({inline, [base/1, move_up/2, match_add/2, compare/3]}). 
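This commit keeps the {Pos, SeekWord} seek instruction introduced in the previous one; for reference, here is how such an instruction maps back to a base key, reusing the a/b/c/d vs a/+/+/e/1 example from the comments. This is a sketch as if evaluated inside emqx_trie_search, since seek/3 and base/1 are internal:

%% Illustrative: {2, <<"c">>} means keep the first 2 filter words, then continue from "c".
seek_example() ->
    Filter = [<<"a">>, '+', '+', <<"e">>, <<"1">>],
    [<<"a">>, '+', <<"c">>] = seek(2, <<"c">>, Filter),
    %% ...which search_up/4 wraps into the next base key to jump to:
    {[<<"a">>, '+', <<"c">>], {}} = base(seek(2, <<"c">>, Filter)).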
%% Make the base-key which can be used to locate the desired search target. base(Prefix) -> @@ -210,7 +210,7 @@ search_new(Words0, NewBase, NextF, Acc) -> %% Search to the bigger end of ordered collection of topics and topic-filters. search_up(Words, {Filter, _} = Cursor, NextF, Acc) -> - case compare(Filter, Words, 0, false) of + case compare(Filter, Words, 0) of match_full -> search_new(Words, Cursor, NextF, match_add(Cursor, Acc)); match_prefix -> @@ -230,9 +230,9 @@ seek(_Pos = 0, SeekWord, _FilterTail) -> seek(Pos, SeekWord, [FilterWord | Rest]) -> [FilterWord | seek(Pos - 1, SeekWord, Rest)]. -compare(NotFilter, _, _, _) when is_binary(NotFilter) -> +compare(NotFilter, _, _) when is_binary(NotFilter) -> lower; -compare([], [], _, _) -> +compare([], [], _) -> % NOTE % Topic: a/b/c/d % Filter: a/+/+/d @@ -242,7 +242,7 @@ compare([], [], _, _) -> % * a/+/+/d (same topic but a different ID) % * a/+/+/d/# (also a match) match_full; -compare([], _Words, _, _) -> +compare([], _Words, _) -> % NOTE % Topic: a/b/c/d % Filter: a/+/c @@ -255,7 +255,7 @@ compare([], _Words, _, _) -> % TODO % We might probably instead seek to a/+/c/# right away. match_prefix; -compare(['#'], _Words, _, _) -> +compare(['#'], _Words, _) -> % NOTE % Topic: a/b/c/d % Filter: a/+/+/d/# @@ -264,50 +264,46 @@ compare(['#'], _Words, _, _) -> % Closest possible next entries that we must not miss: % * a/+/+/d/# (same topic but a different ID) match_full; -compare(['+' | TF], [HW | TW], Pos, _PrevBacktrack) -> - % NOTE - % We need to keep backtrack point each time we encounter a plus. To safely skip over - % parts of the search space, we may need last backtrack point when recursion terminates. - % See next clauses for examples. - compare(TF, TW, Pos + 1, {Pos, HW}); -compare([HW | TF], [HW | TW], Pos, Backtrack) -> +compare(['+' | TF], [HW | TW], Pos) -> + case compare(TF, TW, Pos + 1) of + lower -> + % NOTE + % Topic: a/b/c/d + % Filter: a/+/+/e/1 or a/b/+/d/1 + % The topic is lower than a topic filter. But we're at the `+` position, + % so we emit a backtrack point to seek to: + % Seek: {2, c} + % We skip over part of search space, and seek to the next possible match: + % Next: a/+/c + {Pos, HW}; + Other -> + % NOTE + % It's either already a backtrack point, emitted from the last `+` + % position or just a seek / match. In both cases we just pass it + % through. + Other + end; +compare([HW | TF], [HW | TW], Pos) -> % NOTE % Skip over the same word in both topic and filter, keeping the last backtrack point. - compare(TF, TW, Pos + 1, Backtrack); -compare([HF | _], [HW | _], _, false) when HF > HW -> + compare(TF, TW, Pos + 1); +compare([HF | _], [HW | _], _) when HF > HW -> % NOTE % Topic: a/b/c/d - % Filter: a/b/c/e/1 - % The topic is lower than a topic filter. There's nowhere to backtrackto, we're out of - % the search space. We should stop the search. + % Filter: a/b/c/e/1 or a/b/+/e + % The topic is lower than a topic filter. In the first case there's nowhere to + % backtrack to, we're out of the search space. In the second case there's a `+` + % on 3rd level, we'll seek up from there. lower; -compare([HF | _], [HW | _], _, Backtrack) when HF > HW -> +compare([_ | _], [], _) -> % NOTE % Topic: a/b/c/d - % Filter: a/+/+/e/1 - % The topic is lower than a topic filter. 
There was a plus, last time at the 3rd level, - % we have a backtrack point to seek to: - % Seek: {2, c} - % We need to skip over part of search space, and seek to the next possible match: - % Next: a/+/c - Backtrack; -compare([_ | _], [], _, false) -> - % NOTE - % Topic: a/b/c/d - % Filter: a/b/c/d/1 - % The topic is lower than a topic filter (since it's shorter). There's nowhere to - % backtrack to, we're out of the search space. We should stop the search. + % Filter: a/b/c/d/1 or a/+/c/d/1 + % The topic is lower than a topic filter (since it's shorter). In the first case + % there's nowhere to backtrack to, we're out of the search space. In the second case + % there's a `+` on 2nd level, we'll seek up from there. lower; -compare([_ | _], [], _, Backtrack) -> - % NOTE - % Topic: a/b/c/d - % Filter: a/+/c/d/1 - % The topic is lower than a topic filter. There was a plus, last and only time at the - % 3rd level, we have a backtrack point: - % Seek: {1, b} - % Next: a/b - Backtrack; -compare([_ | _], [HW | _], Pos, _) -> +compare([_ | _], [HW | _], Pos) -> % NOTE % Topic: a/b/c/d % Filter: a/+/+/0/1/2 From 558402a68e6cf244c7217578796daddd1f49639f Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Fri, 25 Aug 2023 10:49:23 +0200 Subject: [PATCH 10/12] chore(topic_index): add topic validation --- apps/emqx/src/emqx_trie_search.erl | 22 +++++++++------- apps/emqx/test/emqx_trie_search_tests.erl | 32 +++++++++++++++++++++++ 2 files changed, 45 insertions(+), 9 deletions(-) create mode 100644 apps/emqx/test/emqx_trie_search_tests.erl diff --git a/apps/emqx/src/emqx_trie_search.erl b/apps/emqx/src/emqx_trie_search.erl index 4acee979f..ddeef966b 100644 --- a/apps/emqx/src/emqx_trie_search.erl +++ b/apps/emqx/src/emqx_trie_search.erl @@ -113,7 +113,7 @@ %% @doc Make a search-key for the given topic. -spec make_key(emqx_types:topic(), ID) -> key(ID). make_key(Topic, ID) when is_binary(Topic) -> - Words = words(Topic), + Words = filter_words(Topic), case emqx_topic:wildcard(Words) of true -> %% it's a wildcard @@ -171,7 +171,7 @@ matches(Topic, NextF, Opts) -> %% @doc Entrypoint of the search for a given topic. search(Topic, NextF, Opts) -> - Words = words(Topic), + Words = topic_words(Topic), Base = base_init(Words), ORetFirst = proplists:get_bool(return_first, Opts), OUnique = proplists:get_bool(unique, Opts), @@ -320,18 +320,22 @@ match_add(K, Acc) when is_list(Acc) -> match_add(K, first) -> throw({first, K}). --spec words(emqx_types:topic()) -> [word()]. -words(Topic) when is_binary(Topic) -> +-spec filter_words(emqx_types:topic()) -> [word()]. +filter_words(Topic) when is_binary(Topic) -> % NOTE % This is almost identical to `emqx_topic:words/1`, but it doesn't convert empty % tokens to ''. This is needed to keep ordering of words consistent with what % `match_filter/3` expects. - [word(W) || W <- emqx_topic:tokens(Topic)]. + [word(W, filter) || W <- emqx_topic:tokens(Topic)]. --spec word(binary()) -> word(). -word(<<"+">>) -> '+'; -word(<<"#">>) -> '#'; -word(Bin) -> Bin. +topic_words(Topic) when is_binary(Topic) -> + [word(W, topic) || W <- emqx_topic:tokens(Topic)]. + +word(<<"+">>, topic) -> error(badarg); +word(<<"#">>, topic) -> error(badarg); +word(<<"+">>, filter) -> '+'; +word(<<"#">>, filter) -> '#'; +word(Bin, _) -> Bin. 
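The net effect of the topic/filter split in word/2: keys for stored filters keep their wildcard words, while a wildcard in the topic being matched is rejected up front. A small sketch mirroring the eunit cases added below (both calls are used the same way by emqx_topic_gbt earlier in the series, so they are part of the module's exported API):

%% Illustrative: filters still parse wildcards into atoms...
word_validation_example() ->
    {['+', <<"b">>], {id}} = emqx_trie_search:make_key(<<"+/b">>, id),
    %% ...but a wildcard in the searched topic is now a badarg.
    {'EXIT', {badarg, _}} = (catch emqx_trie_search:match(<<"+/b">>, fun(_) -> '$end_of_table' end)).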
%% match non-wildcard topics match_topics(Topic, {Topic, _} = Key, NextF, Acc) -> diff --git a/apps/emqx/test/emqx_trie_search_tests.erl b/apps/emqx/test/emqx_trie_search_tests.erl new file mode 100644 index 000000000..75994131d --- /dev/null +++ b/apps/emqx/test/emqx_trie_search_tests.erl @@ -0,0 +1,32 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_trie_search_tests). + +-include_lib("eunit/include/eunit.hrl"). + +topic_validation_test() -> + NextF = fun(_) -> '$end_of_table' end, + Call = fun(Topic) -> + emqx_trie_search:match(Topic, NextF) + end, + ?assertError(badarg, Call(<<"+">>)), + ?assertError(badarg, Call(<<"#">>)), + ?assertError(badarg, Call(<<"a/+/b">>)), + ?assertError(badarg, Call(<<"a/b/#">>)), + ?assertEqual(false, Call(<<"a/b/b+">>)), + ?assertEqual(false, Call(<<"a/b/c#">>)), + ok. From ca59a87d47afacfba013e9e5c3b10e902422be78 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Fri, 25 Aug 2023 16:40:23 +0400 Subject: [PATCH 11/12] chore(topicidx): refine example of wildcard compare --- apps/emqx/src/emqx_trie_search.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/emqx/src/emqx_trie_search.erl b/apps/emqx/src/emqx_trie_search.erl index ddeef966b..6285eb4eb 100644 --- a/apps/emqx/src/emqx_trie_search.erl +++ b/apps/emqx/src/emqx_trie_search.erl @@ -258,7 +258,7 @@ compare([], _Words, _) -> compare(['#'], _Words, _) -> % NOTE % Topic: a/b/c/d - % Filter: a/+/+/d/# + % Filter: a/+/+/d/# or just a/# % We matched the topic to a topic filter with wildcard (possibly with pluses). % We include it in the result set, and now need to try next entry in the table. % Closest possible next entries that we must not miss: From d5cff533e3e3128bc36def697a217a0cc16f4a04 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Fri, 25 Aug 2023 16:41:21 +0400 Subject: [PATCH 12/12] chore(topicidx): drop TODO comment --- apps/emqx/src/emqx_trie_search.erl | 3 --- 1 file changed, 3 deletions(-) diff --git a/apps/emqx/src/emqx_trie_search.erl b/apps/emqx/src/emqx_trie_search.erl index 6285eb4eb..b774e1459 100644 --- a/apps/emqx/src/emqx_trie_search.erl +++ b/apps/emqx/src/emqx_trie_search.erl @@ -251,9 +251,6 @@ compare([], _Words, _) -> % Closest possible next entries that we must not miss: % * a/+/c/# (which is a match) % * a/+/c/+ (also a match) - % - % TODO - % We might probably instead seek to a/+/c/# right away. match_prefix; compare(['#'], _Words, _) -> % NOTE
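Taken together, the series leaves two interchangeable front-ends over the same emqx_trie_search core: the ETS-backed emqx_topic_index and the gb_trees-in-persistent_term emqx_topic_gbt. A minimal usage sketch, assuming the API exercised by the test suite above (new/0,1, insert/4, matches/3, get_id/1); the filters, IDs and the example_index name are made up for illustration:

%% Illustrative end-to-end usage of both front-ends.
index_sketch() ->
    %% ETS ordered_set based index, fine for a changing set of filters.
    Tab = emqx_topic_index:new(),
    _ = emqx_topic_index:insert(<<"dev/+/sensor/#">>, id1, <<>>, Tab),
    _ = emqx_topic_index:insert(<<"dev/global/sensor">>, id2, <<>>, Tab),
    [id1, id2] = lists:sort([
        emqx_topic_index:get_id(M)
     || M <- emqx_topic_index:matches(<<"dev/global/sensor">>, Tab, [unique])
    ]),
    %% gb_trees in persistent_term, intended for a static set of filters.
    Name = emqx_topic_gbt:new(example_index),
    true = emqx_topic_gbt:insert(<<"dev/+/sensor/#">>, id1, <<>>, Name),
    [id1] = [emqx_topic_gbt:get_id(M) || M <- emqx_topic_gbt:matches(<<"dev/global/sensor">>, Name, [unique])].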