diff --git a/.ci/docker-compose-file/docker-compose-greptimedb.yaml b/.ci/docker-compose-file/docker-compose-greptimedb.yaml index 6813b4983..b564abbf9 100644 --- a/.ci/docker-compose-file/docker-compose-greptimedb.yaml +++ b/.ci/docker-compose-file/docker-compose-greptimedb.yaml @@ -4,7 +4,7 @@ services: greptimedb: container_name: greptimedb hostname: greptimedb - image: greptime/greptimedb:v0.4.4 + image: greptime/greptimedb:v0.7.1 expose: - "4000" - "4001" diff --git a/.ci/docker-compose-file/docker-compose-iotdb.yaml b/.ci/docker-compose-file/docker-compose-iotdb.yaml index 2a2b0e603..f5448a1ef 100644 --- a/.ci/docker-compose-file/docker-compose-iotdb.yaml +++ b/.ci/docker-compose-file/docker-compose-iotdb.yaml @@ -1,24 +1,53 @@ version: '3.9' services: - iotdb: - container_name: iotdb - hostname: iotdb - image: apache/iotdb:1.1.0-standalone + iotdb_1_3_0: + container_name: iotdb130 + hostname: iotdb130 + image: apache/iotdb:1.3.0-standalone restart: always environment: - enable_rest_service=true - - cn_internal_address=iotdb + - cn_internal_address=iotdb130 - cn_internal_port=10710 - cn_consensus_port=10720 - - cn_target_config_node_list=iotdb:10710 - - dn_rpc_address=iotdb - - dn_internal_address=iotdb + - cn_seed_config_node=iotdb130:10710 + - dn_rpc_address=iotdb130 + - dn_internal_address=iotdb130 - dn_rpc_port=6667 - dn_mpp_data_exchange_port=10740 - dn_schema_region_consensus_port=10750 - dn_data_region_consensus_port=10760 - - dn_target_config_node_list=iotdb:10710 + - dn_seed_config_node=iotdb130:10710 + # volumes: + # - ./data:/iotdb/data + # - ./logs:/iotdb/logs + expose: + - "18080" + # IoTDB's REST interface, uncomment for local testing + # ports: + # - "18080:18080" + networks: + - emqx_bridge + + iotdb_1_1_0: + container_name: iotdb110 + hostname: iotdb110 + image: apache/iotdb:1.1.0-standalone + restart: always + environment: + - enable_rest_service=true + - cn_internal_address=iotdb110 + - cn_internal_port=10710 + - cn_consensus_port=10720 + - cn_target_config_node_list=iotdb110:10710 + - dn_rpc_address=iotdb110 + - dn_internal_address=iotdb110 + - dn_rpc_port=6667 + - dn_mpp_data_exchange_port=10740 + - dn_schema_region_consensus_port=10750 + - dn_data_region_consensus_port=10760 + - dn_target_config_node_list=iotdb110:10710 # volumes: # - ./data:/iotdb/data # - ./logs:/iotdb/logs diff --git a/.ci/docker-compose-file/rocketmq/conf/plain_acl.yml b/.ci/docker-compose-file/rocketmq/conf/plain_acl.yml index e2c41a87f..e78e47fe5 100644 --- a/.ci/docker-compose-file/rocketmq/conf/plain_acl.yml +++ b/.ci/docker-compose-file/rocketmq/conf/plain_acl.yml @@ -9,3 +9,4 @@ accounts: defaultGroupPerm: PUB|SUB topicPerms: - TopicTest=PUB|SUB + - Topic2=PUB|SUB diff --git a/.ci/docker-compose-file/toxiproxy.json b/.ci/docker-compose-file/toxiproxy.json index 103bae924..a3c1dfbf4 100644 --- a/.ci/docker-compose-file/toxiproxy.json +++ b/.ci/docker-compose-file/toxiproxy.json @@ -139,9 +139,15 @@ "enabled": true }, { - "name": "iotdb", + "name": "iotdb110", "listen": "0.0.0.0:18080", - "upstream": "iotdb:18080", + "upstream": "iotdb110:18080", + "enabled": true + }, + { + "name": "iotdb130", + "listen": "0.0.0.0:28080", + "upstream": "iotdb130:18080", "enabled": true }, { diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 23911f9a8..95c219b88 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,18 +1,29 @@ ## Default * @emqx/emqx-review-board +# emqx-review-board members +## HJianBo +## id +## ieQu1 +## keynslug +## qzhuyan +## savonarola +## terry-xiaoyu +## 
thalesmg +## zhongwencool +## zmstone + ## apps /apps/emqx/ @emqx/emqx-review-board @lafirest -/apps/emqx_connector/ @emqx/emqx-review-board -/apps/emqx_auth/ @emqx/emqx-review-board @JimMoen @savonarola +/apps/emqx_auth/ @emqx/emqx-review-board @JimMoen /apps/emqx_connector/ @emqx/emqx-review-board @JimMoen /apps/emqx_dashboard/ @emqx/emqx-review-board @JimMoen @lafirest /apps/emqx_dashboard_rbac/ @emqx/emqx-review-board @lafirest /apps/emqx_dashboard_sso/ @emqx/emqx-review-board @JimMoen @lafirest -/apps/emqx_exhook/ @emqx/emqx-review-board @JimMoen @HJianBo -/apps/emqx_ft/ @emqx/emqx-review-board @savonarola @keynslug +/apps/emqx_exhook/ @emqx/emqx-review-board @JimMoen /apps/emqx_gateway/ @emqx/emqx-review-board @lafirest -/apps/emqx_management/ @emqx/emqx-review-board @lafirest @sstrigler +/apps/emqx_management/ @emqx/emqx-review-board @lafirest +/apps/emqx_opentelemetry @emqx/emqx-review-board @SergeTupchiy /apps/emqx_plugins/ @emqx/emqx-review-board @JimMoen /apps/emqx_prometheus/ @emqx/emqx-review-board @JimMoen /apps/emqx_psk/ @emqx/emqx-review-board @lafirest @@ -20,7 +31,7 @@ /apps/emqx_rule_engine/ @emqx/emqx-review-board @kjellwinblad /apps/emqx_slow_subs/ @emqx/emqx-review-board @lafirest /apps/emqx_statsd/ @emqx/emqx-review-board @JimMoen -/apps/emqx_durable_storage/ @emqx/emqx-review-board @ieQu1 @keynslug +/apps/emqx_durable_storage/ @emqx/emqx-review-board @keynslug ## CI /deploy/ @emqx/emqx-review-board @Rory-Z diff --git a/.github/workflows/build_packages.yaml b/.github/workflows/build_packages.yaml index b48e1ccf2..3b19e7094 100644 --- a/.github/workflows/build_packages.yaml +++ b/.github/workflows/build_packages.yaml @@ -151,7 +151,23 @@ jobs: with: ref: ${{ github.event.inputs.ref }} fetch-depth: 0 - - name: build emqx packages + - name: build tgz + env: + PROFILE: ${{ matrix.profile }} + ARCH: ${{ matrix.arch }} + OS: ${{ matrix.os }} + IS_ELIXIR: ${{ matrix.with_elixir }} + BUILDER: "ghcr.io/emqx/emqx-builder/${{ matrix.builder }}:${{ matrix.elixir }}-${{ matrix.otp }}-${{ matrix.os }}" + BUILDER_SYSTEM: force_docker + run: | + ./scripts/buildx.sh \ + --profile $PROFILE \ + --arch $ARCH \ + --builder $BUILDER \ + --elixir $IS_ELIXIR \ + --pkgtype tgz + - name: build pkg + if: matrix.with_elixir == 'no' env: PROFILE: ${{ matrix.profile }} ARCH: ${{ matrix.arch }} diff --git a/.github/workflows/codeql.yaml b/.github/workflows/codeql.yaml index 1b32e1174..7b9c14d5f 100644 --- a/.github/workflows/codeql.yaml +++ b/.github/workflows/codeql.yaml @@ -24,8 +24,8 @@ jobs: matrix: branch: - master - - release-55 - release-56 + - release-57 language: - cpp - python diff --git a/.github/workflows/green_master.yaml b/.github/workflows/green_master.yaml index 1984aa692..7053247e3 100644 --- a/.github/workflows/green_master.yaml +++ b/.github/workflows/green_master.yaml @@ -31,7 +31,7 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - gh api --method GET -f head_branch=master -f status=completed -f exclude_pull_requests=true /repos/emqx/emqx/actions/runs > runs.json + gh api --method GET -f head_sha=$(git rev-parse HEAD) -f status=completed -f exclude_pull_requests=true /repos/emqx/emqx/actions/runs > runs.json for id in $(jq -r '.workflow_runs[] | select((."conclusion" == "failure") and (."name" != "Keep master green") and .run_attempt < 3) | .id' runs.json); do echo "rerun https://github.com/emqx/emqx/actions/runs/$id" gh api --method POST /repos/emqx/emqx/actions/runs/$id/rerun-failed-jobs || true diff --git a/.github/workflows/release.yaml 
b/.github/workflows/release.yaml index 3dabc92fc..34a0e44e5 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -67,12 +67,13 @@ jobs: BUCKET=${{ secrets.AWS_S3_BUCKET }} OUTPUT_DIR=${{ steps.profile.outputs.s3dir }} aws s3 cp --recursive s3://$BUCKET/$OUTPUT_DIR/${{ env.ref_name }} packages - - uses: emqx/upload-assets@8d2083b4dbe3151b0b735572eaa153b6acb647fe # 0.5.0 + - uses: emqx/upload-assets@974befcf0e72a1811360a81c798855efb66b0551 # 0.5.2 env: GITHUB_TOKEN: ${{ github.token }} with: asset_paths: '["packages/*"]' tag_name: "${{ env.ref_name }}" + skip_existing: true - name: update to emqx.io if: startsWith(env.ref_name, 'v') && ((github.event_name == 'release' && !github.event.release.prerelease) || inputs.publish_release_artefacts) run: | diff --git a/.github/workflows/run_docker_tests.yaml b/.github/workflows/run_docker_tests.yaml index cfc67be75..527c1cb32 100644 --- a/.github/workflows/run_docker_tests.yaml +++ b/.github/workflows/run_docker_tests.yaml @@ -47,6 +47,9 @@ jobs: echo "_EMQX_DOCKER_IMAGE_TAG=$_EMQX_DOCKER_IMAGE_TAG" >> $GITHUB_ENV - name: dashboard tests working-directory: ./scripts/ui-tests + env: + EMQX_VERSION: ${{ inputs.version-emqx }} + EMQX_ENTERPRISE_VERSION: ${{ inputs.version-emqx-enterprise }} run: | set -eu docker compose up --abort-on-container-exit --exit-code-from selenium diff --git a/.github/workflows/run_emqx_app_tests.yaml b/.github/workflows/run_emqx_app_tests.yaml index 67175a37c..cd0de1582 100644 --- a/.github/workflows/run_emqx_app_tests.yaml +++ b/.github/workflows/run_emqx_app_tests.yaml @@ -35,6 +35,7 @@ jobs: shell: bash outputs: matrix: ${{ steps.matrix.outputs.matrix }} + skip: ${{ steps.matrix.outputs.skip }} steps: - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 with: @@ -49,12 +50,16 @@ jobs: changed_files="$(git diff --name-only ${BEFORE_REF} ${AFTER_REF} apps/emqx)" if [ "$changed_files" = '' ]; then echo "nothing changed in apps/emqx, ignored." 
- echo "matrix=[]" | tee -a $GITHUB_OUTPUT + echo 'matrix=[]' | tee -a $GITHUB_OUTPUT + echo 'skip=true' | tee -a $GITHUB_OUTPUT exit 0 + else + echo 'skip=false' | tee -a $GITHUB_OUTPUT + echo 'matrix=[{"type": "eunit_proper_and_static"},{"type": "1_3"},{"type": "2_3"},{"type": "3_3"}]' | tee -a $GITHUB_OUTPUT fi - echo 'matrix=[{"type": "eunit_proper_and_static"},{"type": "1_3"},{"type": "2_3"},{"type": "3_3"}]' | tee -a $GITHUB_OUTPUT run_emqx_app_tests: + if: needs.prepare_matrix.outputs.skip != 'true' needs: - prepare_matrix runs-on: ${{ endsWith(github.repository, '/emqx') && 'ubuntu-22.04' || fromJSON('["self-hosted","ephemeral","linux","x64"]') }} diff --git a/.gitignore b/.gitignore index 5e91d4bc5..d5338d5c4 100644 --- a/.gitignore +++ b/.gitignore @@ -76,3 +76,5 @@ rebar-git-cache.tar .docker_image_tag .emqx_docker_image_tags .git/ +apps/emqx_utils/src/emqx_variform_parser.erl +apps/emqx_utils/src/emqx_variform_scan.erl diff --git a/Makefile b/Makefile index 1753dd4d8..b899017b4 100644 --- a/Makefile +++ b/Makefile @@ -20,8 +20,8 @@ endif # Dashboard version # from https://github.com/emqx/emqx-dashboard5 -export EMQX_DASHBOARD_VERSION ?= v1.8.0 -export EMQX_EE_DASHBOARD_VERSION ?= e1.6.0 +export EMQX_DASHBOARD_VERSION ?= v1.9.0-beta.1 +export EMQX_EE_DASHBOARD_VERSION ?= e1.7.0-beta.1 PROFILE ?= emqx REL_PROFILES := emqx emqx-enterprise diff --git a/apps/emqx/include/emqx_mqtt.hrl b/apps/emqx/include/emqx_mqtt.hrl index 63e2799fd..09f7495ea 100644 --- a/apps/emqx/include/emqx_mqtt.hrl +++ b/apps/emqx/include/emqx_mqtt.hrl @@ -673,7 +673,6 @@ end). -define(SHARE, "$share"). -define(QUEUE, "$queue"). --define(SHARE(Group, Topic), emqx_topic:join([<>, Group, Topic])). -define(REDISPATCH_TO(GROUP, TOPIC), {GROUP, TOPIC}). diff --git a/apps/emqx/include/emqx_trace.hrl b/apps/emqx/include/emqx_trace.hrl index 5c50fa706..d1e70b184 100644 --- a/apps/emqx/include/emqx_trace.hrl +++ b/apps/emqx/include/emqx_trace.hrl @@ -20,12 +20,17 @@ -record(?TRACE, { name :: binary() | undefined | '_', - type :: clientid | topic | ip_address | undefined | '_', + type :: clientid | topic | ip_address | ruleid | undefined | '_', filter :: - emqx_types:topic() | emqx_types:clientid() | emqx_trace:ip_address() | undefined | '_', + emqx_types:topic() + | emqx_types:clientid() + | emqx_trace:ip_address() + | emqx_trace:ruleid() + | undefined + | '_', enable = true :: boolean() | '_', payload_encode = text :: hex | text | hidden | '_', - extra = #{} :: map() | '_', + extra = #{formatter => text} :: #{formatter => text | json} | '_', start_at :: integer() | undefined | '_', end_at :: integer() | undefined | '_' }). diff --git a/apps/emqx/include/logger.hrl b/apps/emqx/include/logger.hrl index 7bd542b0d..31fe0e36a 100644 --- a/apps/emqx/include/logger.hrl +++ b/apps/emqx/include/logger.hrl @@ -44,11 +44,20 @@ ). -define(SLOG_THROTTLE(Level, Data, Meta), - case emqx_log_throttler:allow(maps:get(msg, Data)) of + case logger:allow(Level, ?MODULE) of true -> - ?SLOG(Level, Data, Meta); + (fun(#{msg := __Msg} = __Data) -> + case emqx_log_throttler:allow(__Msg) of + true -> + logger:log(Level, __Data, Meta); + false -> + ?_DO_TRACE(Level, __Msg, maps:merge(__Data, Meta)) + end + end)( + Data + ); false -> - ?_DO_TRACE(Level, maps:get(msg, Data), maps:merge(Data, Meta)) + ok end ). 
diff --git a/apps/emqx/integration_test/emqx_persistent_session_ds_SUITE.erl b/apps/emqx/integration_test/emqx_persistent_session_ds_SUITE.erl index 39764af30..a5260f780 100644 --- a/apps/emqx/integration_test/emqx_persistent_session_ds_SUITE.erl +++ b/apps/emqx/integration_test/emqx_persistent_session_ds_SUITE.erl @@ -184,7 +184,7 @@ list_all_pubranges(Node) -> session_open(Node, ClientId) -> ClientInfo = #{}, - ConnInfo = #{peername => {undefined, undefined}}, + ConnInfo = #{peername => {undefined, undefined}, proto_name => <<"MQTT">>, proto_ver => 5}, WillMsg = undefined, erpc:call( Node, @@ -252,7 +252,6 @@ t_session_subscription_idempotency(Config) -> ok end, fun(Trace) -> - ct:pal("trace:\n ~p", [Trace]), Session = session_open(Node1, ClientId), ?assertMatch( #{SubTopicFilter := #{}}, @@ -326,7 +325,6 @@ t_session_unsubscription_idempotency(Config) -> ok end, fun(Trace) -> - ct:pal("trace:\n ~p", [Trace]), Session = session_open(Node1, ClientId), ?assertEqual( #{}, @@ -415,10 +413,7 @@ do_t_session_discard(Params) -> ok end, - fun(Trace) -> - ct:pal("trace:\n ~p", [Trace]), - ok - end + [] ), ok. diff --git a/apps/emqx/rebar.config b/apps/emqx/rebar.config index 4e850b2cc..b58734cf8 100644 --- a/apps/emqx/rebar.config +++ b/apps/emqx/rebar.config @@ -27,8 +27,8 @@ {lc, {git, "https://github.com/emqx/lc.git", {tag, "0.3.2"}}}, {gproc, {git, "https://github.com/emqx/gproc", {tag, "0.9.0.1"}}}, {cowboy, {git, "https://github.com/emqx/cowboy", {tag, "2.9.2"}}}, - {esockd, {git, "https://github.com/emqx/esockd", {tag, "5.11.1"}}}, - {ekka, {git, "https://github.com/emqx/ekka", {tag, "0.19.1"}}}, + {esockd, {git, "https://github.com/emqx/esockd", {tag, "5.11.2"}}}, + {ekka, {git, "https://github.com/emqx/ekka", {tag, "0.19.3"}}}, {gen_rpc, {git, "https://github.com/emqx/gen_rpc", {tag, "3.3.1"}}}, {hocon, {git, "https://github.com/emqx/hocon.git", {tag, "0.42.1"}}}, {emqx_http_lib, {git, "https://github.com/emqx/emqx_http_lib.git", {tag, "0.5.3"}}}, diff --git a/apps/emqx/src/bhvrs/emqx_config_backup.erl b/apps/emqx/src/bhvrs/emqx_config_backup.erl index e4818a871..1ec08c23b 100644 --- a/apps/emqx/src/bhvrs/emqx_config_backup.erl +++ b/apps/emqx/src/bhvrs/emqx_config_backup.erl @@ -16,9 +16,14 @@ -module(emqx_config_backup). +-type ok_result() :: #{ + root_key => emqx_utils_maps:config_key(), + changed => [emqx_utils_maps:config_key_path()] +}. + +-type error_result() :: #{root_key => emqx_utils_maps:config_key(), reason => term()}. + -callback import_config(RawConf :: map()) -> - {ok, #{ - root_key => emqx_utils_maps:config_key(), - changed => [emqx_utils_maps:config_key_path()] - }} - | {error, #{root_key => emqx_utils_maps:config_key(), reason => term()}}. + {ok, ok_result()} + | {error, error_result()} + | {results, {[ok_result()], [error_result()]}}. diff --git a/apps/emqx/src/emqx_broker.erl b/apps/emqx/src/emqx_broker.erl index 1470b7d8b..ed29ea614 100644 --- a/apps/emqx/src/emqx_broker.erl +++ b/apps/emqx/src/emqx_broker.erl @@ -253,8 +253,12 @@ persist_publish(Msg) -> case emqx_persistent_message:persist(Msg) of ok -> [persisted]; - {_SkipOrError, _Reason} -> - % TODO: log errors? + {skipped, _} -> + []; + {error, Recoverable, Reason} -> + ?SLOG(debug, #{ + msg => "failed_to_persist_message", is_recoverable => Recoverable, reason => Reason + }), [] end. 
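%% ----------------------------------------------------------------------
%% Editorial note on the persist_publish/1 change above: persist results
%% are no longer collapsed into an ambiguous `{_SkipOrError, _Reason}'
%% match, so skips and storage errors take distinct paths. A hedged sketch
%% of handling the full return shape; the module and function names are
%% illustrative, and the `unrecoverable' atom is assumed from the shape of
%% emqx_ds:store_batch_result() rather than shown in this diff:
-module(persist_result_demo).
-export([classify/1]).

classify(ok) -> persisted;
classify({skipped, needs_no_persistence}) -> skipped;
classify({error, recoverable, _Reason}) -> retry_later;
classify({error, unrecoverable, Reason}) -> {drop, Reason}.
%% ----------------------------------------------------------------------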
diff --git a/apps/emqx/src/emqx_channel.erl b/apps/emqx/src/emqx_channel.erl index 27babfcc9..efb5133bc 100644 --- a/apps/emqx/src/emqx_channel.erl +++ b/apps/emqx/src/emqx_channel.erl @@ -251,7 +251,7 @@ init( MP -> MP end, ListenerId = emqx_listeners:listener_id(Type, Listener), - ClientInfo0 = set_peercert_infos( + ClientInfo = set_peercert_infos( Peercert, #{ zone => Zone, @@ -269,8 +269,6 @@ init( }, Zone ), - AttrExtractionConfig = get_mqtt_conf(Zone, client_attrs_init), - ClientInfo = initialize_client_attrs_from_cert(AttrExtractionConfig, ClientInfo0, Peercert), {NClientInfo, NConnInfo} = take_ws_cookie(ClientInfo, ConnInfo), #channel{ conninfo = NConnInfo, @@ -1575,7 +1573,7 @@ enrich_client(ConnPkt, Channel = #channel{clientinfo = ClientInfo}) -> fun maybe_username_as_clientid/2, fun maybe_assign_clientid/2, %% attr init should happen after clientid and username assign - fun maybe_set_client_initial_attr/2 + fun maybe_set_client_initial_attrs/2 ], ConnPkt, ClientInfo @@ -1587,47 +1585,6 @@ enrich_client(ConnPkt, Channel = #channel{clientinfo = ClientInfo}) -> {error, ReasonCode, Channel#channel{clientinfo = NClientInfo}} end. -initialize_client_attrs_from_cert( - #{ - extract_from := From, - extract_regexp := Regexp, - extract_as := AttrName - }, - ClientInfo, - Peercert -) when From =:= cn orelse From =:= dn -> - case extract_client_attr_from_cert(From, Regexp, Peercert) of - {ok, Value} -> - ?SLOG( - debug, - #{ - msg => "client_attr_init_from_cert", - extracted_as => AttrName, - extracted_value => Value - } - ), - ClientInfo#{client_attrs => #{AttrName => Value}}; - _ -> - ClientInfo#{client_attrs => #{}} - end; -initialize_client_attrs_from_cert(_, ClientInfo, _Peercert) -> - ClientInfo. - -extract_client_attr_from_cert(cn, Regexp, Peercert) -> - CN = esockd_peercert:common_name(Peercert), - re_extract(CN, Regexp); -extract_client_attr_from_cert(dn, Regexp, Peercert) -> - DN = esockd_peercert:subject(Peercert), - re_extract(DN, Regexp). - -re_extract(Str, Regexp) when is_binary(Str) -> - case re:run(Str, Regexp, [{capture, all_but_first, list}]) of - {match, [_ | _] = List} -> {ok, iolist_to_binary(List)}; - _ -> nomatch - end; -re_extract(_NotStr, _Regexp) -> - ignored. - set_username( #mqtt_packet_connect{username = Username}, ClientInfo = #{username := undefined} @@ -1668,75 +1625,50 @@ maybe_assign_clientid(#mqtt_packet_connect{clientid = <<>>}, ClientInfo) -> maybe_assign_clientid(#mqtt_packet_connect{clientid = ClientId}, ClientInfo) -> {ok, ClientInfo#{clientid => ClientId}}. -maybe_set_client_initial_attr(ConnPkt, #{zone := Zone} = ClientInfo0) -> - Config = get_mqtt_conf(Zone, client_attrs_init), - ClientInfo = initialize_client_attrs_from_user_property(Config, ConnPkt, ClientInfo0), - Attrs = maps:get(client_attrs, ClientInfo, #{}), - case extract_attr_from_clientinfo(Config, ClientInfo) of - {ok, Value} -> - #{extract_as := Name} = Config, - ?SLOG( - debug, - #{ - msg => "client_attr_init_from_clientinfo", - extracted_as => Name, - extracted_value => Value - } - ), - {ok, ClientInfo#{client_attrs => Attrs#{Name => Value}}}; - _ -> - {ok, ClientInfo} - end. +get_client_attrs_init_config(Zone) -> + get_mqtt_conf(Zone, client_attrs_init, []). 
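%% ----------------------------------------------------------------------
%% Editorial note: the regexp-based extraction helpers removed below are
%% superseded by the variform expressions rendered in
%% initialize_client_attrs/2 above. A hedged single-entry sketch of that
%% data flow; only emqx_variform:render/2 and the
%% `#{expression := _, set_as_attr := _}' entry shape are taken from this
%% patch, the function name is illustrative:
demo_attr_init(#{expression := Variform, set_as_attr := Name}, ClientInfo) ->
    Attrs = maps:get(client_attrs, ClientInfo, #{}),
    case emqx_variform:render(Variform, ClientInfo) of
        {ok, Value} -> ClientInfo#{client_attrs => Attrs#{Name => Value}};
        {error, _Reason} -> ClientInfo
    end.
%% ----------------------------------------------------------------------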
-initialize_client_attrs_from_user_property( - #{ - extract_from := user_property, - extract_as := PropertyKey - }, - ConnPkt, - ClientInfo -) -> - case extract_client_attr_from_user_property(ConnPkt, PropertyKey) of - {ok, Value} -> - ?SLOG( - debug, - #{ - msg => "client_attr_init_from_user_property", - extracted_as => PropertyKey, - extracted_value => Value - } - ), - ClientInfo#{client_attrs => #{PropertyKey => Value}}; - _ -> - ClientInfo - end; -initialize_client_attrs_from_user_property(_, _ConnInfo, ClientInfo) -> - ClientInfo. +maybe_set_client_initial_attrs(ConnPkt, #{zone := Zone} = ClientInfo) -> + Inits = get_client_attrs_init_config(Zone), + UserProperty = get_user_property_as_map(ConnPkt), + {ok, initialize_client_attrs(Inits, ClientInfo#{user_property => UserProperty})}. -extract_client_attr_from_user_property( - #mqtt_packet_connect{properties = #{'User-Property' := UserProperty}}, PropertyKey -) -> - case lists:keyfind(PropertyKey, 1, UserProperty) of - {_, Value} -> - {ok, Value}; - _ -> - not_found - end; -extract_client_attr_from_user_property(_ConnPkt, _PropertyKey) -> - ignored. +initialize_client_attrs(Inits, ClientInfo) -> + lists:foldl( + fun(#{expression := Variform, set_as_attr := Name}, Acc) -> + Attrs = maps:get(client_attrs, ClientInfo, #{}), + case emqx_variform:render(Variform, ClientInfo) of + {ok, Value} -> + ?SLOG( + debug, + #{ + msg => "client_attr_initialized", + set_as_attr => Name, + attr_value => Value + } + ), + Acc#{client_attrs => Attrs#{Name => Value}}; + {error, Reason} -> + ?SLOG( + warning, + #{ + msg => "client_attr_initialization_failed", + reason => Reason + } + ), + Acc + end + end, + ClientInfo, + Inits + ). -extract_attr_from_clientinfo(#{extract_from := clientid, extract_regexp := Regexp}, #{ - clientid := ClientId -}) -> - re_extract(ClientId, Regexp); -extract_attr_from_clientinfo(#{extract_from := username, extract_regexp := Regexp}, #{ - username := Username -}) when - Username =/= undefined +get_user_property_as_map(#mqtt_packet_connect{properties = #{'User-Property' := UserProperty}}) when + is_list(UserProperty) -> - re_extract(Username, Regexp); -extract_attr_from_clientinfo(_Config, _CLientInfo) -> - ignored. + maps:from_list(UserProperty); +get_user_property_as_map(_) -> + #{}. fix_mountpoint(#{mountpoint := undefined} = ClientInfo) -> ClientInfo; diff --git a/apps/emqx/src/emqx_metrics.erl b/apps/emqx/src/emqx_metrics.erl index 6b8b60209..13ac40c68 100644 --- a/apps/emqx/src/emqx_metrics.erl +++ b/apps/emqx/src/emqx_metrics.erl @@ -222,7 +222,9 @@ % Messages delivered {counter, 'messages.delivered'}, % Messages acked - {counter, 'messages.acked'} + {counter, 'messages.acked'}, + % Messages persistently stored + {counter, 'messages.persisted'} ] ). @@ -718,4 +720,5 @@ reserved_idx('overload_protection.gc') -> 403; reserved_idx('overload_protection.new_conn') -> 404; reserved_idx('messages.validation_succeeded') -> 405; reserved_idx('messages.validation_failed') -> 406; +reserved_idx('messages.persisted') -> 407; reserved_idx(_) -> undefined. diff --git a/apps/emqx/src/emqx_persistent_message.erl b/apps/emqx/src/emqx_persistent_message.erl index e3fa23296..c909c5c5f 100644 --- a/apps/emqx/src/emqx_persistent_message.erl +++ b/apps/emqx/src/emqx_persistent_message.erl @@ -98,7 +98,7 @@ pre_config_update(_Root, _NewConf, _OldConf) -> %%-------------------------------------------------------------------- -spec persist(emqx_types:message()) -> - ok | {skipped, _Reason} | {error, _TODO}. 
+ emqx_ds:store_batch_result() | {skipped, needs_no_persistence}. persist(Msg) -> ?WHEN_ENABLED( case needs_persistence(Msg) andalso has_subscribers(Msg) of @@ -114,6 +114,7 @@ needs_persistence(Msg) -> -spec store_message(emqx_types:message()) -> emqx_ds:store_batch_result(). store_message(Msg) -> + emqx_metrics:inc('messages.persisted'), emqx_ds:store_batch(?PERSISTENT_MESSAGE_DB, [Msg], #{sync => false}). has_subscribers(#message{topic = Topic}) -> diff --git a/apps/emqx/src/emqx_persistent_session_ds.erl b/apps/emqx/src/emqx_persistent_session_ds.erl index c1ed6aabd..4bfefe5b6 100644 --- a/apps/emqx/src/emqx_persistent_session_ds.erl +++ b/apps/emqx/src/emqx_persistent_session_ds.erl @@ -75,7 +75,8 @@ %% Managment APIs: -export([ - list_client_subscriptions/1 + list_client_subscriptions/1, + get_client_subscription/2 ]). %% session table operations @@ -116,15 +117,42 @@ %% Currently, this is the clientid. We avoid `emqx_types:clientid()' because that can be %% an atom, in theory (?). -type id() :: binary(). --type topic_filter() :: emqx_types:topic(). +-type topic_filter() :: emqx_types:topic() | #share{}. + +%% Subscription and subscription states: +%% +%% Persistent sessions cannot simply update or delete subscriptions, +%% since subscription parameters must be exactly the same during +%% replay. +%% +%% To solve this problem, we store subscriptions in a twofold manner: +%% +%% - `subscription' is an object that holds up-to-date information +%% about the client's subscription and a reference to the latest +%% subscription state id +%% +%% - `subscription_state' is an immutable object that holds +%% information about the subscription parameters at a certain point of +%% time +%% +%% New subscription states are created whenever the client subscribes +%% to a topic, or updates an existing subscription. +%% +%% Stream replay states contain references to the subscription states. +%% +%% Outdated subscription states are discarded when they are not +%% referenced by either subscription or stream replay state objects. -type subscription_id() :: integer(). +%% This type is a result of merging +%% `emqx_persistent_session_ds_subs:subscription()' with its current +%% state. -type subscription() :: #{ id := subscription_id(), start_time := emqx_ds:time(), - props := map(), - deleted := boolean() + current_state := emqx_persistent_session_ds_subs:subscription_state_id(), + subopts := map() }. -define(TIMER_PULL, timer_pull). @@ -184,7 +212,9 @@ seqno_q2_dup, seqno_q2_rec, seqno_q2_next, - n_streams + n_streams, + awaiting_rel_cnt, + awaiting_rel_max ]).
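%% ----------------------------------------------------------------------
%% Editorial note on the `awaiting_rel_cnt' / `awaiting_rel_max' info keys
%% added above: they are backed by the new awaiting_rel pmap in
%% emqx_persistent_session_ds_state (further down in this patch). A hedged
%% sketch of the QoS2 receive-side lifecycle, using only accessors this
%% patch introduces; the function is illustrative and `S0' stands for an
%% opened, initially empty emqx_persistent_session_ds_state:t():
qos2_lifecycle(PacketId, S0) ->
    %% Incoming QoS2 PUBLISH: remember the packet id with its timestamp.
    Ts = erlang:system_time(millisecond),
    S1 = emqx_persistent_session_ds_state:put_awaiting_rel(PacketId, Ts, S0),
    1 = emqx_persistent_session_ds_state:n_awaiting_rel(S1),
    %% PUBREL releases it; expiry instead folds over entries older than
    %% `await_rel_timeout' and deletes them (see do_expire/2 below).
    emqx_persistent_session_ds_state:del_awaiting_rel(PacketId, S1).
%% ----------------------------------------------------------------------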
%% @@ -206,7 +236,8 @@ open(#{clientid := ClientID} = ClientInfo, ConnInfo, MaybeWillMsg, Conf) -> ok = emqx_cm:takeover_kick(ClientID), case session_open(ClientID, ClientInfo, ConnInfo, MaybeWillMsg) of Session0 = #{} -> - Session = Session0#{props => Conf}, + Session1 = Session0#{props => Conf}, + Session = do_expire(ClientInfo, Session1), {true, ensure_timers(Session), []}; false -> false @@ -249,7 +280,7 @@ info(is_persistent, #{}) -> info(subscriptions, #{s := S}) -> emqx_persistent_session_ds_subs:to_map(S); info(subscriptions_cnt, #{s := S}) -> - emqx_topic_gbt:size(emqx_persistent_session_ds_state:get_subscriptions(S)); + emqx_persistent_session_ds_state:n_subscriptions(S); info(subscriptions_max, #{props := Conf}) -> maps:get(max_subscriptions, Conf); info(upgrade_qos, #{props := Conf}) -> @@ -262,21 +293,21 @@ info(inflight_max, #{inflight := Inflight}) -> emqx_persistent_session_ds_inflight:receive_maximum(Inflight); info(retry_interval, #{props := Conf}) -> maps:get(retry_interval, Conf); -% info(mqueue, #sessmem{mqueue = MQueue}) -> -% MQueue; info(mqueue_len, #{inflight := Inflight}) -> emqx_persistent_session_ds_inflight:n_buffered(all, Inflight); -% info(mqueue_max, #sessmem{mqueue = MQueue}) -> -% emqx_mqueue:max_len(MQueue); info(mqueue_dropped, _Session) -> 0; %% info(next_pkt_id, #{s := S}) -> %% {PacketId, _} = emqx_persistent_message_ds_replayer:next_packet_id(S), %% PacketId; -% info(awaiting_rel, #sessmem{awaiting_rel = AwaitingRel}) -> -% AwaitingRel; -%% info(awaiting_rel_cnt, #{s := S}) -> -%% seqno_diff(?QOS_2, ?rec, ?committed(?QOS_2), S); +info(awaiting_rel, #{s := S}) -> + emqx_persistent_session_ds_state:fold_awaiting_rel(fun maps:put/3, #{}, S); +info(awaiting_rel_max, #{props := Conf}) -> + maps:get(max_awaiting_rel, Conf); +info(awaiting_rel_cnt, #{s := S}) -> + emqx_persistent_session_ds_state:n_awaiting_rel(S); +info(await_rel_timeout, #{props := Conf}) -> + maps:get(await_rel_timeout, Conf); info(seqno_q1_comm, #{s := S}) -> emqx_persistent_session_ds_state:get_seqno(?committed(?QOS_1), S); info(seqno_q1_dup, #{s := S}) -> @@ -292,17 +323,7 @@ info(seqno_q2_rec, #{s := S}) -> info(seqno_q2_next, #{s := S}) -> emqx_persistent_session_ds_state:get_seqno(?next(?QOS_2), S); info(n_streams, #{s := S}) -> - emqx_persistent_session_ds_state:fold_streams( - fun(_, _, Acc) -> Acc + 1 end, - 0, - S - ); -info(awaiting_rel_max, #{props := Conf}) -> - maps:get(max_awaiting_rel, Conf); -info(await_rel_timeout, #{props := _Conf}) -> - %% TODO: currently this setting is ignored: - %% maps:get(await_rel_timeout, Conf). - 0; + emqx_persistent_session_ds_state:n_streams(S); info({MsgsQ, _PagerParams}, _Session) when MsgsQ =:= mqueue_msgs; MsgsQ =:= inflight_msgs -> {error, not_implemented}. @@ -337,93 +358,49 @@ print_session(ClientId) -> -spec subscribe(topic_filter(), emqx_types:subopts(), session()) -> {ok, session()} | {error, emqx_types:reason_code()}. +subscribe( + #share{}, + _SubOpts, + _Session +) -> + %% TODO: Shared subscriptions are not supported yet: + {error, ?RC_SHARED_SUBSCRIPTIONS_NOT_SUPPORTED}; subscribe( TopicFilter, SubOpts, - Session = #{id := ID, s := S0} + Session ) -> - case emqx_persistent_session_ds_subs:lookup(TopicFilter, S0) of - undefined -> - %% TODO: max subscriptions - - %% N.B.: we chose to update the router before adding the - %% subscription to the session/iterator table. 
The - %% reasoning for this is as follows: - %% - %% Messages matching this topic filter should start to be - %% persisted as soon as possible to avoid missing - %% messages. If this is the first such persistent session - %% subscription, it's important to do so early on. - %% - %% This could, in turn, lead to some inconsistency: if - %% such a route gets created but the session/iterator data - %% fails to be updated accordingly, we have a dangling - %% route. To remove such dangling routes, we may have a - %% periodic GC process that removes routes that do not - %% have a matching persistent subscription. Also, route - %% operations use dirty mnesia operations, which - %% inherently have room for inconsistencies. - %% - %% In practice, we use the iterator reference table as a - %% source of truth, since it is guarded by a transaction - %% context: we consider a subscription operation to be - %% successful if it ended up changing this table. Both - %% router and iterator information can be reconstructed - %% from this table, if needed. - ok = emqx_persistent_session_ds_router:do_add_route(TopicFilter, ID), - {SubId, S1} = emqx_persistent_session_ds_state:new_id(S0), - Subscription = #{ - start_time => now_ms(), - props => SubOpts, - id => SubId, - deleted => false - }, - IsNew = true; - Subscription0 = #{} -> - Subscription = Subscription0#{props => SubOpts}, - IsNew = false, - S1 = S0 - end, - S = emqx_persistent_session_ds_subs:on_subscribe(TopicFilter, Subscription, S1), - ?tp(persistent_session_ds_subscription_added, #{ - topic_filter => TopicFilter, sub => Subscription, is_new => IsNew - }), - {ok, Session#{s => S}}. + case emqx_persistent_session_ds_subs:on_subscribe(TopicFilter, SubOpts, Session) of + {ok, S1} -> + S = emqx_persistent_session_ds_state:commit(S1), + {ok, Session#{s => S}}; + Error = {error, _} -> + Error + end. -spec unsubscribe(topic_filter(), session()) -> {ok, session(), emqx_types:subopts()} | {error, emqx_types:reason_code()}. unsubscribe( TopicFilter, - Session = #{id := ID, s := S0} + Session = #{id := SessionId, s := S0} ) -> - case emqx_persistent_session_ds_subs:lookup(TopicFilter, S0) of - undefined -> - {error, ?RC_NO_SUBSCRIPTION_EXISTED}; - Subscription = #{props := SubOpts} -> - S = do_unsubscribe(ID, TopicFilter, Subscription, S0), - {ok, Session#{s => S}, SubOpts} + case emqx_persistent_session_ds_subs:on_unsubscribe(SessionId, TopicFilter, S0) of + {ok, S1, #{id := SubId, subopts := SubOpts}} -> + S2 = emqx_persistent_session_ds_stream_scheduler:on_unsubscribe(SubId, S1), + S = emqx_persistent_session_ds_state:commit(S2), + {ok, Session#{s => S}, SubOpts}; + Error = {error, _} -> + Error end. --spec do_unsubscribe(id(), topic_filter(), subscription(), emqx_persistent_session_ds_state:t()) -> - emqx_persistent_session_ds_state:t(). -do_unsubscribe(SessionId, TopicFilter, Subscription = #{id := SubId}, S0) -> - S1 = emqx_persistent_session_ds_subs:on_unsubscribe(TopicFilter, Subscription, S0), - ?tp(persistent_session_ds_subscription_delete, #{ - session_id => SessionId, topic_filter => TopicFilter - }), - S = emqx_persistent_session_ds_stream_scheduler:on_unsubscribe(SubId, S1), - ?tp_span( - persistent_session_ds_subscription_route_delete, - #{session_id => SessionId, topic_filter => TopicFilter}, - ok = emqx_persistent_session_ds_router:do_delete_route(TopicFilter, SessionId) - ), - S. - -spec get_subscription(topic_filter(), session()) -> emqx_types:subopts() | undefined. 
+get_subscription(#share{}, _) -> + %% TODO: shared subscriptions are not supported yet: + undefined; get_subscription(TopicFilter, #{s := S}) -> case emqx_persistent_session_ds_subs:lookup(TopicFilter, S) of - _Subscription = #{props := SubOpts} -> + #{subopts := SubOpts} -> SubOpts; undefined -> undefined @@ -436,11 +413,72 @@ get_subscription(TopicFilter, #{s := S}) -> -spec publish(emqx_types:packet_id(), emqx_types:message(), session()) -> {ok, emqx_types:publish_result(), session()} | {error, emqx_types:reason_code()}. +publish( + PacketId, + Msg = #message{qos = ?QOS_2, timestamp = Ts}, + Session = #{s := S0} +) -> + case is_awaiting_full(Session) of + false -> + case emqx_persistent_session_ds_state:get_awaiting_rel(PacketId, S0) of + undefined -> + Results = emqx_broker:publish(Msg), + S = emqx_persistent_session_ds_state:put_awaiting_rel(PacketId, Ts, S0), + {ok, Results, Session#{s => S}}; + _Ts -> + {error, ?RC_PACKET_IDENTIFIER_IN_USE} + end; + true -> + {error, ?RC_RECEIVE_MAXIMUM_EXCEEDED} + end; publish(_PacketId, Msg, Session) -> - %% TODO: QoS2 Result = emqx_broker:publish(Msg), {ok, Result, Session}. +is_awaiting_full(#{s := S, props := Props}) -> + emqx_persistent_session_ds_state:n_awaiting_rel(S) >= + maps:get(max_awaiting_rel, Props, infinity). + +-spec expire(emqx_types:clientinfo(), session()) -> + {ok, [], timeout(), session()} | {ok, [], session()}. +expire(ClientInfo, Session0 = #{props := Props}) -> + Session = #{s := S} = do_expire(ClientInfo, Session0), + case emqx_persistent_session_ds_state:n_awaiting_rel(S) of + 0 -> + {ok, [], Session}; + _ -> + AwaitRelTimeout = maps:get(await_rel_timeout, Props), + {ok, [], AwaitRelTimeout, Session} + end. + +do_expire(ClientInfo, Session = #{s := S0, props := Props}) -> + %% 1. Find expired packet IDs: + Now = erlang:system_time(millisecond), + AwaitRelTimeout = maps:get(await_rel_timeout, Props), + ExpiredPacketIds = + emqx_persistent_session_ds_state:fold_awaiting_rel( + fun(PacketId, Ts, Acc) -> + Age = Now - Ts, + case Age > AwaitRelTimeout of + true -> + [PacketId | Acc]; + false -> + Acc + end + end, + [], + S0 + ), + %% 2. Perform side effects: + _ = emqx_session_events:handle_event(ClientInfo, {expired_rel, length(ExpiredPacketIds)}), + %% 3. Update state: + S = lists:foldl( + fun emqx_persistent_session_ds_state:del_awaiting_rel/2, + S0, + ExpiredPacketIds + ), + Session#{s => S}. + %%-------------------------------------------------------------------- %% Client -> Broker: PUBACK %%-------------------------------------------------------------------- @@ -477,9 +515,14 @@ pubrec(PacketId, Session0) -> -spec pubrel(emqx_types:packet_id(), session()) -> {ok, session()} | {error, emqx_types:reason_code()}. -pubrel(_PacketId, Session = #{}) -> - % TODO: stub - {ok, Session}. +pubrel(PacketId, Session = #{s := S0}) -> + case emqx_persistent_session_ds_state:get_awaiting_rel(PacketId, S0) of + undefined -> + {error, ?RC_PACKET_IDENTIFIER_NOT_FOUND}; + _TS -> + S = emqx_persistent_session_ds_state:del_awaiting_rel(PacketId, S0), + {ok, Session#{s => S}} + end. %%-------------------------------------------------------------------- %% Client -> Broker: PUBCOMP @@ -552,6 +595,8 @@ handle_timeout(_ClientInfo, #req_sync{from = From, ref = Ref}, Session = #{s := S = emqx_persistent_session_ds_state:commit(S0), From ! 
Ref, {ok, [], Session#{s => S}}; +handle_timeout(ClientInfo, expire_awaiting_rel, Session) -> + expire(ClientInfo, Session); handle_timeout(_ClientInfo, Timeout, Session) -> ?SLOG(warning, #{msg => "unknown_ds_timeout", timeout => Timeout}), {ok, [], Session}. @@ -571,7 +616,7 @@ replay(ClientInfo, [], Session0 = #{s := S0}) -> Session = replay_streams(Session0#{replay => Streams}, ClientInfo), {ok, [], Session}. -replay_streams(Session0 = #{replay := [{_StreamKey, Srs0} | Rest]}, ClientInfo) -> +replay_streams(Session0 = #{replay := [{StreamKey, Srs0} | Rest]}, ClientInfo) -> case replay_batch(Srs0, Session0, ClientInfo) of Session = #{} -> replay_streams(Session#{replay := Rest}, ClientInfo); @@ -579,7 +624,7 @@ replay_streams(Session0 = #{replay := [{_StreamKey, Srs0} | Rest]}, ClientInfo) RetryTimeout = ?TIMEOUT_RETRY_REPLAY, ?SLOG(warning, #{ msg => "failed_to_fetch_replay_batch", - stream => Srs0, + stream => StreamKey, reason => Reason, class => recoverable, retry_in_ms => RetryTimeout @@ -645,7 +690,7 @@ list_client_subscriptions(ClientId) -> %% TODO: this is not the most optimal implementation, since it %% should be possible to avoid reading extra data (streams, etc.) case print_session(ClientId) of - Sess = #{s := #{subscriptions := Subs}} -> + Sess = #{s := #{subscriptions := Subs, subscription_states := SStates}} -> Node = case Sess of #{'_alive' := {true, Pid}} -> @@ -655,8 +700,9 @@ list_client_subscriptions(ClientId) -> end, SubList = maps:fold( - fun(Topic, #{props := SubProps}, Acc) -> - Elem = {Topic, SubProps}, + fun(Topic, #{current_state := CS}, Acc) -> + #{subopts := SubOpts} = maps:get(CS, SStates), + Elem = {Topic, SubOpts}, [Elem | Acc] end, [], @@ -670,6 +716,11 @@ list_client_subscriptions(ClientId) -> {error, not_found} end. +-spec get_client_subscription(emqx_types:clientid(), emqx_types:topic()) -> + subscription() | undefined. +get_client_subscription(ClientId, Topic) -> + emqx_persistent_session_ds_subs:cold_get_subscription(ClientId, Topic). + %%-------------------------------------------------------------------- %% Session tables operations %%-------------------------------------------------------------------- @@ -701,7 +752,12 @@ sync(ClientId) -> %% the broker. -spec session_open(id(), emqx_types:clientinfo(), emqx_types:conninfo(), emqx_maybe:t(message())) -> session() | false. -session_open(SessionId, ClientInfo, NewConnInfo, MaybeWillMsg) -> +session_open( + SessionId, + ClientInfo, + NewConnInfo = #{proto_name := ProtoName, proto_ver := ProtoVer}, + MaybeWillMsg +) -> NowMS = now_ms(), case emqx_persistent_session_ds_state:open(SessionId) of {ok, S0} -> @@ -720,8 +776,9 @@ session_open(SessionId, ClientInfo, NewConnInfo, MaybeWillMsg) -> maps:get(peername, NewConnInfo), S2 ), S4 = emqx_persistent_session_ds_state:set_will_message(MaybeWillMsg, S3), - S5 = emqx_persistent_session_ds_state:set_clientinfo(ClientInfo, S4), - S = emqx_persistent_session_ds_state:commit(S5), + S5 = set_clientinfo(ClientInfo, S4), + S6 = emqx_persistent_session_ds_state:set_protocol({ProtoName, ProtoVer}, S5), + S = emqx_persistent_session_ds_state:commit(S6), Inflight = emqx_persistent_session_ds_inflight:new( receive_maximum(NewConnInfo) ), @@ -744,7 +801,9 @@ session_open(SessionId, ClientInfo, NewConnInfo, MaybeWillMsg) -> emqx_session:conf() ) -> session(). 
-session_ensure_new(Id, ClientInfo, ConnInfo, MaybeWillMsg, Conf) -> +session_ensure_new( + Id, ClientInfo, ConnInfo = #{proto_name := ProtoName, proto_ver := ProtoVer}, MaybeWillMsg, Conf +) -> ?tp(debug, persistent_session_ds_ensure_new, #{id => Id}), Now = now_ms(), S0 = emqx_persistent_session_ds_state:create_new(Id), @@ -767,8 +826,9 @@ session_ensure_new(Id, ClientInfo, ConnInfo, MaybeWillMsg, Conf) -> ] ), S5 = emqx_persistent_session_ds_state:set_will_message(MaybeWillMsg, S4), - S6 = emqx_persistent_session_ds_state:set_clientinfo(ClientInfo, S5), - S = emqx_persistent_session_ds_state:commit(S6), + S6 = set_clientinfo(ClientInfo, S5), + S7 = emqx_persistent_session_ds_state:set_protocol({ProtoName, ProtoVer}, S6), + S = emqx_persistent_session_ds_state:commit(S7), #{ id => Id, props => Conf, @@ -779,18 +839,12 @@ session_ensure_new(Id, ClientInfo, ConnInfo, MaybeWillMsg, Conf) -> %% @doc Called when a client reconnects with `clean session=true' or %% during session GC -spec session_drop(id(), _Reason) -> ok. -session_drop(ID, Reason) -> - case emqx_persistent_session_ds_state:open(ID) of +session_drop(SessionId, Reason) -> + case emqx_persistent_session_ds_state:open(SessionId) of {ok, S0} -> - ?tp(debug, drop_persistent_session, #{client_id => ID, reason => Reason}), - _S = emqx_persistent_session_ds_subs:fold( - fun(TopicFilter, Subscription, S) -> - do_unsubscribe(ID, TopicFilter, Subscription, S) - end, - S0, - S0 - ), - emqx_persistent_session_ds_state:delete(ID); + ?tp(debug, drop_persistent_session, #{client_id => SessionId, reason => Reason}), + emqx_persistent_session_ds_subs:on_session_drop(SessionId, S0), + emqx_persistent_session_ds_state:delete(SessionId); undefined -> ok end. @@ -798,6 +852,11 @@ session_drop(ID, Reason) -> now_ms() -> erlang:system_time(millisecond). +set_clientinfo(ClientInfo0, S) -> + %% Remove unnecessary fields from the clientinfo: + ClientInfo = maps:without([cn, dn, auth_result], ClientInfo0), + emqx_persistent_session_ds_state:set_clientinfo(ClientInfo, S). + %%-------------------------------------------------------------------- %% RPC targets (v1) %%-------------------------------------------------------------------- @@ -867,29 +926,38 @@ new_batch({StreamKey, Srs0}, BatchSize, Session0 = #{s := S0}, ClientInfo) -> %% TODO: Handle unrecoverable error. ?SLOG(info, #{ msg => "failed_to_fetch_batch", - stream => Srs1, + stream => StreamKey, reason => Reason, class => Class }), Session0 end. 
-enqueue_batch(IsReplay, BatchSize, Srs0, Session = #{inflight := Inflight0}, ClientInfo) -> +enqueue_batch(IsReplay, BatchSize, Srs0, Session = #{inflight := Inflight0, s := S}, ClientInfo) -> #srs{ it_begin = ItBegin0, it_end = ItEnd0, first_seqno_qos1 = FirstSeqnoQos1, - first_seqno_qos2 = FirstSeqnoQos2 + first_seqno_qos2 = FirstSeqnoQos2, + sub_state_id = SubStateId } = Srs0, ItBegin = case IsReplay of true -> ItBegin0; false -> ItEnd0 end, + SubState = #{} = emqx_persistent_session_ds_state:get_subscription_state(SubStateId, S), case emqx_ds:next(?PERSISTENT_MESSAGE_DB, ItBegin, BatchSize) of {ok, ItEnd, Messages} -> {Inflight, LastSeqnoQos1, LastSeqnoQos2} = process_batch( - IsReplay, Session, ClientInfo, FirstSeqnoQos1, FirstSeqnoQos2, Messages, Inflight0 + IsReplay, + Session, + SubState, + ClientInfo, + FirstSeqnoQos1, + FirstSeqnoQos2, + Messages, + Inflight0 ), Srs = Srs0#srs{ it_begin = ItBegin, @@ -913,27 +981,29 @@ enqueue_batch(IsReplay, BatchSize, Srs0, Session = #{inflight := Inflight0}, Cli %% key_of_iter(#{3 := #{3 := #{5 := K}}}) -> %% K. -process_batch(_IsReplay, _Session, _ClientInfo, LastSeqNoQos1, LastSeqNoQos2, [], Inflight) -> +process_batch( + _IsReplay, _Session, _SubState, _ClientInfo, LastSeqNoQos1, LastSeqNoQos2, [], Inflight +) -> {Inflight, LastSeqNoQos1, LastSeqNoQos2}; process_batch( - IsReplay, Session, ClientInfo, FirstSeqNoQos1, FirstSeqNoQos2, [KV | Messages], Inflight0 + IsReplay, + Session, + SubState, + ClientInfo, + FirstSeqNoQos1, + FirstSeqNoQos2, + [KV | Messages], + Inflight0 ) -> - #{s := S, props := #{upgrade_qos := UpgradeQoS}} = Session, - {_DsMsgKey, Msg0 = #message{topic = Topic}} = KV, + #{s := S} = Session, + #{upgrade_qos := UpgradeQoS, subopts := SubOpts} = SubState, + {_DsMsgKey, Msg0} = KV, Comm1 = emqx_persistent_session_ds_state:get_seqno(?committed(?QOS_1), S), Comm2 = emqx_persistent_session_ds_state:get_seqno(?committed(?QOS_2), S), Dup1 = emqx_persistent_session_ds_state:get_seqno(?dup(?QOS_1), S), Dup2 = emqx_persistent_session_ds_state:get_seqno(?dup(?QOS_2), S), Rec = emqx_persistent_session_ds_state:get_seqno(?rec, S), - Subs = emqx_persistent_session_ds_state:get_subscriptions(S), - Msgs = [ - Msg - || SubMatch <- emqx_topic_gbt:matches(Topic, Subs, []), - Msg <- begin - #{props := SubOpts} = emqx_topic_gbt:get_record(SubMatch, Subs), - emqx_session:enrich_message(ClientInfo, Msg0, SubOpts, UpgradeQoS) - end - ], + Msgs = emqx_session:enrich_message(ClientInfo, Msg0, SubOpts, UpgradeQoS), {Inflight, LastSeqNoQos1, LastSeqNoQos2} = lists:foldl( fun(Msg = #message{qos = Qos}, {Acc, SeqNoQos10, SeqNoQos20}) -> case Qos of @@ -989,14 +1059,16 @@ process_batch( Msgs ), process_batch( - IsReplay, Session, ClientInfo, LastSeqNoQos1, LastSeqNoQos2, Messages, Inflight + IsReplay, Session, SubState, ClientInfo, LastSeqNoQos1, LastSeqNoQos2, Messages, Inflight ). %%-------------------------------------------------------------------- %% Transient messages %%-------------------------------------------------------------------- -enqueue_transient(ClientInfo, Msg0, Session = #{s := S, props := #{upgrade_qos := UpgradeQoS}}) -> +enqueue_transient( + _ClientInfo, Msg = #message{qos = Qos}, Session = #{inflight := Inflight0, s := S0} +) -> %% TODO: Such messages won't be retransmitted, should the session %% reconnect before transient messages are acked. %% @@ -1006,18 +1078,6 @@ enqueue_transient(ClientInfo, Msg0, Session = #{s := S, props := #{upgrade_qos : %% queued messages. 
Since streams in this DB are exclusive to the %% session, messages from the queue can be dropped as soon as they %% are acked. - Subs = emqx_persistent_session_ds_state:get_subscriptions(S), - Msgs = [ - Msg - || SubMatch <- emqx_topic_gbt:matches(Msg0#message.topic, Subs, []), - Msg <- begin - #{props := SubOpts} = emqx_topic_gbt:get_record(SubMatch, Subs), - emqx_session:enrich_message(ClientInfo, Msg0, SubOpts, UpgradeQoS) - end - ], - lists:foldl(fun do_enqueue_transient/2, Session, Msgs). - -do_enqueue_transient(Msg = #message{qos = Qos}, Session = #{inflight := Inflight0, s := S0}) -> case Qos of ?QOS_0 -> S = S0, diff --git a/apps/emqx/src/emqx_persistent_session_ds.hrl b/apps/emqx/src/emqx_persistent_session_ds.hrl index 56862dfa5..79920629a 100644 --- a/apps/emqx/src/emqx_persistent_session_ds.hrl +++ b/apps/emqx/src/emqx_persistent_session_ds.hrl @@ -65,17 +65,21 @@ last_seqno_qos2 = 0 :: emqx_persistent_session_ds:seqno(), %% This stream belongs to an unsubscribed topic-filter, and is %% marked for deletion: - unsubscribed = false :: boolean() + unsubscribed = false :: boolean(), + %% Reference to the subscription state: + sub_state_id :: emqx_persistent_session_ds_subs:subscription_state_id() }). %% Session metadata keys: -define(created_at, created_at). -define(last_alive_at, last_alive_at). -define(expiry_interval, expiry_interval). -%% Unique integer used to create unique identities +%% Unique integer used to create unique identities: -define(last_id, last_id). +%% Connection info (relevant for the dashboard): -define(peername, peername). -define(will_message, will_message). -define(clientinfo, clientinfo). +-define(protocol, protocol). -endif. diff --git a/apps/emqx/src/emqx_persistent_session_ds_state.erl b/apps/emqx/src/emqx_persistent_session_ds_state.erl index 28297964d..9efffc7ff 100644 --- a/apps/emqx/src/emqx_persistent_session_ds_state.erl +++ b/apps/emqx/src/emqx_persistent_session_ds_state.erl @@ -22,6 +22,9 @@ %% It is responsible for saving, caching, and restoring session state. %% It is completely devoid of business logic. Not even the default %% values should be set in this module. +%% +%% Session process MUST NOT use `cold_*' functions! They are reserved +%% for use in the management APIs. -module(emqx_persistent_session_ds_state). -export([create_tables/0]). @@ -33,22 +36,44 @@ -export([get_clientinfo/1, set_clientinfo/2]). -export([get_will_message/1, set_will_message/2, clear_will_message/1, clear_will_message_now/1]). -export([get_peername/1, set_peername/2]). +-export([get_protocol/1, set_protocol/2]). -export([new_id/1]). --export([get_stream/2, put_stream/3, del_stream/2, fold_streams/3]). +-export([get_stream/2, put_stream/3, del_stream/2, fold_streams/3, n_streams/1]). -export([get_seqno/2, put_seqno/3]). -export([get_rank/2, put_rank/3, del_rank/2, fold_ranks/3]). --export([get_subscriptions/1, put_subscription/4, del_subscription/3]). +-export([ + get_subscription_state/2, + cold_get_subscription_state/2, + fold_subscription_states/3, + put_subscription_state/3, + del_subscription_state/2 +]). +-export([ + get_subscription/2, + cold_get_subscription/2, + fold_subscriptions/3, + n_subscriptions/1, + put_subscription/3, + del_subscription/2 +]). +-export([ + get_awaiting_rel/2, + put_awaiting_rel/3, + del_awaiting_rel/2, + fold_awaiting_rel/3, + n_awaiting_rel/1 +]). -export([make_session_iterator/0, session_iterator_next/2]).
-export_type([ t/0, metadata/0, - subscriptions/0, seqno_type/0, stream_key/0, rank_key/0, - session_iterator/0 + session_iterator/0, + protocol/0 ]). -include("emqx_mqtt.hrl"). @@ -62,8 +87,6 @@ -type message() :: emqx_types:message(). --type subscriptions() :: emqx_topic_gbt:t(_SubId, emqx_persistent_session_ds:subscription()). - -opaque session_iterator() :: emqx_persistent_session_ds:id() | '$end_of_table'. %% Generic key-value wrapper that is used for exporting arbitrary @@ -92,13 +115,16 @@ dirty :: #{K => dirty | del} }. +-type protocol() :: {binary(), emqx_types:proto_ver()}. + -type metadata() :: #{ ?created_at => emqx_persistent_session_ds:timestamp(), ?last_alive_at => emqx_persistent_session_ds:timestamp(), ?expiry_interval => non_neg_integer(), ?last_id => integer(), - ?peername => emqx_types:peername() + ?peername => emqx_types:peername(), + ?protocol => protocol() }. -type seqno_type() :: @@ -110,22 +136,49 @@ | ?rec | ?committed(?QOS_2). +-define(id, id). +-define(dirty, dirty). +-define(metadata, metadata). +-define(subscriptions, subscriptions). +-define(subscription_states, subscription_states). +-define(seqnos, seqnos). +-define(streams, streams). +-define(ranks, ranks). +-define(awaiting_rel, awaiting_rel). + -opaque t() :: #{ - id := emqx_persistent_session_ds:id(), - dirty := boolean(), - metadata := metadata(), - subscriptions := subscriptions(), - seqnos := pmap(seqno_type(), emqx_persistent_session_ds:seqno()), - streams := pmap(emqx_ds:stream(), emqx_persistent_session_ds:stream_state()), - ranks := pmap(term(), integer()) + ?id := emqx_persistent_session_ds:id(), + ?dirty := boolean(), + ?metadata := metadata(), + ?subscriptions := pmap( + emqx_persistent_session_ds:topic_filter(), emqx_persistent_session_ds_subs:subscription() + ), + ?subscription_states := pmap( + emqx_persistent_session_ds_subs:subscription_state_id(), + emqx_persistent_session_ds_subs:subscription_state() + ), + ?seqnos := pmap(seqno_type(), emqx_persistent_session_ds:seqno()), + ?streams := pmap(emqx_ds:stream(), emqx_persistent_session_ds:stream_state()), + ?ranks := pmap(term(), integer()), + ?awaiting_rel := pmap(emqx_types:packet_id(), _Timestamp :: integer()) }. -define(session_tab, emqx_ds_session_tab). -define(subscription_tab, emqx_ds_session_subscriptions). +-define(subscription_states_tab, emqx_ds_session_subscription_states). -define(stream_tab, emqx_ds_session_streams). -define(seqno_tab, emqx_ds_session_seqnos). -define(rank_tab, emqx_ds_session_ranks). --define(pmap_tables, [?stream_tab, ?seqno_tab, ?rank_tab, ?subscription_tab]). +-define(awaiting_rel_tab, emqx_ds_session_awaiting_rel). + +-define(pmaps, [ + {?subscriptions, ?subscription_tab}, + {?subscription_states, ?subscription_states_tab}, + {?streams, ?stream_tab}, + {?seqnos, ?seqno_tab}, + {?ranks, ?rank_tab}, + {?awaiting_rel, ?awaiting_rel_tab} +]). %% Enable this flag if you suspect some code breaks the sequence: -ifndef(CHECK_SEQNO). @@ -152,23 +205,25 @@ create_tables() -> {attributes, record_info(fields, kv)} ] ), - [create_kv_pmap_table(Table) || Table <- ?pmap_tables], - mria:wait_for_tables([?session_tab | ?pmap_tables]). + {_, PmapTables} = lists:unzip(?pmaps), + [create_kv_pmap_table(Table) || Table <- PmapTables], + mria:wait_for_tables([?session_tab | PmapTables]). -spec open(emqx_persistent_session_ds:id()) -> {ok, t()} | undefined. 
open(SessionId) -> ro_transaction(fun() -> case kv_restore(?session_tab, SessionId) of [Metadata] -> - Rec = #{ - id => SessionId, - metadata => Metadata, - subscriptions => read_subscriptions(SessionId), - streams => pmap_open(?stream_tab, SessionId), - seqnos => pmap_open(?seqno_tab, SessionId), - ranks => pmap_open(?rank_tab, SessionId), - ?unset_dirty - }, + Rec = update_pmaps( + fun(_Pmap, Table) -> + pmap_open(Table, SessionId) + end, + #{ + id => SessionId, + metadata => Metadata, + ?unset_dirty + } + ), {ok, Rec}; [] -> undefined @@ -185,27 +240,13 @@ print_session(SessionId) -> end. -spec format(t()) -> map(). -format(#{ - metadata := Metadata, - subscriptions := SubsGBT, - streams := Streams, - seqnos := Seqnos, - ranks := Ranks -}) -> - Subs = emqx_topic_gbt:fold( - fun(Key, Sub, Acc) -> - maps:put(emqx_topic_gbt:get_topic(Key), Sub, Acc) +format(Rec) -> + update_pmaps( + fun(Pmap, _Table) -> + pmap_format(Pmap) end, - #{}, - SubsGBT - ), - #{ - metadata => Metadata, - subscriptions => Subs, - streams => pmap_format(Streams), - seqnos => pmap_format(Seqnos), - ranks => pmap_format(Ranks) - }. + maps:without([id, dirty], Rec) + ). -spec list_sessions() -> [emqx_persistent_session_ds:id()]. list_sessions() -> @@ -215,7 +256,7 @@ list_sessions() -> delete(Id) -> transaction( fun() -> - [kv_pmap_delete(Table, Id) || Table <- ?pmap_tables], + [kv_pmap_delete(Table, Id) || {_, Table} <- ?pmaps], mnesia:delete(?session_tab, Id, write) end ). @@ -226,36 +267,34 @@ commit(Rec = #{dirty := false}) -> commit( Rec = #{ id := SessionId, - metadata := Metadata, - streams := Streams, - seqnos := SeqNos, - ranks := Ranks + metadata := Metadata } ) -> check_sequence(Rec), transaction(fun() -> kv_persist(?session_tab, SessionId, Metadata), - Rec#{ - streams => pmap_commit(SessionId, Streams), - seqnos => pmap_commit(SessionId, SeqNos), - ranks => pmap_commit(SessionId, Ranks), - ?unset_dirty - } + update_pmaps( + fun(Pmap, _Table) -> + pmap_commit(SessionId, Pmap) + end, + Rec#{?unset_dirty} + ) end). -spec create_new(emqx_persistent_session_ds:id()) -> t(). create_new(SessionId) -> transaction(fun() -> delete(SessionId), - #{ - id => SessionId, - metadata => #{}, - subscriptions => emqx_topic_gbt:new(), - streams => pmap_open(?stream_tab, SessionId), - seqnos => pmap_open(?seqno_tab, SessionId), - ranks => pmap_open(?rank_tab, SessionId), - ?set_dirty - } + update_pmaps( + fun(_Pmap, Table) -> + pmap_open(Table, SessionId) + end, + #{ + id => SessionId, + metadata => #{}, + ?set_dirty + } + ) end). %% @@ -292,6 +331,14 @@ get_peername(Rec) -> set_peername(Val, Rec) -> set_meta(?peername, Val, Rec). +-spec get_protocol(t()) -> protocol() | undefined. +get_protocol(Rec) -> + get_meta(?protocol, Rec). + +-spec set_protocol(protocol(), t()) -> t(). +set_protocol(Val, Rec) -> + set_meta(?protocol, Val, Rec). + -spec get_clientinfo(t()) -> emqx_maybe:t(emqx_types:clientinfo()). get_clientinfo(Rec) -> get_meta(?clientinfo, Rec). @@ -336,30 +383,65 @@ new_id(Rec) -> %% --spec get_subscriptions(t()) -> subscriptions(). -get_subscriptions(#{subscriptions := Subs}) -> - Subs. +-spec get_subscription(emqx_persistent_session_ds:topic_filter(), t()) -> + emqx_persistent_session_ds_subs:subscription() | undefined. +get_subscription(TopicFilter, Rec) -> + gen_get(?subscriptions, TopicFilter, Rec). + +-spec cold_get_subscription(emqx_persistent_session_ds:id(), emqx_types:topic()) -> + [emqx_persistent_session_ds_subs:subscription()]. 
+cold_get_subscription(SessionId, Topic) -> + kv_pmap_read(?subscription_tab, SessionId, Topic). + +-spec fold_subscriptions(fun(), Acc, t()) -> Acc. +fold_subscriptions(Fun, Acc, Rec) -> + gen_fold(?subscriptions, Fun, Acc, Rec). + +-spec n_subscriptions(t()) -> non_neg_integer(). +n_subscriptions(Rec) -> + gen_size(?subscriptions, Rec). -spec put_subscription( emqx_persistent_session_ds:topic_filter(), - _SubId, - emqx_persistent_session_ds:subscription(), + emqx_persistent_session_ds_subs:subscription(), t() ) -> t(). -put_subscription(TopicFilter, SubId, Subscription, Rec = #{id := Id, subscriptions := Subs0}) -> - %% Note: currently changes to the subscriptions are persisted immediately. - Key = {TopicFilter, SubId}, - transaction(fun() -> kv_pmap_persist(?subscription_tab, Id, Key, Subscription) end), - Subs = emqx_topic_gbt:insert(TopicFilter, SubId, Subscription, Subs0), - Rec#{subscriptions => Subs}. +put_subscription(TopicFilter, Subscription, Rec) -> + gen_put(?subscriptions, TopicFilter, Subscription, Rec). --spec del_subscription(emqx_persistent_session_ds:topic_filter(), _SubId, t()) -> t(). -del_subscription(TopicFilter, SubId, Rec = #{id := Id, subscriptions := Subs0}) -> - %% Note: currently the subscriptions are persisted immediately. - Key = {TopicFilter, SubId}, - transaction(fun() -> kv_pmap_delete(?subscription_tab, Id, Key) end), - Subs = emqx_topic_gbt:delete(TopicFilter, SubId, Subs0), - Rec#{subscriptions => Subs}. +-spec del_subscription(emqx_persistent_session_ds:topic_filter(), t()) -> t(). +del_subscription(TopicFilter, Rec) -> + gen_del(?subscriptions, TopicFilter, Rec). + +%% + +-spec get_subscription_state(emqx_persistent_session_ds_subs:subscription_state_id(), t()) -> + emqx_persistent_session_ds_subs:subscription_state() | undefined. +get_subscription_state(SStateId, Rec) -> + gen_get(?subscription_states, SStateId, Rec). + +-spec cold_get_subscription_state( + emqx_persistent_session_ds:id(), emqx_persistent_session_ds_subs:subscription_state_id() +) -> + [emqx_persistent_session_ds_subs:subscription_state()]. +cold_get_subscription_state(SessionId, SStateId) -> + kv_pmap_read(?subscription_states_tab, SessionId, SStateId). + +-spec fold_subscription_states(fun(), Acc, t()) -> Acc. +fold_subscription_states(Fun, Acc, Rec) -> + gen_fold(?subscription_states, Fun, Acc, Rec). + +-spec put_subscription_state( + emqx_persistent_session_ds_subs:subscription_state_id(), + emqx_persistent_session_ds_subs:subscription_state(), + t() +) -> t(). +put_subscription_state(SStateId, SState, Rec) -> + gen_put(?subscription_states, SStateId, SState, Rec). + +-spec del_subscription_state(emqx_persistent_session_ds_subs:subscription_state_id(), t()) -> t(). +del_subscription_state(SStateId, Rec) -> + gen_del(?subscription_states, SStateId, Rec). %% @@ -368,29 +450,33 @@ del_subscription(TopicFilter, SubId, Rec = #{id := Id, subscriptions := Subs0}) -spec get_stream(stream_key(), t()) -> emqx_persistent_session_ds:stream_state() | undefined. get_stream(Key, Rec) -> - gen_get(streams, Key, Rec). + gen_get(?streams, Key, Rec). -spec put_stream(stream_key(), emqx_persistent_session_ds:stream_state(), t()) -> t(). put_stream(Key, Val, Rec) -> - gen_put(streams, Key, Val, Rec). + gen_put(?streams, Key, Val, Rec). -spec del_stream(stream_key(), t()) -> t(). del_stream(Key, Rec) -> - gen_del(streams, Key, Rec). + gen_del(?streams, Key, Rec). -spec fold_streams(fun(), Acc, t()) -> Acc. fold_streams(Fun, Acc, Rec) -> - gen_fold(streams, Fun, Acc, Rec). 
+ gen_fold(?streams, Fun, Acc, Rec). + +-spec n_streams(t()) -> non_neg_integer(). +n_streams(Rec) -> + gen_size(?streams, Rec). %% -spec get_seqno(seqno_type(), t()) -> emqx_persistent_session_ds:seqno() | undefined. get_seqno(Key, Rec) -> - gen_get(seqnos, Key, Rec). + gen_get(?seqnos, Key, Rec). -spec put_seqno(seqno_type(), emqx_persistent_session_ds:seqno(), t()) -> t(). put_seqno(Key, Val, Rec) -> - gen_put(seqnos, Key, Val, Rec). + gen_put(?seqnos, Key, Val, Rec). %% @@ -398,19 +484,43 @@ put_seqno(Key, Val, Rec) -> -spec get_rank(rank_key(), t()) -> integer() | undefined. get_rank(Key, Rec) -> - gen_get(ranks, Key, Rec). + gen_get(?ranks, Key, Rec). -spec put_rank(rank_key(), integer(), t()) -> t(). put_rank(Key, Val, Rec) -> - gen_put(ranks, Key, Val, Rec). + gen_put(?ranks, Key, Val, Rec). -spec del_rank(rank_key(), t()) -> t(). del_rank(Key, Rec) -> - gen_del(ranks, Key, Rec). + gen_del(?ranks, Key, Rec). -spec fold_ranks(fun(), Acc, t()) -> Acc. fold_ranks(Fun, Acc, Rec) -> - gen_fold(ranks, Fun, Acc, Rec). + gen_fold(?ranks, Fun, Acc, Rec). + +%% + +-spec get_awaiting_rel(emqx_types:packet_id(), t()) -> integer() | undefined. +get_awaiting_rel(Key, Rec) -> + gen_get(?awaiting_rel, Key, Rec). + +-spec put_awaiting_rel(emqx_types:packet_id(), _Timestamp :: integer(), t()) -> t(). +put_awaiting_rel(Key, Val, Rec) -> + gen_put(?awaiting_rel, Key, Val, Rec). + +-spec del_awaiting_rel(emqx_types:packet_id(), t()) -> t(). +del_awaiting_rel(Key, Rec) -> + gen_del(?awaiting_rel, Key, Rec). + +-spec fold_awaiting_rel(fun(), Acc, t()) -> Acc. +fold_awaiting_rel(Fun, Acc, Rec) -> + gen_fold(?awaiting_rel, Fun, Acc, Rec). + +-spec n_awaiting_rel(t()) -> non_neg_integer(). +n_awaiting_rel(Rec) -> + gen_size(?awaiting_rel, Rec). + +%% -spec make_session_iterator() -> session_iterator(). make_session_iterator() -> @@ -475,16 +585,20 @@ gen_del(Field, Key, Rec) -> Rec#{?set_dirty} ). -%% +gen_size(Field, Rec) -> + check_sequence(Rec), + pmap_size(maps:get(Field, Rec)). -read_subscriptions(SessionId) -> - Records = kv_pmap_restore(?subscription_tab, SessionId), +-spec update_pmaps(fun((pmap(_K, _V) | undefined, atom()) -> term()), map()) -> map(). +update_pmaps(Fun, Map) -> lists:foldl( - fun({{TopicFilter, SubId}, Subscription}, Acc) -> - emqx_topic_gbt:insert(TopicFilter, SubId, Subscription, Acc) + fun({MapKey, Table}, Acc) -> + OldVal = maps:get(MapKey, Map, undefined), + Val = Fun(OldVal, Table), + maps:put(MapKey, Val, Acc) end, - emqx_topic_gbt:new(), - Records + Map, + ?pmaps ). %% @@ -547,6 +661,10 @@ pmap_commit( pmap_format(#pmap{cache = Cache}) -> Cache. +-spec pmap_size(pmap(_K, _V)) -> non_neg_integer(). +pmap_size(#pmap{cache = Cache}) -> + maps:size(Cache). + %% Functions dealing with set tables: kv_persist(Tab, SessionId, Val0) -> @@ -574,6 +692,14 @@ kv_pmap_persist(Tab, SessionId, Key, Val0) -> Val = encoder(encode, Tab, Val0), mnesia:write(Tab, #kv{k = {SessionId, Key}, v = Val}, write). +kv_pmap_read(Table, SessionId, Key) -> + lists:map( + fun(#kv{v = Val}) -> + encoder(decode, Table, Val) + end, + mnesia:dirty_read(Table, {SessionId, Key}) + ). 
+ kv_pmap_restore(Table, SessionId) -> MS = [{#kv{k = {SessionId, '$1'}, v = '$2'}, [], [{{'$1', '$2'}}]}], Objs = mnesia:select(Table, MS, read), diff --git a/apps/emqx/src/emqx_persistent_session_ds_stream_scheduler.erl b/apps/emqx/src/emqx_persistent_session_ds_stream_scheduler.erl index 286d32ef4..1be0bdf4a 100644 --- a/apps/emqx/src/emqx_persistent_session_ds_stream_scheduler.erl +++ b/apps/emqx/src/emqx_persistent_session_ds_stream_scheduler.erl @@ -126,9 +126,10 @@ find_new_streams(S) -> renew_streams(S0) -> S1 = remove_unsubscribed_streams(S0), S2 = remove_fully_replayed_streams(S1), + S3 = update_stream_subscription_state_ids(S2), emqx_persistent_session_ds_subs:fold( fun - (Key, #{start_time := StartTime, id := SubId, deleted := false}, Acc) -> + (Key, #{start_time := StartTime, id := SubId, current_state := SStateId}, Acc) -> TopicFilter = emqx_topic:words(Key), Streams = select_streams( SubId, @@ -137,7 +138,7 @@ renew_streams(S0) -> ), lists:foldl( fun(I, Acc1) -> - ensure_iterator(TopicFilter, StartTime, SubId, I, Acc1) + ensure_iterator(TopicFilter, StartTime, SubId, SStateId, I, Acc1) end, Acc, Streams @@ -145,8 +146,8 @@ renew_streams(S0) -> (_Key, _DeletedSubscription, Acc) -> Acc end, - S2, - S2 + S3, + S3 ). -spec on_unsubscribe( @@ -201,23 +202,32 @@ is_fully_acked(Srs, S) -> %% Internal functions %%================================================================================ -ensure_iterator(TopicFilter, StartTime, SubId, {{RankX, RankY}, Stream}, S) -> +ensure_iterator(TopicFilter, StartTime, SubId, SStateId, {{RankX, RankY}, Stream}, S) -> Key = {SubId, Stream}, case emqx_persistent_session_ds_state:get_stream(Key, S) of undefined -> ?SLOG(debug, #{ msg => new_stream, key => Key, stream => Stream }), - {ok, Iterator} = emqx_ds:make_iterator( - ?PERSISTENT_MESSAGE_DB, Stream, TopicFilter, StartTime - ), - NewStreamState = #srs{ - rank_x = RankX, - rank_y = RankY, - it_begin = Iterator, - it_end = Iterator - }, - emqx_persistent_session_ds_state:put_stream(Key, NewStreamState, S); + case emqx_ds:make_iterator(?PERSISTENT_MESSAGE_DB, Stream, TopicFilter, StartTime) of + {ok, Iterator} -> + NewStreamState = #srs{ + rank_x = RankX, + rank_y = RankY, + it_begin = Iterator, + it_end = Iterator, + sub_state_id = SStateId + }, + emqx_persistent_session_ds_state:put_stream(Key, NewStreamState, S); + {error, recoverable, Reason} -> + ?SLOG(warning, #{ + msg => "failed_to_initialize_stream_iterator", + stream => Stream, + class => recoverable, + reason => Reason + }), + S + end; #srs{} -> S end. @@ -342,6 +352,38 @@ remove_fully_replayed_streams(S0) -> S1 ). +%% @doc Update subscription state IDs for all streams that don't have unacked messages +-spec update_stream_subscription_state_ids(emqx_persistent_session_ds_state:t()) -> + emqx_persistent_session_ds_state:t(). 
+update_stream_subscription_state_ids(S0) -> + CommQos1 = emqx_persistent_session_ds_state:get_seqno(?committed(?QOS_1), S0), + CommQos2 = emqx_persistent_session_ds_state:get_seqno(?committed(?QOS_2), S0), + %% Find the latest state IDs for each subscription: + LastSStateIds = emqx_persistent_session_ds_state:fold_subscriptions( + fun(_, #{id := SubId, current_state := SStateId}, Acc) -> + Acc#{SubId => SStateId} + end, + #{}, + S0 + ), + %% Update subscription state IDs for fully acked streams: + emqx_persistent_session_ds_state:fold_streams( + fun + (_, #srs{unsubscribed = true}, S) -> + S; + (Key = {SubId, _Stream}, SRS0, S) -> + case is_fully_acked(CommQos1, CommQos2, SRS0) of + true -> + SRS = SRS0#srs{sub_state_id = maps:get(SubId, LastSStateIds)}, + emqx_persistent_session_ds_state:put_stream(Key, SRS, S); + false -> + S + end + end, + S0, + S0 + ). + %% @doc Compare the streams by the order in which they were replayed. compare_streams( {_KeyA, #srs{first_seqno_qos1 = A1, first_seqno_qos2 = A2}}, diff --git a/apps/emqx/src/emqx_persistent_session_ds_subs.erl b/apps/emqx/src/emqx_persistent_session_ds_subs.erl index 92f17b108..8b4f70a69 100644 --- a/apps/emqx/src/emqx_persistent_session_ds_subs.erl +++ b/apps/emqx/src/emqx_persistent_session_ds_subs.erl @@ -24,14 +24,56 @@ -module(emqx_persistent_session_ds_subs). %% API: --export([on_subscribe/3, on_unsubscribe/3, gc/1, lookup/2, to_map/1, fold/3, fold_all/3]). +-export([ + on_subscribe/3, + on_unsubscribe/3, + on_session_drop/2, + gc/1, + lookup/2, + to_map/1, + fold/3 +]). --export_type([]). +%% Management API: +-export([ + cold_get_subscription/2 +]). + +-export_type([subscription_state_id/0, subscription/0, subscription_state/0]). + +-include("emqx_persistent_session_ds.hrl"). +-include("emqx_mqtt.hrl"). +-include_lib("snabbkaffe/include/trace.hrl"). %%================================================================================ %% Type declarations %%================================================================================ +-type subscription() :: #{ + %% Session-unique identifier of the subscription. Other objects + %% can use it as a compact reference: + id := emqx_persistent_session_ds:subscription_id(), + %% Reference to the current subscription state: + current_state := subscription_state_id(), + %% Time when the subscription was added: + start_time := emqx_ds:time() +}. + +-type subscription_state_id() :: integer(). + +-type subscription_state() :: #{ + parent_subscription := emqx_persistent_session_ds:subscription_id(), + upgrade_qos := boolean(), + %% SubOpts: + subopts := #{ + nl => _, + qos => _, + rap => _, + subid => _, + _ => _ + } +}. + %%================================================================================ %% API functions %%================================================================================ @@ -39,41 +81,131 @@ %% @doc Process a new subscription -spec on_subscribe( emqx_persistent_session_ds:topic_filter(), - emqx_persistent_session_ds:subscription(), - emqx_persistent_session_ds_state:t() + emqx_types:subopts(), + emqx_persistent_session_ds:session() ) -> - emqx_persistent_session_ds_state:t(). -on_subscribe(TopicFilter, Subscription, S) -> - emqx_persistent_session_ds_state:put_subscription(TopicFilter, [], Subscription, S). + {ok, emqx_persistent_session_ds_state:t()} | {error, ?RC_QUOTA_EXCEEDED}. 
+on_subscribe(TopicFilter, SubOpts, #{id := SessionId, s := S0, props := Props}) ->
+    #{upgrade_qos := UpgradeQoS, max_subscriptions := MaxSubscriptions} = Props,
+    case emqx_persistent_session_ds_state:get_subscription(TopicFilter, S0) of
+        undefined ->
+            %% This is a new subscription:
+            case emqx_persistent_session_ds_state:n_subscriptions(S0) < MaxSubscriptions of
+                true ->
+                    ok = emqx_persistent_session_ds_router:do_add_route(TopicFilter, SessionId),
+                    {SubId, S1} = emqx_persistent_session_ds_state:new_id(S0),
+                    {SStateId, S2} = emqx_persistent_session_ds_state:new_id(S1),
+                    SState = #{
+                        parent_subscription => SubId, upgrade_qos => UpgradeQoS, subopts => SubOpts
+                    },
+                    S3 = emqx_persistent_session_ds_state:put_subscription_state(
+                        SStateId, SState, S2
+                    ),
+                    Subscription = #{
+                        id => SubId,
+                        current_state => SStateId,
+                        start_time => now_ms()
+                    },
+                    S = emqx_persistent_session_ds_state:put_subscription(
+                        TopicFilter, Subscription, S3
+                    ),
+                    ?tp(persistent_session_ds_subscription_added, #{
+                        topic_filter => TopicFilter, session => SessionId
+                    }),
+                    {ok, S};
+                false ->
+                    {error, ?RC_QUOTA_EXCEEDED}
+            end;
+        Sub0 = #{current_state := SStateId0, id := SubId} ->
+            SState = #{parent_subscription => SubId, upgrade_qos => UpgradeQoS, subopts => SubOpts},
+            case emqx_persistent_session_ds_state:get_subscription_state(SStateId0, S0) of
+                SState ->
+                    %% Client resubscribed with the same parameters:
+                    {ok, S0};
+                _ ->
+                    %% Subscription parameters changed:
+                    {SStateId, S1} = emqx_persistent_session_ds_state:new_id(S0),
+                    S2 = emqx_persistent_session_ds_state:put_subscription_state(
+                        SStateId, SState, S1
+                    ),
+                    Sub = Sub0#{current_state => SStateId},
+                    S = emqx_persistent_session_ds_state:put_subscription(TopicFilter, Sub, S2),
+                    {ok, S}
+            end
+    end.
 
 %% @doc Process UNSUBSCRIBE
 -spec on_unsubscribe(
+    emqx_persistent_session_ds:id(),
     emqx_persistent_session_ds:topic_filter(),
-    emqx_persistent_session_ds:subscription(),
     emqx_persistent_session_ds_state:t()
 ) ->
-    emqx_persistent_session_ds_state:t().
-on_unsubscribe(TopicFilter, Subscription0, S0) ->
-    %% Note: we cannot delete the subscription immediately, since its
-    %% metadata can be used during replay (see `process_batch'). We
-    %% instead mark it as deleted, and let `subscription_gc' function
-    %% dispatch it later:
-    Subscription = Subscription0#{deleted => true},
-    emqx_persistent_session_ds_state:put_subscription(TopicFilter, [], Subscription, S0).
+    {ok, emqx_persistent_session_ds_state:t(), emqx_persistent_session_ds:subscription()}
+    | {error, ?RC_NO_SUBSCRIPTION_EXISTED}.
+on_unsubscribe(SessionId, TopicFilter, S0) ->
+    case lookup(TopicFilter, S0) of
+        undefined ->
+            {error, ?RC_NO_SUBSCRIPTION_EXISTED};
+        Subscription ->
+            ?tp(persistent_session_ds_subscription_delete, #{
+                session_id => SessionId, topic_filter => TopicFilter
+            }),
+            ?tp_span(
+                persistent_session_ds_subscription_route_delete,
+                #{session_id => SessionId, topic_filter => TopicFilter},
+                ok = emqx_persistent_session_ds_router:do_delete_route(TopicFilter, SessionId)
+            ),
+            {ok, emqx_persistent_session_ds_state:del_subscription(TopicFilter, S0), Subscription}
+    end.
 
-%% @doc Remove subscriptions that have been marked for deletion, and
-%% that don't have any unacked messages:
+-spec on_session_drop(emqx_persistent_session_ds:id(), emqx_persistent_session_ds_state:t()) -> ok.
+on_session_drop(SessionId, S0) ->
+    fold(
+        fun(TopicFilter, _Subscription, S) ->
+            case on_unsubscribe(SessionId, TopicFilter, S) of
+                {ok, S1, _} -> S1;
+                _ -> S
+            end
+        end,
+        S0,
+        S0
+    ).
+
+%% @doc Remove subscription states that don't have a parent, and that
+%% don't have any unacked messages:
 -spec gc(emqx_persistent_session_ds_state:t()) -> emqx_persistent_session_ds_state:t().
 gc(S0) ->
-    fold_all(
-        fun(TopicFilter, #{id := SubId, deleted := Deleted}, Acc) ->
-            case Deleted andalso has_no_unacked_streams(SubId, S0) of
-                true ->
-                    emqx_persistent_session_ds_state:del_subscription(TopicFilter, [], Acc);
+    %% Create a set of subscription state IDs referenced either by a
+    %% subscription or a stream replay state:
+    AliveSet0 = emqx_persistent_session_ds_state:fold_subscriptions(
+        fun(_TopicFilter, #{current_state := SStateId}, Acc) ->
+            Acc#{SStateId => true}
+        end,
+        #{},
+        S0
+    ),
+    AliveSet = emqx_persistent_session_ds_state:fold_streams(
+        fun(_StreamId, SRS = #srs{sub_state_id = SStateId}, Acc) ->
+            case emqx_persistent_session_ds_stream_scheduler:is_fully_acked(SRS, S0) of
                 false ->
+                    Acc#{SStateId => true};
+                true ->
                     Acc
             end
         end,
+        AliveSet0,
+        S0
+    ),
+    %% Delete dangling subscription states:
+    emqx_persistent_session_ds_state:fold_subscription_states(
+        fun(SStateId, _, S) ->
+            case maps:is_key(SStateId, AliveSet) of
+                true ->
+                    S;
+                false ->
+                    emqx_persistent_session_ds_state:del_subscription_state(SStateId, S)
+            end
+        end,
         S0,
         S0
     ).
@@ -82,12 +214,16 @@ gc(S0) ->
 -spec lookup(emqx_persistent_session_ds:topic_filter(), emqx_persistent_session_ds_state:t()) ->
     emqx_persistent_session_ds:subscription() | undefined.
 lookup(TopicFilter, S) ->
-    Subs = emqx_persistent_session_ds_state:get_subscriptions(S),
-    case emqx_topic_gbt:lookup(TopicFilter, [], Subs, undefined) of
-        #{deleted := true} ->
-            undefined;
-        Sub ->
-            Sub
+    case emqx_persistent_session_ds_state:get_subscription(TopicFilter, S) of
+        Sub = #{current_state := SStateId} ->
+            case emqx_persistent_session_ds_state:get_subscription_state(SStateId, S) of
+                #{subopts := SubOpts} ->
+                    Sub#{subopts => SubOpts};
+                undefined ->
+                    undefined
+            end;
+        undefined ->
+            undefined
     end.
 
 %% @doc Convert active subscriptions to a map, for information
@@ -95,7 +231,7 @@ to_map(S) ->
 -spec to_map(emqx_persistent_session_ds_state:t()) -> map().
 to_map(S) ->
     fold(
-        fun(TopicFilter, #{props := Props}, Acc) -> Acc#{TopicFilter => Props} end,
+        fun(TopicFilter, _, Acc) -> Acc#{TopicFilter => lookup(TopicFilter, S)} end,
         #{},
         S
     ).
@@ -107,48 +243,29 @@ to_map(S) ->
     emqx_persistent_session_ds_state:t()
 ) ->
     Acc.
-fold(Fun, AccIn, S) ->
-    fold_all(
-        fun(TopicFilter, Sub = #{deleted := Deleted}, Acc) ->
-            case Deleted of
-                true -> Acc;
-                false -> Fun(TopicFilter, Sub, Acc)
-            end
-        end,
-        AccIn,
-        S
-    ).
+fold(Fun, Acc, S) ->
+    emqx_persistent_session_ds_state:fold_subscriptions(Fun, Acc, S).
 
-%% @doc Fold over all subscriptions, including inactive ones:
--spec fold_all(
-    fun((emqx_types:topic(), emqx_persistent_session_ds:subscription(), Acc) -> Acc),
-    Acc,
-    emqx_persistent_session_ds_state:t()
-) ->
-    Acc.
-fold_all(Fun, AccIn, S) ->
-    Subs = emqx_persistent_session_ds_state:get_subscriptions(S),
-    emqx_topic_gbt:fold(
-        fun(Key, Sub, Acc) -> Fun(emqx_topic_gbt:get_topic(Key), Sub, Acc) end,
-        AccIn,
-        Subs
-    ).
+-spec cold_get_subscription(emqx_persistent_session_ds:id(), emqx_types:topic()) ->
+    emqx_persistent_session_ds:subscription() | undefined.
+cold_get_subscription(SessionId, Topic) -> + case emqx_persistent_session_ds_state:cold_get_subscription(SessionId, Topic) of + [Sub = #{current_state := SStateId}] -> + case + emqx_persistent_session_ds_state:cold_get_subscription_state(SessionId, SStateId) + of + [#{subopts := Subopts}] -> + Sub#{subopts => Subopts}; + _ -> + undefined + end; + _ -> + undefined + end. %%================================================================================ %% Internal functions %%================================================================================ --spec has_no_unacked_streams( - emqx_persistent_session_ds:subscription_id(), emqx_persistent_session_ds_state:t() -) -> boolean(). -has_no_unacked_streams(SubId, S) -> - emqx_persistent_session_ds_state:fold_streams( - fun - ({SID, _Stream}, Srs, Acc) when SID =:= SubId -> - emqx_persistent_session_ds_stream_scheduler:is_fully_acked(Srs, S) andalso Acc; - (_StreamKey, _Srs, Acc) -> - Acc - end, - true, - S - ). +now_ms() -> + erlang:system_time(millisecond). diff --git a/apps/emqx/src/emqx_schema.erl b/apps/emqx/src/emqx_schema.erl index 427df5db0..02e31387e 100644 --- a/apps/emqx/src/emqx_schema.erl +++ b/apps/emqx/src/emqx_schema.erl @@ -61,6 +61,8 @@ }. -type url() :: binary(). -type json_binary() :: binary(). +-type template() :: binary(). +-type template_str() :: string(). -typerefl_from_string({duration/0, emqx_schema, to_duration}). -typerefl_from_string({duration_s/0, emqx_schema, to_duration_s}). @@ -78,6 +80,8 @@ -typerefl_from_string({comma_separated_atoms/0, emqx_schema, to_comma_separated_atoms}). -typerefl_from_string({url/0, emqx_schema, to_url}). -typerefl_from_string({json_binary/0, emqx_schema, to_json_binary}). +-typerefl_from_string({template/0, emqx_schema, to_template}). +-typerefl_from_string({template_str/0, emqx_schema, to_template_str}). -type parsed_server() :: #{ hostname := string(), @@ -120,7 +124,9 @@ to_erl_cipher_suite/1, to_comma_separated_atoms/1, to_url/1, - to_json_binary/1 + to_json_binary/1, + to_template/1, + to_template_str/1 ]). -export([ @@ -160,7 +166,9 @@ comma_separated_atoms/0, url/0, json_binary/0, - port_number/0 + port_number/0, + template/0, + template_str/0 ]). -export([namespace/0, roots/0, roots/1, fields/1, desc/1, tags/0]). @@ -1734,20 +1742,38 @@ fields(durable_storage) -> emqx_ds_schema:schema(); fields("client_attrs_init") -> [ - {extract_from, + {expression, sc( - hoconsc:enum([clientid, username, cn, dn, user_property]), - #{desc => ?DESC("client_attrs_init_extract_from")} + typerefl:alias("string", any()), + #{ + desc => ?DESC("client_attrs_init_expression"), + converter => fun compile_variform/2 + } )}, - {extract_regexp, sc(binary(), #{desc => ?DESC("client_attrs_init_extract_regexp")})}, - {extract_as, + {set_as_attr, sc(binary(), #{ - default => <<"alias">>, - desc => ?DESC("client_attrs_init_extract_as"), + desc => ?DESC("client_attrs_init_set_as_attr"), validator => fun restricted_string/1 })} ]. +compile_variform(undefined, _Opts) -> + undefined; +compile_variform(Expression, #{make_serializable := true}) -> + case is_binary(Expression) of + true -> + Expression; + false -> + emqx_variform:decompile(Expression) + end; +compile_variform(Expression, _Opts) -> + case emqx_variform:compile(Expression) of + {ok, Compiled} -> + Compiled; + {error, Reason} -> + throw(#{expression => Expression, reason => Reason}) + end. + restricted_string(Str) -> case emqx_utils:is_restricted_str(Str) of true -> ok; @@ -2576,6 +2602,12 @@ to_json_binary(Str) -> Error end. 
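Note (reviewer sketch, not part of the patch): the compiled client_attrs_init
expression above is an emqx_variform program; assuming the module's render/2
API, the end-to-end flow is roughly:

    {ok, Expr} = emqx_variform:compile("substr(clientid,0,2)"),
    %% Rendering against a client with clientid <<"abcdef">> should yield the
    %% first two characters:
    {ok, <<"ab">>} = emqx_variform:render(Expr, #{clientid => <<"abcdef">>}).
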
+to_template(Str) -> + {ok, iolist_to_binary(Str)}. + +to_template_str(Str) -> + {ok, unicode:characters_to_list(Str, utf8)}. + %% @doc support the following format: %% - 127.0.0.1:1883 %% - ::1:1883 @@ -3552,9 +3584,9 @@ mqtt_general() -> )}, {"client_attrs_init", sc( - hoconsc:union([disabled, ref("client_attrs_init")]), + hoconsc:array(ref("client_attrs_init")), #{ - default => disabled, + default => [], desc => ?DESC("client_attrs_init") } )} diff --git a/apps/emqx/src/emqx_session.erl b/apps/emqx/src/emqx_session.erl index 37a86bda6..3892740a6 100644 --- a/apps/emqx/src/emqx_session.erl +++ b/apps/emqx/src/emqx_session.erl @@ -429,6 +429,11 @@ enrich_deliver(ClientInfo, {deliver, Topic, Msg}, UpgradeQoS, Session) -> end, enrich_message(ClientInfo, Msg, SubOpts, UpgradeQoS). +%% Caution: updating this function _may_ break consistency of replay +%% for persistent sessions. Persistent sessions expect it to return +%% the same result during replay. If it changes the behavior between +%% releases, sessions restored from the cold storage may end up +%% replaying messages with different QoS, etc. enrich_message( ClientInfo = #{clientid := ClientId}, Msg = #message{from = ClientId}, diff --git a/apps/emqx/src/emqx_sys.erl b/apps/emqx/src/emqx_sys.erl index cc8eec3af..f50e23235 100644 --- a/apps/emqx/src/emqx_sys.erl +++ b/apps/emqx/src/emqx_sys.erl @@ -22,6 +22,7 @@ -include("types.hrl"). -include("logger.hrl"). -include("emqx_hooks.hrl"). +-include("emqx_mqtt.hrl"). -export([ start_link/0, @@ -279,7 +280,7 @@ on_client_subscribed( clientid => ClientId, username => Username, protocol => Protocol, - topic => Topic, + topic => emqx_topic:maybe_format_share(Topic), subopts => SubOpts, ts => erlang:system_time(millisecond) }, @@ -298,7 +299,7 @@ on_client_unsubscribed( clientid => ClientId, username => Username, protocol => Protocol, - topic => Topic, + topic => emqx_topic:maybe_format_share(Topic), ts => erlang:system_time(millisecond) }, publish(unsubscribed, Payload). diff --git a/apps/emqx/src/emqx_trace/emqx_trace.erl b/apps/emqx/src/emqx_trace/emqx_trace.erl index f3a5be084..7bbe59b2b 100644 --- a/apps/emqx/src/emqx_trace/emqx_trace.erl +++ b/apps/emqx/src/emqx_trace/emqx_trace.erl @@ -28,7 +28,8 @@ subscribe/3, unsubscribe/2, log/3, - log/4 + log/4, + rendered_action_template/2 ]). -export([ @@ -66,6 +67,9 @@ -export_type([ip_address/0]). -type ip_address() :: string(). +-export_type([ruleid/0]). +-type ruleid() :: binary(). + publish(#message{topic = <<"$SYS/", _/binary>>}) -> ignore; publish(#message{from = From, topic = Topic, payload = Payload}) when @@ -83,6 +87,32 @@ unsubscribe(<<"$SYS/", _/binary>>, _SubOpts) -> unsubscribe(Topic, SubOpts) -> ?TRACE("UNSUBSCRIBE", "unsubscribe", #{topic => Topic, sub_opts => SubOpts}). +rendered_action_template(ActionID, RenderResult) -> + TraceResult = ?TRACE( + "QUERY_RENDER", + "action_template_rendered", + #{ + result => RenderResult, + action_id => ActionID + } + ), + case logger:get_process_metadata() of + #{stop_action_after_render := true} -> + %% We throw an unrecoverable error to stop action before the + %% resource is called/modified + StopMsg = lists:flatten( + io_lib:format( + "Action ~ts stopped after template rendering due to test setting.", + [ActionID] + ) + ), + MsgBin = unicode:characters_to_binary(StopMsg), + error({unrecoverable_error, {action_stopped_after_template_rendering, MsgBin}}); + _ -> + ok + end, + TraceResult. + log(List, Msg, Meta) -> log(debug, List, Msg, Meta). 
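Note (reviewer sketch, not part of the patch): rendered_action_template/2 reads
the stop_action_after_render flag from the caller's logger process metadata, so
a test can arm it with the standard OTP call and catch the resulting error; the
wrapper name here is made up:

    render_only(ActionID, RenderResult) ->
        logger:update_process_metadata(#{stop_action_after_render => true}),
        try
            emqx_trace:rendered_action_template(ActionID, RenderResult)
        catch
            error:{unrecoverable_error, {action_stopped_after_template_rendering, Msg}} ->
                {stopped, Msg}
        end.
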
@@ -159,8 +189,10 @@ create(Trace) -> case mnesia:table_info(?TRACE, size) < ?MAX_SIZE of true -> case to_trace(Trace) of - {ok, TraceRec} -> insert_new_trace(TraceRec); - {error, Reason} -> {error, Reason} + {ok, TraceRec} -> + insert_new_trace(TraceRec); + {error, Reason} -> + {error, Reason} end; false -> {error, @@ -222,7 +254,11 @@ format(Traces) -> lists:map( fun(Trace0 = #?TRACE{}) -> [_ | Values] = tuple_to_list(Trace0), - maps:from_list(lists:zip(Fields, Values)) + Map0 = maps:from_list(lists:zip(Fields, Values)), + Extra = maps:get(extra, Map0, #{}), + Formatter = maps:get(formatter, Extra, text), + Map1 = Map0#{formatter => Formatter}, + maps:remove(extra, Map1) end, Traces ). @@ -368,9 +404,17 @@ start_trace(Trace) -> type = Type, filter = Filter, start_at = Start, - payload_encode = PayloadEncode + payload_encode = PayloadEncode, + extra = Extra } = Trace, - Who = #{name => Name, type => Type, filter => Filter, payload_encode => PayloadEncode}, + Formatter = maps:get(formatter, Extra, text), + Who = #{ + name => Name, + type => Type, + filter => Filter, + payload_encode => PayloadEncode, + formatter => Formatter + }, emqx_trace_handler:install(Who, debug, log_file(Name, Start)). stop_trace(Finished, Started) -> @@ -517,6 +561,9 @@ to_trace(#{type := ip_address, ip_address := Filter} = Trace, Rec) -> Error -> Error end; +to_trace(#{type := ruleid, ruleid := Filter} = Trace, Rec) -> + Trace0 = maps:without([type, ruleid], Trace), + to_trace(Trace0, Rec#?TRACE{type = ruleid, filter = Filter}); to_trace(#{type := Type}, _Rec) -> {error, io_lib:format("required ~s field", [Type])}; to_trace(#{payload_encode := PayloadEncode} = Trace, Rec) -> @@ -532,6 +579,12 @@ to_trace(#{end_at := EndAt} = Trace, Rec) -> {ok, _Sec} -> {error, "end_at time has already passed"} end; +to_trace(#{formatter := Formatter} = Trace, Rec) -> + Extra = Rec#?TRACE.extra, + to_trace( + maps:remove(formatter, Trace), + Rec#?TRACE{extra = Extra#{formatter => Formatter}} + ); to_trace(_, Rec) -> {ok, Rec}. diff --git a/apps/emqx/src/emqx_trace/emqx_trace_handler.erl b/apps/emqx/src/emqx_trace/emqx_trace_handler.erl index 313826cde..8179f8c0a 100644 --- a/apps/emqx/src/emqx_trace/emqx_trace_handler.erl +++ b/apps/emqx/src/emqx_trace/emqx_trace_handler.erl @@ -27,12 +27,14 @@ install/3, install/4, install/5, + install/6, uninstall/1, uninstall/2 ]). %% For logger handler filters callbacks -export([ + filter_ruleid/2, filter_clientid/2, filter_topic/2, filter_ip_address/2 @@ -45,7 +47,8 @@ name := binary(), type := clientid | topic | ip_address, filter := emqx_types:clientid() | emqx_types:topic() | emqx_trace:ip_address(), - payload_encode := text | hidden | hex + payload_encode := text | hidden | hex, + formatter => json | text }. -define(CONFIG(_LogFile_), #{ @@ -68,17 +71,29 @@ Type :: clientid | topic | ip_address, Filter :: emqx_types:clientid() | emqx_types:topic() | string(), Level :: logger:level() | all, - LogFilePath :: string() + LogFilePath :: string(), + Formatter :: text | json ) -> ok | {error, term()}. -install(Name, Type, Filter, Level, LogFile) -> +install(Name, Type, Filter, Level, LogFile, Formatter) -> Who = #{ type => Type, filter => ensure_bin(Filter), name => ensure_bin(Name), - payload_encode => payload_encode() + payload_encode => payload_encode(), + formatter => Formatter }, install(Who, Level, LogFile). 
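Note (reviewer sketch, not part of the patch): install/6 is the old install/5
with the formatter choice appended as the last argument; a call could look like
this (trace name and file path are illustrative):

    emqx_trace_handler:install("t1", topic, <<"t/#">>, all, "t1.log", json).
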
+-spec install( + Name :: binary() | list(), + Type :: clientid | topic | ip_address, + Filter :: emqx_types:clientid() | emqx_types:topic() | string(), + Level :: logger:level() | all, + LogFilePath :: string() +) -> ok | {error, term()}. +install(Name, Type, Filter, Level, LogFile) -> + install(Name, Type, Filter, Level, LogFile, text). + -spec install( Type :: clientid | topic | ip_address, Filter :: emqx_types:clientid() | emqx_types:topic() | string(), @@ -133,9 +148,23 @@ uninstall(HandlerId) -> running() -> lists:foldl(fun filter_traces/2, [], emqx_logger:get_log_handlers(started)). +-spec filter_ruleid(logger:log_event(), {binary(), atom()}) -> logger:log_event() | stop. +filter_ruleid(#{meta := Meta = #{rule_id := RuleId}} = Log, {MatchId, _Name}) -> + RuleIDs = maps:get(rule_ids, Meta, #{}), + IsMatch = (RuleId =:= MatchId) orelse maps:get(MatchId, RuleIDs, false), + filter_ret(IsMatch andalso is_trace(Meta), Log); +filter_ruleid(#{meta := Meta = #{rule_ids := RuleIDs}} = Log, {MatchId, _Name}) -> + filter_ret(maps:get(MatchId, RuleIDs, false) andalso is_trace(Meta), Log); +filter_ruleid(_Log, _ExpectId) -> + stop. + -spec filter_clientid(logger:log_event(), {binary(), atom()}) -> logger:log_event() | stop. filter_clientid(#{meta := Meta = #{clientid := ClientId}} = Log, {MatchId, _Name}) -> - filter_ret(ClientId =:= MatchId andalso is_trace(Meta), Log); + ClientIDs = maps:get(client_ids, Meta, #{}), + IsMatch = (ClientId =:= MatchId) orelse maps:get(MatchId, ClientIDs, false), + filter_ret(IsMatch andalso is_trace(Meta), Log); +filter_clientid(#{meta := Meta = #{client_ids := ClientIDs}} = Log, {MatchId, _Name}) -> + filter_ret(maps:get(MatchId, ClientIDs, false) andalso is_trace(Meta), Log); filter_clientid(_Log, _ExpectId) -> stop. @@ -164,8 +193,14 @@ filters(#{type := clientid, filter := Filter, name := Name}) -> filters(#{type := topic, filter := Filter, name := Name}) -> [{topic, {fun ?MODULE:filter_topic/2, {ensure_bin(Filter), Name}}}]; filters(#{type := ip_address, filter := Filter, name := Name}) -> - [{ip_address, {fun ?MODULE:filter_ip_address/2, {ensure_list(Filter), Name}}}]. + [{ip_address, {fun ?MODULE:filter_ip_address/2, {ensure_list(Filter), Name}}}]; +filters(#{type := ruleid, filter := Filter, name := Name}) -> + [{ruleid, {fun ?MODULE:filter_ruleid/2, {ensure_bin(Filter), Name}}}]. +formatter(#{type := _Type, payload_encode := PayloadEncode, formatter := json}) -> + {emqx_trace_json_formatter, #{ + payload_encode => PayloadEncode + }}; formatter(#{type := _Type, payload_encode := PayloadEncode}) -> {emqx_trace_formatter, #{ %% template is for ?SLOG message not ?TRACE. @@ -184,7 +219,8 @@ filter_traces(#{id := Id, level := Level, dst := Dst, filters := Filters}, Acc) [{Type, {FilterFun, {Filter, Name}}}] when Type =:= topic orelse Type =:= clientid orelse - Type =:= ip_address + Type =:= ip_address orelse + Type =:= ruleid -> [Init#{type => Type, filter => Filter, name => Name, filter_fun => FilterFun} | Acc]; _ -> diff --git a/apps/emqx/src/emqx_trace/emqx_trace_json_formatter.erl b/apps/emqx/src/emqx_trace/emqx_trace_json_formatter.erl new file mode 100644 index 000000000..35b09b9b0 --- /dev/null +++ b/apps/emqx/src/emqx_trace/emqx_trace_json_formatter.erl @@ -0,0 +1,130 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2020-2024 EMQ Technologies Co., Ltd. All Rights Reserved. 
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+-module(emqx_trace_json_formatter).
+
+-include("emqx_mqtt.hrl").
+
+-export([format/2]).
+
+%% logger_formatter:config/0 is not exported.
+-type config() :: map().
+
+%%%-----------------------------------------------------------------
+%%% Callback Function
+%%%-----------------------------------------------------------------
+
+-spec format(LogEvent, Config) -> unicode:chardata() when
+    LogEvent :: logger:log_event(),
+    Config :: config().
+format(
+    LogMap,
+    #{payload_encode := PEncode}
+) ->
+    %% We just make some basic transformations on the input LogMap and then do
+    %% an external call to create the JSON text
+    Time = emqx_utils_calendar:now_to_rfc3339(microsecond),
+    LogMap1 = LogMap#{time => Time},
+    LogMap2 = prepare_log_map(LogMap1, PEncode),
+    [emqx_logger_jsonfmt:best_effort_json(LogMap2, [force_utf8]), "\n"].
+
+%%%-----------------------------------------------------------------
+%%% Helper Functions
+%%%-----------------------------------------------------------------
+
+prepare_log_map(LogMap, PEncode) ->
+    NewKeyValuePairs = [prepare_key_value(K, V, PEncode) || {K, V} <- maps:to_list(LogMap)],
+    maps:from_list(NewKeyValuePairs).
+
+prepare_key_value(payload = K, V, PEncode) ->
+    NewV =
+        try
+            format_payload(V, PEncode)
+        catch
+            _:_ ->
+                V
+        end,
+    {K, NewV};
+prepare_key_value(packet = K, V, PEncode) ->
+    NewV =
+        try
+            format_packet(V, PEncode)
+        catch
+            _:_ ->
+                V
+        end,
+    {K, NewV};
+prepare_key_value(rule_ids = K, V, _PEncode) ->
+    NewV =
+        try
+            format_map_set_to_list(V)
+        catch
+            _:_ ->
+                V
+        end,
+    {K, NewV};
+prepare_key_value(client_ids = K, V, _PEncode) ->
+    NewV =
+        try
+            format_map_set_to_list(V)
+        catch
+            _:_ ->
+                V
+        end,
+    {K, NewV};
+prepare_key_value(action_id = K, V, _PEncode) ->
+    try
+        {action_info, format_action_info(V)}
+    catch
+        _:_ ->
+            {K, V}
+    end;
+prepare_key_value(K, V, PEncode) when is_map(V) ->
+    {K, prepare_log_map(V, PEncode)};
+prepare_key_value(K, V, _PEncode) ->
+    {K, V}.
+
+format_packet(undefined, _) -> "";
+format_packet(Packet, Encode) -> emqx_packet:format(Packet, Encode).
+
+format_payload(undefined, _) ->
+    "";
+format_payload(_, hidden) ->
+    "******";
+format_payload(Payload, text) when ?MAX_PAYLOAD_FORMAT_LIMIT(Payload) ->
+    unicode:characters_to_list(Payload);
+format_payload(Payload, hex) when ?MAX_PAYLOAD_FORMAT_LIMIT(Payload) -> binary:encode_hex(Payload);
+format_payload(<<Part:?TRUNCATED_PAYLOAD_SIZE/binary, _/binary>> = Payload, Type) ->
+    emqx_packet:format_truncated_payload(Part, byte_size(Payload), Type).
+
+format_map_set_to_list(Map) ->
+    Items = [
+        begin
+            %% Assert that it is really a map set
+            true = V,
+            %% Assert that the keys have the expected type
+            true = is_binary(K),
+            K
+        end
+     || {K, V} <- maps:to_list(Map)
+    ],
+    lists:sort(Items).
+
+format_action_info(V) ->
+    [<<"action">>, Type, Name | _] = binary:split(V, <<":">>, [global]),
+    #{
+        type => Type,
+        name => Name
+    }.
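Note (reviewer sketch, not part of the patch): the formatter plugs into a
regular logger handler the same way emqx_trace_formatter does; for local
inspection on a recent OTP, something along these lines should work (handler id
and file name are made up):

    ok = logger:add_handler(trace_json_demo, logger_std_h, #{
        config => #{file => "trace.json.log"},
        formatter => {emqx_trace_json_formatter, #{payload_encode => text}}
    }).
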
diff --git a/apps/emqx/test/emqx_client_SUITE.erl b/apps/emqx/test/emqx_client_SUITE.erl index ba38d92ff..f0afe6195 100644 --- a/apps/emqx/test/emqx_client_SUITE.erl +++ b/apps/emqx/test/emqx_client_SUITE.erl @@ -395,13 +395,14 @@ t_certdn_as_alias(_) -> test_cert_extraction_as_alias(Which) -> %% extract the first two chars - Re = <<"^(..).*$">>, ClientId = iolist_to_binary(["ClientIdFor_", atom_to_list(Which)]), - emqx_config:put_zone_conf(default, [mqtt, client_attrs_init], #{ - extract_from => Which, - extract_regexp => Re, - extract_as => <<"alias">> - }), + {ok, Compiled} = emqx_variform:compile("substr(" ++ atom_to_list(Which) ++ ",0,2)"), + emqx_config:put_zone_conf(default, [mqtt, client_attrs_init], [ + #{ + expression => Compiled, + set_as_attr => <<"alias">> + } + ]), SslConf = emqx_common_test_helpers:client_mtls('tlsv1.2'), {ok, Client} = emqtt:start_link([ {clientid, ClientId}, {port, 8883}, {ssl, true}, {ssl_opts, SslConf} @@ -416,10 +417,13 @@ test_cert_extraction_as_alias(Which) -> t_client_attr_from_user_property(_Config) -> ClientId = atom_to_binary(?FUNCTION_NAME), - emqx_config:put_zone_conf(default, [mqtt, client_attrs_init], #{ - extract_from => user_property, - extract_as => <<"group">> - }), + {ok, Compiled} = emqx_variform:compile("user_property.group"), + emqx_config:put_zone_conf(default, [mqtt, client_attrs_init], [ + #{ + expression => Compiled, + set_as_attr => <<"group">> + } + ]), SslConf = emqx_common_test_helpers:client_mtls('tlsv1.3'), {ok, Client} = emqtt:start_link([ {clientid, ClientId}, diff --git a/apps/emqx/test/emqx_config_SUITE.erl b/apps/emqx/test/emqx_config_SUITE.erl index b3e60f793..a9b4a8328 100644 --- a/apps/emqx/test/emqx_config_SUITE.erl +++ b/apps/emqx/test/emqx_config_SUITE.erl @@ -454,7 +454,7 @@ zone_global_defaults() -> upgrade_qos => false, use_username_as_clientid => false, wildcard_subscription => true, - client_attrs_init => disabled + client_attrs_init => [] }, overload_protection => #{ diff --git a/apps/emqx/test/emqx_listeners_SUITE.erl b/apps/emqx/test/emqx_listeners_SUITE.erl index acd7656d7..ba84699c6 100644 --- a/apps/emqx/test/emqx_listeners_SUITE.erl +++ b/apps/emqx/test/emqx_listeners_SUITE.erl @@ -150,11 +150,13 @@ t_client_attr_as_mountpoint(_Config) -> <<"limiter">> => #{}, <<"mountpoint">> => <<"groups/${client_attrs.ns}/">> }, - emqx_config:put_zone_conf(default, [mqtt, client_attrs_init], #{ - extract_from => clientid, - extract_regexp => <<"^(.+)-.+$">>, - extract_as => <<"ns">> - }), + {ok, Compiled} = emqx_variform:compile("nth(1,tokens(clientid,'-'))"), + emqx_config:put_zone_conf(default, [mqtt, client_attrs_init], [ + #{ + expression => Compiled, + set_as_attr => <<"ns">> + } + ]), emqx_logger:set_log_level(debug), with_listener(tcp, attr_as_moutpoint, ListenerConf, fun() -> {ok, Client} = emqtt:start_link(#{ @@ -170,7 +172,7 @@ t_client_attr_as_mountpoint(_Config) -> ?assertMatch([_], emqx_router:match_routes(MatchTopic)), emqtt:stop(Client) end), - emqx_config:put_zone_conf(default, [mqtt, client_attrs_init], disabled), + emqx_config:put_zone_conf(default, [mqtt, client_attrs_init], []), ok. 
t_current_conns_tcp(_Config) -> diff --git a/apps/emqx/test/emqx_persistent_messages_SUITE.erl b/apps/emqx/test/emqx_persistent_messages_SUITE.erl index 94bc58908..492fcaa6b 100644 --- a/apps/emqx/test/emqx_persistent_messages_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_messages_SUITE.erl @@ -476,7 +476,7 @@ t_replication_options(_Config) -> resend_window := 60 } }, - emqx_ds_replication_layer_meta:get_options(?PERSISTENT_MESSAGE_DB) + emqx_ds_replication_layer_meta:db_config(?PERSISTENT_MESSAGE_DB) ), ?assertMatch( #{ @@ -584,6 +584,8 @@ message(Topic, Payload, PublishedAt) -> id = emqx_guid:gen() }. +on_message_dropped(#message{flags = #{sys := true}}, _Context, _Res, _TestPid) -> + ok; on_message_dropped(Msg, Context, Res, TestPid) -> ErrCtx = #{msg => Msg, ctx => Context, res => Res}, ct:pal("this hook should not be called.\n ~p", [ErrCtx]), diff --git a/apps/emqx/test/emqx_persistent_session_ds_state_tests.erl b/apps/emqx/test/emqx_persistent_session_ds_state_tests.erl index 61e0575a8..375b4f4b1 100644 --- a/apps/emqx/test/emqx_persistent_session_ds_state_tests.erl +++ b/apps/emqx/test/emqx_persistent_session_ds_state_tests.erl @@ -74,9 +74,6 @@ session_id() -> topic() -> oneof([<<"foo">>, <<"bar">>, <<"foo/#">>, <<"//+/#">>]). -subid() -> - oneof([[]]). - subscription() -> oneof([#{}]). @@ -129,18 +126,25 @@ put_req() -> {Track, Seqno}, {seqno_track(), seqno()}, {#s.seqno, put_seqno, Track, Seqno} + ), + ?LET( + {Topic, Subscription}, + {topic(), subscription()}, + {#s.subs, put_subscription, Topic, Subscription} ) ]). get_req() -> oneof([ {#s.streams, get_stream, stream_id()}, - {#s.seqno, get_seqno, seqno_track()} + {#s.seqno, get_seqno, seqno_track()}, + {#s.subs, get_subscription, topic()} ]). del_req() -> oneof([ - {#s.streams, del_stream, stream_id()} + {#s.streams, del_stream, stream_id()}, + {#s.subs, del_subscription, topic()} ]). command(S) -> @@ -153,13 +157,6 @@ command(S) -> {2, {call, ?MODULE, reopen, [session_id(S)]}}, {2, {call, ?MODULE, commit, [session_id(S)]}}, - %% Subscriptions: - {3, - {call, ?MODULE, put_subscription, [ - session_id(S), topic(), subid(), subscription() - ]}}, - {3, {call, ?MODULE, del_subscription, [session_id(S), topic(), subid()]}}, - %% Metadata: {3, {call, ?MODULE, put_metadata, [session_id(S), put_metadata()]}}, {3, {call, ?MODULE, get_metadata, [session_id(S), get_metadata()]}}, @@ -170,7 +167,6 @@ command(S) -> {3, {call, ?MODULE, gen_del, [session_id(S), del_req()]}}, %% Getters: - {4, {call, ?MODULE, get_subscriptions, [session_id(S)]}}, {1, {call, ?MODULE, iterate_sessions, [batch_size()]}} ]); false -> @@ -207,19 +203,6 @@ postcondition(S, {call, ?MODULE, gen_get, [SessionId, {Idx, Fun, Key}]}, Result) #{session_id => SessionId, key => Key, 'fun' => Fun} ), true; -postcondition(S, {call, ?MODULE, get_subscriptions, [SessionId]}, Result) -> - #{SessionId := #s{subs = Subs}} = S, - ?assertEqual(maps:size(Subs), emqx_topic_gbt:size(Result)), - maps:foreach( - fun({TopicFilter, Id}, Expected) -> - ?assertEqual( - Expected, - emqx_topic_gbt:lookup(TopicFilter, Id, Result, default) - ) - end, - Subs - ), - true; postcondition(_, _, _) -> true. 
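Note (reviewer sketch, not part of the patch): subscriptions are no longer
modeled with dedicated commands; they ride on the generic requests, so the
generators above can now emit, e.g.:

    {#s.subs, put_subscription, <<"foo/#">>, #{}}

which the suite presumably dispatches via gen_put as
emqx_persistent_session_ds_state:put_subscription(<<"foo/#">>, #{}, S).
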
@@ -227,22 +210,6 @@ next_state(S, _V, {call, ?MODULE, create_new, [SessionId]}) -> S#{SessionId => #s{}}; next_state(S, _V, {call, ?MODULE, delete, [SessionId]}) -> maps:remove(SessionId, S); -next_state(S, _V, {call, ?MODULE, put_subscription, [SessionId, TopicFilter, SubId, Subscription]}) -> - Key = {TopicFilter, SubId}, - update( - SessionId, - #s.subs, - fun(Subs) -> Subs#{Key => Subscription} end, - S - ); -next_state(S, _V, {call, ?MODULE, del_subscription, [SessionId, TopicFilter, SubId]}) -> - Key = {TopicFilter, SubId}, - update( - SessionId, - #s.subs, - fun(Subs) -> maps:remove(Key, Subs) end, - S - ); next_state(S, _V, {call, ?MODULE, put_metadata, [SessionId, {Key, _Fun, Val}]}) -> update( SessionId, @@ -296,19 +263,6 @@ reopen(SessionId) -> {ok, S} = emqx_persistent_session_ds_state:open(SessionId), put_state(SessionId, S). -put_subscription(SessionId, TopicFilter, SubId, Subscription) -> - S = emqx_persistent_session_ds_state:put_subscription( - TopicFilter, SubId, Subscription, get_state(SessionId) - ), - put_state(SessionId, S). - -del_subscription(SessionId, TopicFilter, SubId) -> - S = emqx_persistent_session_ds_state:del_subscription(TopicFilter, SubId, get_state(SessionId)), - put_state(SessionId, S). - -get_subscriptions(SessionId) -> - emqx_persistent_session_ds_state:get_subscriptions(get_state(SessionId)). - put_metadata(SessionId, {_MetaKey, Fun, Value}) -> S = apply(emqx_persistent_session_ds_state, Fun, [Value, get_state(SessionId)]), put_state(SessionId, S). diff --git a/apps/emqx/test/emqx_shared_sub_SUITE.erl b/apps/emqx/test/emqx_shared_sub_SUITE.erl index df713ac74..040b3d295 100644 --- a/apps/emqx/test/emqx_shared_sub_SUITE.erl +++ b/apps/emqx/test/emqx_shared_sub_SUITE.erl @@ -1004,9 +1004,9 @@ t_different_groups_same_topic(Config) when is_list(Config) -> GroupB = <<"bb">>, Topic = <<"t/1">>, - SharedTopicGroupA = ?SHARE(GroupA, Topic), + SharedTopicGroupA = format_share(GroupA, Topic), ?UPDATE_SUB_QOS(C, SharedTopicGroupA, ?QOS_2), - SharedTopicGroupB = ?SHARE(GroupB, Topic), + SharedTopicGroupB = format_share(GroupB, Topic), ?UPDATE_SUB_QOS(C, SharedTopicGroupB, ?QOS_2), ?retry( @@ -1050,11 +1050,11 @@ t_different_groups_update_subopts(Config) when is_list(Config) -> Topic = <<"t/1">>, GroupA = <<"aa">>, GroupB = <<"bb">>, - SharedTopicGroupA = ?SHARE(GroupA, Topic), - SharedTopicGroupB = ?SHARE(GroupB, Topic), + SharedTopicGroupA = format_share(GroupA, Topic), + SharedTopicGroupB = format_share(GroupB, Topic), Fun = fun(Group, QoS) -> - ?UPDATE_SUB_QOS(C, ?SHARE(Group, Topic), QoS), + ?UPDATE_SUB_QOS(C, format_share(Group, Topic), QoS), ?assertMatch( #{qos := QoS}, emqx_broker:get_subopts(ClientId, emqx_topic:make_shared_record(Group, Topic)) @@ -1153,6 +1153,9 @@ t_queue_subscription(Config) when is_list(Config) -> %% help functions %%-------------------------------------------------------------------- +format_share(Group, Topic) -> + emqx_topic:maybe_format_share(emqx_topic:make_shared_record(Group, Topic)). + kill_process(Pid) -> kill_process(Pid, fun(_) -> erlang:exit(Pid, kill) end). 
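Note (reviewer sketch, not part of the patch): the new format_share/2 helper
renders the shared-subscription record back into its textual form, so assuming
the standard MQTT $share syntax:

    format_share(<<"aa">>, <<"t/1">>)  %% expected: <<"$share/aa/t/1">>
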
diff --git a/apps/emqx/test/emqx_trace_SUITE.erl b/apps/emqx/test/emqx_trace_SUITE.erl index 1652c11b9..ad2991445 100644 --- a/apps/emqx/test/emqx_trace_SUITE.erl +++ b/apps/emqx/test/emqx_trace_SUITE.erl @@ -96,7 +96,7 @@ t_base_create_delete(_Config) -> start_at => Now, end_at => Now + 30 * 60, payload_encode => text, - extra => #{} + formatter => text } ], ?assertEqual(ExpectFormat, emqx_trace:format([TraceRec])), @@ -511,4 +511,13 @@ build_old_trace_data() -> reload() -> catch ok = gen_server:stop(emqx_trace), - {ok, _Pid} = emqx_trace:start_link(). + case emqx_trace:start_link() of + {ok, _Pid} = Res -> + Res; + NotOKRes -> + ct:pal( + "emqx_trace:start_link() gave result: ~p\n" + "(perhaps it is already started)", + [NotOKRes] + ) + end. diff --git a/apps/emqx_auth/src/emqx_authn/emqx_authn_chains.erl b/apps/emqx_auth/src/emqx_authn/emqx_authn_chains.erl index 62163dda3..0d21058e3 100644 --- a/apps/emqx_auth/src/emqx_authn/emqx_authn_chains.erl +++ b/apps/emqx_auth/src/emqx_authn/emqx_authn_chains.erl @@ -353,13 +353,13 @@ init(_Opts) -> ok = emqx_config_handler:add_handler([listeners, '?', '?', ?CONF_ROOT], Module), ok = hook_deny(), {ok, #{hooked => false, providers => #{}, init_done => false}, - {continue, initialize_authentication}}. + {continue, {initialize_authentication, init}}}. handle_call(get_providers, _From, #{providers := Providers} = State) -> reply(Providers, State); handle_call( {register_providers, Providers}, - _From, + From, #{providers := Reg0} = State ) -> case lists:filter(fun({T, _}) -> maps:is_key(T, Reg0) end, Providers) of @@ -371,7 +371,7 @@ handle_call( Reg0, Providers ), - reply(ok, State#{providers := Reg}, initialize_authentication); + reply(ok, State#{providers := Reg}, {initialize_authentication, From}); Clashes -> reply({error, {authentication_type_clash, Clashes}}, State) end; @@ -447,10 +447,10 @@ handle_call(Req, _From, State) -> ?SLOG(error, #{msg => "unexpected_call", call => Req}), {reply, ignored, State}. -handle_continue(initialize_authentication, #{init_done := true} = State) -> +handle_continue({initialize_authentication, _From}, #{init_done := true} = State) -> {noreply, State}; -handle_continue(initialize_authentication, #{providers := Providers} = State) -> - InitDone = initialize_authentication(Providers), +handle_continue({initialize_authentication, From}, #{providers := Providers} = State) -> + InitDone = initialize_authentication(Providers, From), {noreply, maybe_hook(State#{init_done := InitDone})}. handle_cast(Req, State) -> @@ -484,11 +484,13 @@ code_change(_OldVsn, State, _Extra) -> %% Private functions %%------------------------------------------------------------------------------ -initialize_authentication(Providers) -> +initialize_authentication(Providers, From) -> ProviderTypes = maps:keys(Providers), Chains = chain_configs(), HasProviders = has_providers_for_configs(Chains, ProviderTypes), - do_initialize_authentication(Providers, Chains, HasProviders). + Result = do_initialize_authentication(Providers, Chains, HasProviders), + ?tp(info, authn_chains_initialization_done, #{from => From, result => Result}), + Result. do_initialize_authentication(_Providers, _Chains, _HasProviders = false) -> false; @@ -500,7 +502,6 @@ do_initialize_authentication(Providers, Chains, _HasProviders = true) -> Chains ), ok = unhook_deny(), - ?tp(info, authn_chains_initialization_done, #{}), true. 
initialize_chain_authentication(_Providers, _ChainName, []) -> diff --git a/apps/emqx_auth/test/emqx_authn/emqx_authn_init_SUITE.erl b/apps/emqx_auth/test/emqx_authn/emqx_authn_init_SUITE.erl index fec1f3fa4..78e179ccb 100644 --- a/apps/emqx_auth/test/emqx_authn/emqx_authn_init_SUITE.erl +++ b/apps/emqx_auth/test/emqx_authn/emqx_authn_init_SUITE.erl @@ -69,9 +69,10 @@ t_initialize(_Config) -> emqx_access_control:authenticate(?CLIENTINFO) ), + Self = self(), ?assertWaitEvent( ok = emqx_authn_test_lib:register_fake_providers([{password_based, built_in_database}]), - #{?snk_kind := authn_chains_initialization_done}, + #{?snk_kind := authn_chains_initialization_done, from := {Self, _}}, 100 ), diff --git a/apps/emqx_auth/test/emqx_authz/emqx_authz_SUITE.erl b/apps/emqx_auth/test/emqx_authz/emqx_authz_SUITE.erl index c88dcc244..70dd0bbb6 100644 --- a/apps/emqx_auth/test/emqx_authz/emqx_authz_SUITE.erl +++ b/apps/emqx_auth/test/emqx_authz/emqx_authz_SUITE.erl @@ -557,12 +557,14 @@ t_publish_last_will_testament_denied_topic(_Config) -> t_alias_prefix(_Config) -> {ok, _} = emqx_authz:update(?CMD_REPLACE, [?SOURCE_FILE_CLIENT_ATTR]), - ExtractSuffix = <<"^.*-(.*)$">>, - emqx_config:put_zone_conf(default, [mqtt, client_attrs_init], #{ - extract_from => clientid, - extract_regexp => ExtractSuffix, - extract_as => <<"alias">> - }), + %% '^.*-(.*)$': extract the suffix after the last '-' + {ok, Compiled} = emqx_variform:compile("concat(regex_extract(clientid,'^.*-(.*)$'))"), + emqx_config:put_zone_conf(default, [mqtt, client_attrs_init], [ + #{ + expression => Compiled, + set_as_attr => <<"alias">> + } + ]), ClientId = <<"org1-name2">>, SubTopic = <<"name2/#">>, SubTopicNotAllowed = <<"name3/#">>, @@ -572,7 +574,7 @@ t_alias_prefix(_Config) -> ?assertMatch({ok, _, [?RC_NOT_AUTHORIZED]}, emqtt:subscribe(C, SubTopicNotAllowed)), unlink(C), emqtt:stop(C), - emqx_config:put_zone_conf(default, [mqtt, client_attrs_init], disalbed), + emqx_config:put_zone_conf(default, [mqtt, client_attrs_init], []), ok. %% client is allowed by ACL to publish to its LWT topic, is connected, diff --git a/apps/emqx_bridge/src/emqx_action_info.erl b/apps/emqx_bridge/src/emqx_action_info.erl index 464b2e429..a8aaf9fdd 100644 --- a/apps/emqx_bridge/src/emqx_action_info.erl +++ b/apps/emqx_bridge/src/emqx_action_info.erl @@ -41,6 +41,9 @@ ]). -export([clean_cache/0]). +%% For tests +-export([hard_coded_test_action_info_modules/0]). + -callback bridge_v1_type_name() -> atom() | { @@ -128,8 +131,13 @@ hard_coded_action_info_modules_common() -> emqx_bridge_mqtt_pubsub_action_info ]. +%% This exists so that it can be mocked for test cases +hard_coded_test_action_info_modules() -> []. + hard_coded_action_info_modules() -> - hard_coded_action_info_modules_common() ++ hard_coded_action_info_modules_ee(). + hard_coded_action_info_modules_common() ++ + hard_coded_action_info_modules_ee() ++ + ?MODULE:hard_coded_test_action_info_modules(). %% ==================================================================== %% API diff --git a/apps/emqx_bridge/src/emqx_bridge_v2.erl b/apps/emqx_bridge/src/emqx_bridge_v2.erl index e834dc42e..e6feac7bd 100644 --- a/apps/emqx_bridge/src/emqx_bridge_v2.erl +++ b/apps/emqx_bridge/src/emqx_bridge_v2.erl @@ -1030,7 +1030,26 @@ bridge_v2_type_to_connector_type(Type) -> import_config(RawConf) -> %% actions structure - emqx_bridge:import_config(RawConf, <<"actions">>, ?ROOT_KEY_ACTIONS, config_key_path()). 
+ ActionRes = emqx_bridge:import_config( + RawConf, <<"actions">>, ?ROOT_KEY_ACTIONS, config_key_path() + ), + SourceRes = emqx_bridge:import_config( + RawConf, <<"sources">>, ?ROOT_KEY_SOURCES, config_key_path_sources() + ), + group_import_results([ActionRes, SourceRes]). + +group_import_results(Results0) -> + Results = lists:foldr( + fun + ({ok, OkRes}, {OkAcc, ErrAcc}) -> + {[OkRes | OkAcc], ErrAcc}; + ({error, ErrRes}, {OkAcc, ErrAcc}) -> + {OkAcc, [ErrRes | ErrAcc]} + end, + {[], []}, + Results0 + ), + {results, Results}. %%==================================================================== %% Config Update Handler API diff --git a/apps/emqx_bridge/src/emqx_bridge_v2_api.erl b/apps/emqx_bridge/src/emqx_bridge_v2_api.erl index a7bef1952..e33e1ca07 100644 --- a/apps/emqx_bridge/src/emqx_bridge_v2_api.erl +++ b/apps/emqx_bridge/src/emqx_bridge_v2_api.erl @@ -1007,7 +1007,13 @@ call_operation(NodeOrAll, OperFunc, Args = [_Nodes, _ConfRootKey, BridgeType, Br {error, not_implemented} -> ?NOT_IMPLEMENTED; {error, timeout} -> - ?BAD_REQUEST(<<"Request timeout">>); + BridgeId = emqx_bridge_resource:bridge_id(BridgeType, BridgeName), + ?SLOG(warning, #{ + msg => "bridge_bpapi_call_timeout", + bridge => BridgeId, + call => OperFunc + }), + ?SERVICE_UNAVAILABLE(<<"Request timeout">>); {error, {start_pool_failed, Name, Reason}} -> Msg = bin( io_lib:format("Failed to start ~p pool for reason ~p", [Name, redact(Reason)]) @@ -1018,9 +1024,8 @@ call_operation(NodeOrAll, OperFunc, Args = [_Nodes, _ConfRootKey, BridgeType, Br ?SLOG(warning, #{ msg => "bridge_inconsistent_in_cluster_for_call_operation", reason => not_found, - type => BridgeType, - name => BridgeName, - bridge => BridgeId + bridge => BridgeId, + call => OperFunc }), ?SERVICE_UNAVAILABLE(<<"Bridge not found on remote node: ", BridgeId/binary>>); {error, {node_not_found, Node}} -> diff --git a/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl b/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl index 30b6c8b34..08b3270ea 100644 --- a/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl +++ b/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl @@ -825,22 +825,53 @@ do_start_stop_bridges(Type, Config) -> %% Connecting to this endpoint should always timeout BadServer = iolist_to_binary(io_lib:format("localhost:~B", [ListenPort])), BadName = <<"bad_", (atom_to_binary(Type))/binary>>, + CreateRes0 = request_json( + post, + uri(["bridges"]), + ?MQTT_BRIDGE(BadServer, BadName), + Config + ), ?assertMatch( {ok, 201, #{ <<"type">> := ?BRIDGE_TYPE_MQTT, <<"name">> := BadName, <<"enable">> := true, - <<"server">> := BadServer, - <<"status">> := <<"connecting">>, - <<"node_status">> := [_ | _] + <<"server">> := BadServer }}, - request_json( - post, - uri(["bridges"]), - ?MQTT_BRIDGE(BadServer, BadName), - Config - ) + CreateRes0 ), + {ok, 201, CreateRes1} = CreateRes0, + case CreateRes1 of + #{ + <<"node_status">> := [ + #{ + <<"status">> := <<"disconnected">>, + <<"status_reason">> := <<"connack_timeout">> + }, + #{<<"status">> := <<"connecting">>} + | _ + ], + %% `inconsistent': one node is `?status_disconnected' (because it has already + %% timed out), the other node is `?status_connecting' (started later and + %% haven't timed out yet) + <<"status">> := <<"inconsistent">>, + <<"status_reason">> := <<"connack_timeout">> + } -> + ok; + #{ + <<"node_status">> := [_, _ | _], + <<"status">> := <<"disconnected">>, + <<"status_reason">> := <<"connack_timeout">> + } -> + ok; + #{ + <<"node_status">> := [_], + <<"status">> := <<"connecting">> + } -> + ok; + _ -> + 
error({unexpected_result, CreateRes1}) + end, BadBridgeID = emqx_bridge_resource:bridge_id(?BRIDGE_TYPE_MQTT, BadName), ?assertMatch( %% request from product: return 400 on such errors diff --git a/apps/emqx_bridge/test/emqx_bridge_v2_testlib.erl b/apps/emqx_bridge/test/emqx_bridge_v2_testlib.erl index 8a781d6e7..a3316e39d 100644 --- a/apps/emqx_bridge/test/emqx_bridge_v2_testlib.erl +++ b/apps/emqx_bridge/test/emqx_bridge_v2_testlib.erl @@ -705,7 +705,7 @@ t_async_query(Config, MakeMessageFun, IsSuccessCheck, TracePoint) -> ), receive {result, Result} -> IsSuccessCheck(Result) - after 5_000 -> + after 8_000 -> throw(timeout) end, ok. diff --git a/apps/emqx_bridge_cassandra/rebar.config b/apps/emqx_bridge_cassandra/rebar.config index e98146d78..13f95139c 100644 --- a/apps/emqx_bridge_cassandra/rebar.config +++ b/apps/emqx_bridge_cassandra/rebar.config @@ -2,7 +2,7 @@ {erl_opts, [debug_info]}. {deps, [ - {ecql, {git, "https://github.com/emqx/ecql.git", {tag, "v0.6.1"}}}, + {ecql, {git, "https://github.com/emqx/ecql.git", {tag, "v0.7.0"}}}, {emqx_connector, {path, "../../apps/emqx_connector"}}, {emqx_resource, {path, "../../apps/emqx_resource"}}, {emqx_bridge, {path, "../../apps/emqx_bridge"}} diff --git a/apps/emqx_bridge_cassandra/src/emqx_bridge_cassandra.erl b/apps/emqx_bridge_cassandra/src/emqx_bridge_cassandra.erl index d34cb1950..80fbc80d2 100644 --- a/apps/emqx_bridge_cassandra/src/emqx_bridge_cassandra.erl +++ b/apps/emqx_bridge_cassandra/src/emqx_bridge_cassandra.erl @@ -181,7 +181,7 @@ fields("post", Type) -> cql_field() -> {cql, mk( - binary(), + emqx_schema:template(), #{desc => ?DESC("cql_template"), default => ?DEFAULT_CQL, format => <<"sql">>} )}. diff --git a/apps/emqx_bridge_cassandra/test/emqx_bridge_cassandra_SUITE.erl b/apps/emqx_bridge_cassandra/test/emqx_bridge_cassandra_SUITE.erl index 868d0191e..449d1fa51 100644 --- a/apps/emqx_bridge_cassandra/test/emqx_bridge_cassandra_SUITE.erl +++ b/apps/emqx_bridge_cassandra/test/emqx_bridge_cassandra_SUITE.erl @@ -581,7 +581,6 @@ t_write_failure(Config) -> ) end), fun(Trace0) -> - ct:pal("trace: ~p", [Trace0]), Trace = ?of_kind( [buffer_worker_flush_nack, buffer_worker_retry_inflight_failed], Trace0 ), diff --git a/apps/emqx_bridge_clickhouse/src/emqx_bridge_clickhouse.erl b/apps/emqx_bridge_clickhouse/src/emqx_bridge_clickhouse.erl index 833c2570d..1e07f2340 100644 --- a/apps/emqx_bridge_clickhouse/src/emqx_bridge_clickhouse.erl +++ b/apps/emqx_bridge_clickhouse/src/emqx_bridge_clickhouse.erl @@ -184,8 +184,12 @@ fields("post", Type) -> sql_field() -> {sql, mk( - binary(), - #{desc => ?DESC("sql_template"), default => ?DEFAULT_SQL, format => <<"sql">>} + emqx_schema:template(), + #{ + desc => ?DESC("sql_template"), + default => ?DEFAULT_SQL, + format => <<"sql">> + } )}. batch_value_separator_field() -> diff --git a/apps/emqx_bridge_dynamo/src/emqx_bridge_dynamo.erl b/apps/emqx_bridge_dynamo/src/emqx_bridge_dynamo.erl index 13828c0f7..8dafa3922 100644 --- a/apps/emqx_bridge_dynamo/src/emqx_bridge_dynamo.erl +++ b/apps/emqx_bridge_dynamo/src/emqx_bridge_dynamo.erl @@ -87,6 +87,7 @@ connector_values() -> <<"url">> => <<"http://127.0.0.1:8000">>, <<"aws_access_key_id">> => <<"root">>, <<"aws_secret_access_key">> => <<"******">>, + <<"region">> => <<"us-west-2">>, <<"pool_size">> => 8, <<"resource_opts">> => #{ @@ -113,7 +114,8 @@ action_values() -> <<"parameters">> => #{ <<"table">> => <<"mqtt_msg">>, - <<"template">> => ?DEFAULT_TEMPLATE + <<"template">> => ?DEFAULT_TEMPLATE, + <<"hash_key">> => <<"clientid">> } }. 
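Note (reviewer sketch, not part of the patch): with region now required on the
connector and hash_key required on the action, a minimal Dynamo action
parameters object looks roughly like this (range_key stays optional; values are
illustrative):

    #{
        <<"table">> => <<"mqtt_msg">>,
        <<"hash_key">> => <<"clientid">>,
        <<"range_key">> => <<"timestamp">>
    }
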
@@ -161,10 +163,16 @@ fields(dynamo_action) -> fields(action_parameters) -> Parameters = [ - {template, + {template, template_field_schema()}, + {hash_key, mk( binary(), - #{desc => ?DESC("template"), default => ?DEFAULT_TEMPLATE} + #{desc => ?DESC("hash_key"), required => true} + )}, + {range_key, + mk( + binary(), + #{desc => ?DESC("range_key"), required => false} )} ] ++ emqx_bridge_dynamo_connector:fields(config), lists:foldl( @@ -174,6 +182,7 @@ fields(action_parameters) -> Parameters, [ url, + region, aws_access_key_id, aws_secret_access_key, pool_size, @@ -199,16 +208,22 @@ fields(connector_resource_opts) -> fields("config") -> [ {enable, mk(boolean(), #{desc => ?DESC("config_enable"), default => true})}, - {template, - mk( - binary(), - #{desc => ?DESC("template"), default => ?DEFAULT_TEMPLATE} - )}, + {template, template_field_schema()}, {local_topic, mk( binary(), #{desc => ?DESC("local_topic"), default => undefined} )}, + {hash_key, + mk( + binary(), + #{desc => ?DESC("hash_key"), required => true} + )}, + {range_key, + mk( + binary(), + #{desc => ?DESC("range_key"), required => false} + )}, {resource_opts, mk( ref(?MODULE, "creation_opts"), @@ -230,6 +245,15 @@ fields("put") -> fields("get") -> emqx_bridge_schema:status_fields() ++ fields("post"). +template_field_schema() -> + mk( + emqx_schema:template(), + #{ + desc => ?DESC("template"), + default => ?DEFAULT_TEMPLATE + } + ). + desc("config") -> ?DESC("desc_config"); desc(Method) when Method =:= "get"; Method =:= "put"; Method =:= "post" -> diff --git a/apps/emqx_bridge_dynamo/src/emqx_bridge_dynamo_connector.erl b/apps/emqx_bridge_dynamo/src/emqx_bridge_dynamo_connector.erl index 36f54a63f..372472dda 100644 --- a/apps/emqx_bridge_dynamo/src/emqx_bridge_dynamo_connector.erl +++ b/apps/emqx_bridge_dynamo/src/emqx_bridge_dynamo_connector.erl @@ -45,6 +45,7 @@ roots() -> fields(config) -> [ {url, mk(binary(), #{required => true, desc => ?DESC("url")})}, + {region, mk(binary(), #{required => true, desc => ?DESC("region")})}, {table, mk(binary(), #{required => true, desc => ?DESC("table")})}, {aws_access_key_id, mk( @@ -102,6 +103,12 @@ on_start( pool_name => InstanceId, installed_channels => #{} }, + case Config of + #{region := Region} -> + application:set_env(erlcloud, aws_region, to_str(Region)); + _ -> + ok + end, case emqx_resource_pool:start(InstanceId, ?MODULE, Options) of ok -> {ok, State}; @@ -126,12 +133,20 @@ on_add_channel( create_channel_state( #{parameters := Conf} = _ChannelConfig ) -> - #{ - table := Table - } = Conf, + Keys = maps:with([hash_key, range_key], Conf), + Keys1 = maps:fold( + fun(K, V, Acc) -> + Acc#{K := erlang:binary_to_existing_atom(V)} + end, + Keys, + Keys + ), + + Base = maps:without([template, hash_key, range_key], Conf), + Base1 = maps:merge(Base, Keys1), + Templates = parse_template_from_conf(Conf), - State = #{ - table => Table, + State = Base1#{ templates => Templates }, {ok, State}. 
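Note (reviewer sketch, not part of the patch): the fold in
create_channel_state/1 rewrites only the keys picked out by maps:with/2, e.g.

    #{hash_key => <<"clientid">>, range_key => <<"timestamp">>}

becomes #{hash_key => clientid, range_key => timestamp} in the channel state;
binary_to_existing_atom/1 requires those names to already exist as atoms in the
VM, which holds for common keys such as clientid.
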
@@ -232,11 +247,16 @@ do_query(
         templates := Templates
     } = ChannelState,
     Result =
-        ecpool:pick_and_do(
-            PoolName,
-            {emqx_bridge_dynamo_connector_client, query, [Table, QueryTuple, Templates]},
-            no_handover
-        ),
+        case ensure_dynamo_keys(Query, ChannelState) of
+            true ->
+                ecpool:pick_and_do(
+                    PoolName,
+                    {emqx_bridge_dynamo_connector_client, query, [Table, QueryTuple, Templates]},
+                    no_handover
+                );
+            _ ->
+                {error, missing_filter_or_range_key}
+        end,
 
     case Result of
         {error, Reason} ->
@@ -288,6 +308,25 @@ get_query_tuple([{_ChannelId, {_QueryType, _Data}} | _]) ->
 get_query_tuple([InsertQuery | _]) ->
     get_query_tuple(InsertQuery).
 
+ensure_dynamo_keys({_, Data} = Query, State) when is_map(Data) ->
+    ensure_dynamo_keys([Query], State);
+ensure_dynamo_keys([{_, Data} | _] = Queries, State) when is_map(Data) ->
+    Keys = maps:to_list(maps:with([hash_key, range_key], State)),
+    lists:all(
+        fun({_, Query}) ->
+            lists:all(
+                fun({_, Key}) ->
+                    maps:is_key(Key, Query)
+                end,
+                Keys
+            )
+        end,
+        Queries
+    );
+%% this is not an insert query
+ensure_dynamo_keys(_Query, _State) ->
+    true.
+
 connect(Opts) ->
     Config = proplists:get_value(config, Opts),
     {ok, _Pid} = emqx_bridge_dynamo_connector_client:start_link(Config).
diff --git a/apps/emqx_bridge_dynamo/test/emqx_bridge_dynamo_SUITE.erl b/apps/emqx_bridge_dynamo/test/emqx_bridge_dynamo_SUITE.erl
index dab7b21f0..ff3d5824e 100644
--- a/apps/emqx_bridge_dynamo/test/emqx_bridge_dynamo_SUITE.erl
+++ b/apps/emqx_bridge_dynamo/test/emqx_bridge_dynamo_SUITE.erl
@@ -16,6 +16,7 @@
 -define(TABLE_BIN, to_bin(?TABLE)).
 -define(ACCESS_KEY_ID, "root").
 -define(SECRET_ACCESS_KEY, "public").
+-define(REGION, "us-west-2").
 -define(HOST, "dynamo").
 -define(PORT, 8000).
 -define(SCHEMA, "http://").
@@ -177,7 +178,9 @@ dynamo_config(BridgeType, Config) ->
         "bridges.~s.~s {"
         "\n enable = true"
         "\n url = \"http://~s:~p\""
+        "\n region = ~p"
         "\n table = ~p"
+        "\n hash_key = \"clientid\""
         "\n aws_access_key_id = ~p"
         "\n aws_secret_access_key = ~p"
         "\n resource_opts = {"
@@ -191,6 +194,7 @@ dynamo_config(BridgeType, Config) ->
             Name,
             Host,
             Port,
+            ?REGION,
             ?TABLE,
             ?ACCESS_KEY_ID,
             %% NOTE: using file-based secrets with HOCON configs
@@ -210,7 +214,8 @@ action_config(Config) ->
             <<"enable">> => true,
             <<"parameters">> =>
                 #{
-                    <<"table">> => ?TABLE
+                    <<"table">> => ?TABLE,
+                    <<"hash_key">> => <<"clientid">>
                 },
             <<"resource_opts">> =>
                 #{
@@ -234,6 +239,7 @@ connector_config(Config) ->
             <<"url">> => URL,
             <<"aws_access_key_id">> => ?ACCESS_KEY_ID,
             <<"aws_secret_access_key">> => AccessKey,
+            <<"region">> => ?REGION,
             <<"enable">> => true,
             <<"pool_size">> => 8,
             <<"resource_opts">> =>
@@ -355,7 +361,7 @@ t_setup_via_config_and_publish(Config) ->
         create_bridge(Config)
     ),
     MsgId = emqx_utils:gen_id(),
-    SentData = #{id => MsgId, payload => ?PAYLOAD},
+    SentData = #{clientid => <<"clientid">>, id => MsgId, payload => ?PAYLOAD},
     ?check_trace(
         begin
             ?wait_async_action(
@@ -421,7 +427,7 @@ t_setup_via_http_api_and_publish(Config) ->
         create_bridge_http(PgsqlConfig)
     ),
     MsgId = emqx_utils:gen_id(),
-    SentData = #{id => MsgId, payload => ?PAYLOAD},
+    SentData = #{clientid => <<"clientid">>, id => MsgId, payload => ?PAYLOAD},
     ?check_trace(
         begin
             ?wait_async_action(
@@ -486,7 +492,7 @@ t_write_failure(Config) ->
         #{?snk_kind := resource_connected_enter},
         20_000
     ),
-    SentData = #{id => emqx_utils:gen_id(), payload => ?PAYLOAD},
+    SentData = #{clientid => <<"clientid">>, id => emqx_utils:gen_id(), payload => ?PAYLOAD},
     emqx_common_test_helpers:with_failure(down, ProxyName, ProxyHost, ProxyPort,
fun() -> ?assertMatch( {error, {resource_error, #{reason := timeout}}}, send_message(Config, SentData) @@ -513,12 +519,21 @@ t_simple_query(Config) -> ok. t_missing_data(Config) -> + ?assertMatch( + {ok, _}, + create_bridge(Config) + ), + Result = send_message(Config, #{clientid => <<"clientid">>}), + ?assertMatch({error, {<<"ValidationException">>, <<>>}}, Result), + ok. + +t_missing_hash_key(Config) -> ?assertMatch( {ok, _}, create_bridge(Config) ), Result = send_message(Config, #{}), - ?assertMatch({error, {unrecoverable_error, {invalid_request, _}}}, Result), + ?assertMatch({error, missing_filter_or_range_key}, Result), ok. t_bad_parameter(Config) -> @@ -543,7 +558,9 @@ t_action_create_via_http(Config) -> emqx_bridge_v2_testlib:t_create_via_http(Config). t_action_sync_query(Config) -> - MakeMessageFun = fun() -> #{id => <<"the_message_id">>, payload => ?PAYLOAD} end, + MakeMessageFun = fun() -> + #{clientid => <<"clientid">>, id => <<"the_message_id">>, payload => ?PAYLOAD} + end, IsSuccessCheck = fun(Result) -> ?assertEqual({ok, []}, Result) end, TracePoint = dynamo_connector_query_return, emqx_bridge_v2_testlib:t_sync_query(Config, MakeMessageFun, IsSuccessCheck, TracePoint). diff --git a/apps/emqx_bridge_es/src/emqx_bridge_es.erl b/apps/emqx_bridge_es/src/emqx_bridge_es.erl index 97f3986e4..def0b76f7 100644 --- a/apps/emqx_bridge_es/src/emqx_bridge_es.erl +++ b/apps/emqx_bridge_es/src/emqx_bridge_es.erl @@ -135,7 +135,7 @@ overwrite() -> index() -> {index, ?HOCON( - binary(), + emqx_schema:template(), #{ required => true, example => <<"${payload.index}">>, @@ -146,7 +146,7 @@ index() -> id(Required) -> {id, ?HOCON( - binary(), + emqx_schema:template(), #{ required => Required, example => <<"${payload.id}">>, @@ -157,7 +157,7 @@ id(Required) -> doc() -> {doc, ?HOCON( - binary(), + emqx_schema:template(), #{ required => false, example => <<"${payload.doc}">>, @@ -187,7 +187,7 @@ doc_as_upsert() -> routing() -> {routing, ?HOCON( - binary(), + emqx_schema:template(), #{ required => false, example => <<"${payload.routing}">>, diff --git a/apps/emqx_bridge_gcp_pubsub/src/emqx_bridge_gcp_pubsub.erl b/apps/emqx_bridge_gcp_pubsub/src/emqx_bridge_gcp_pubsub.erl index 007bbc1a0..a5991af81 100644 --- a/apps/emqx_bridge_gcp_pubsub/src/emqx_bridge_gcp_pubsub.erl +++ b/apps/emqx_bridge_gcp_pubsub/src/emqx_bridge_gcp_pubsub.erl @@ -122,7 +122,7 @@ fields(producer) -> )}, {ordering_key_template, sc( - binary(), + emqx_schema:template(), #{ default => <<>>, desc => ?DESC("ordering_key_template") @@ -130,7 +130,7 @@ fields(producer) -> )}, {payload_template, sc( - binary(), + emqx_schema:template(), #{ default => <<>>, desc => ?DESC("payload_template") @@ -201,8 +201,11 @@ fields(consumer_topic_mapping) -> {qos, mk(emqx_schema:qos(), #{default => 0, desc => ?DESC(consumer_mqtt_qos)})}, {payload_template, mk( - string(), - #{default => <<"${.}">>, desc => ?DESC(consumer_mqtt_payload)} + emqx_schema:template(), + #{ + default => <<"${.}">>, + desc => ?DESC(consumer_mqtt_payload) + } )} ]; fields("consumer_resource_opts") -> @@ -221,14 +224,18 @@ fields("consumer_resource_opts") -> fields(key_value_pair) -> [ {key, - mk(binary(), #{ + mk(emqx_schema:template(), #{ required => true, validator => [ emqx_resource_validator:not_empty("Key templates must not be empty") ], desc => ?DESC(kv_pair_key) })}, - {value, mk(binary(), #{required => true, desc => ?DESC(kv_pair_value)})} + {value, + mk(emqx_schema:template(), #{ + required => true, + desc => ?DESC(kv_pair_value) + })} ]; fields("get_producer") -> 
emqx_bridge_schema:status_fields() ++ fields("post_producer"); diff --git a/apps/emqx_bridge_gcp_pubsub/test/emqx_bridge_gcp_pubsub_producer_SUITE.erl b/apps/emqx_bridge_gcp_pubsub/test/emqx_bridge_gcp_pubsub_producer_SUITE.erl index 6666a3fd0..d96157f8c 100644 --- a/apps/emqx_bridge_gcp_pubsub/test/emqx_bridge_gcp_pubsub_producer_SUITE.erl +++ b/apps/emqx_bridge_gcp_pubsub/test/emqx_bridge_gcp_pubsub_producer_SUITE.erl @@ -1929,7 +1929,6 @@ t_bad_attributes(Config) -> ok end, fun(Trace) -> - ct:pal("trace:\n ~p", [Trace]), ?assertMatch( [ #{placeholder := [<<"payload">>, <<"ok">>], value := #{}}, diff --git a/apps/emqx_bridge_greptimedb/rebar.config b/apps/emqx_bridge_greptimedb/rebar.config index bb37de16e..c1039e6f4 100644 --- a/apps/emqx_bridge_greptimedb/rebar.config +++ b/apps/emqx_bridge_greptimedb/rebar.config @@ -6,7 +6,7 @@ {emqx_connector, {path, "../../apps/emqx_connector"}}, {emqx_resource, {path, "../../apps/emqx_resource"}}, {emqx_bridge, {path, "../../apps/emqx_bridge"}}, - {greptimedb, {git, "https://github.com/GreptimeTeam/greptimedb-client-erl", {tag, "v0.1.7"}}} + {greptimedb, {git, "https://github.com/GreptimeTeam/greptimedb-ingester-erl", {tag, "v0.1.8"}}} ]}. {plugins, [rebar3_path_deps]}. {project_plugins, [erlfmt]}. diff --git a/apps/emqx_bridge_greptimedb/test/emqx_bridge_greptimedb_SUITE.erl b/apps/emqx_bridge_greptimedb/test/emqx_bridge_greptimedb_SUITE.erl index 6e7a23637..96cf0d7c9 100644 --- a/apps/emqx_bridge_greptimedb/test/emqx_bridge_greptimedb_SUITE.erl +++ b/apps/emqx_bridge_greptimedb/test/emqx_bridge_greptimedb_SUITE.erl @@ -324,7 +324,7 @@ query_by_clientid(Topic, ClientId, Config) -> {"Content-Type", "application/x-www-form-urlencoded"} ], Body = <<"sql=select * from \"", Topic/binary, "\" where clientid='", ClientId/binary, "'">>, - {ok, 200, _Headers, RawBody0} = + {ok, StatusCode, _Headers, RawBody0} = ehttpc:request( EHttpcPoolName, post, @@ -335,7 +335,6 @@ query_by_clientid(Topic, ClientId, Config) -> case emqx_utils_json:decode(RawBody0, [return_maps]) of #{ - <<"code">> := 0, <<"output">> := [ #{ <<"records">> := #{ @@ -344,12 +343,12 @@ query_by_clientid(Topic, ClientId, Config) -> } } ] - } -> + } when StatusCode >= 200 andalso StatusCode =< 300 -> make_row(Schema, Rows); #{ <<"code">> := Code, <<"error">> := Error - } -> + } when StatusCode > 300 -> GreptimedbName = ?config(greptimedb_name, Config), Type = greptimedb_type_bin(?config(greptimedb_type, Config)), BridgeId = emqx_bridge_resource:bridge_id(Type, GreptimedbName), @@ -367,7 +366,9 @@ query_by_clientid(Topic, ClientId, Config) -> _ -> %% Table not found #{} - end + end; + Error -> + {error, Error} end. make_row(null, _Rows) -> @@ -910,69 +911,6 @@ t_start_exception(Config) -> ), ok. 
-t_write_failure(Config) -> - ProxyName = ?config(proxy_name, Config), - ProxyPort = ?config(proxy_port, Config), - ProxyHost = ?config(proxy_host, Config), - QueryMode = ?config(query_mode, Config), - {ok, _} = create_bridge(Config), - ClientId = emqx_guid:to_hexstr(emqx_guid:gen()), - Payload = #{ - int_key => -123, - bool => true, - float_key => 24.5, - uint_key => 123 - }, - SentData = #{ - <<"clientid">> => ClientId, - <<"topic">> => atom_to_binary(?FUNCTION_NAME), - <<"timestamp">> => erlang:system_time(millisecond), - <<"payload">> => Payload - }, - ?check_trace( - emqx_common_test_helpers:with_failure(down, ProxyName, ProxyHost, ProxyPort, fun() -> - case QueryMode of - sync -> - ?wait_async_action( - ?assertMatch( - {error, {resource_error, #{reason := timeout}}}, - send_message(Config, SentData) - ), - #{?snk_kind := handle_async_reply, action := nack}, - 1_000 - ); - async -> - ?wait_async_action( - ?assertEqual(ok, send_message(Config, SentData)), - #{?snk_kind := handle_async_reply}, - 1_000 - ) - end - end), - fun(Trace0) -> - case QueryMode of - sync -> - Trace = ?of_kind(handle_async_reply, Trace0), - ?assertMatch([_ | _], Trace), - [#{result := Result} | _] = Trace, - ?assert( - not emqx_bridge_greptimedb_connector:is_unrecoverable_error(Result), - #{got => Result} - ); - async -> - Trace = ?of_kind(handle_async_reply, Trace0), - ?assertMatch([_ | _], Trace), - [#{result := Result} | _] = Trace, - ?assert( - not emqx_bridge_greptimedb_connector:is_unrecoverable_error(Result), - #{got => Result} - ) - end, - ok - end - ), - ok. - t_missing_field(Config) -> BatchSize = ?config(batch_size, Config), IsBatch = BatchSize > 1, diff --git a/apps/emqx_bridge_hstreamdb/src/emqx_bridge_hstreamdb.erl b/apps/emqx_bridge_hstreamdb/src/emqx_bridge_hstreamdb.erl index 7fa19c9a4..7024a2e07 100644 --- a/apps/emqx_bridge_hstreamdb/src/emqx_bridge_hstreamdb.erl +++ b/apps/emqx_bridge_hstreamdb/src/emqx_bridge_hstreamdb.erl @@ -167,13 +167,13 @@ fields(action_parameters) -> })}, {partition_key, - mk(binary(), #{ - required => false, desc => ?DESC(emqx_bridge_hstreamdb_connector, "partition_key") + mk(emqx_schema:template(), #{ + required => false, + desc => ?DESC(emqx_bridge_hstreamdb_connector, "partition_key") })}, {grpc_flush_timeout, fun grpc_flush_timeout/1}, - {record_template, - mk(binary(), #{default => <<"${payload}">>, desc => ?DESC("record_template")})}, + {record_template, record_template_schema()}, {aggregation_pool_size, mk(pos_integer(), #{ default => ?DEFAULT_AGG_POOL_SIZE, desc => ?DESC("aggregation_pool_size") @@ -222,6 +222,12 @@ fields("put") -> hstream_bridge_common_fields() ++ connector_fields(). +record_template_schema() -> + mk(emqx_schema:template(), #{ + default => <<"${payload}">>, + desc => ?DESC("record_template") + }). + grpc_timeout(type) -> emqx_schema:timeout_duration_ms(); grpc_timeout(desc) -> ?DESC(emqx_bridge_hstreamdb_connector, "grpc_timeout"); grpc_timeout(default) -> ?DEFAULT_GRPC_TIMEOUT_RAW; @@ -239,8 +245,7 @@ hstream_bridge_common_fields() -> [ {direction, mk(egress, #{desc => ?DESC("config_direction"), default => egress})}, {local_topic, mk(binary(), #{desc => ?DESC("local_topic")})}, - {record_template, - mk(binary(), #{default => <<"${payload}">>, desc => ?DESC("record_template")})} + {record_template, record_template_schema()} ] ++ emqx_resource_schema:fields("resource_opts"). 
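%% The greptimedb suite's query_by_clientid/3, earlier in this file's diff, now
%% branches on the HTTP status instead of a fixed `code` field in the body;
%% roughly (`classify/1` is only an illustrative name, the guards sit on the
%% case clauses that match the decoded JSON):
%%
%%   classify(StatusCode) when StatusCode >= 200 andalso StatusCode =< 300 -> records;
%%   classify(StatusCode) when StatusCode > 300 -> error_body.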
diff --git a/apps/emqx_bridge_http/src/emqx_bridge_http_connector.erl b/apps/emqx_bridge_http/src/emqx_bridge_http_connector.erl index ae1e727ca..9be7457e1 100644 --- a/apps/emqx_bridge_http/src/emqx_bridge_http_connector.erl +++ b/apps/emqx_bridge_http/src/emqx_bridge_http_connector.erl @@ -128,9 +128,10 @@ fields("request") -> desc => ?DESC("method"), validator => fun ?MODULE:validate_method/1 })}, - {path, hoconsc:mk(binary(), #{required => false, desc => ?DESC("path")})}, - {body, hoconsc:mk(binary(), #{required => false, desc => ?DESC("body")})}, - {headers, hoconsc:mk(map(), #{required => false, desc => ?DESC("headers")})}, + {path, hoconsc:mk(emqx_schema:template(), #{required => false, desc => ?DESC("path")})}, + {body, hoconsc:mk(emqx_schema:template(), #{required => false, desc => ?DESC("body")})}, + {headers, + hoconsc:mk(map(), #{required => false, desc => ?DESC("headers"), is_template => true})}, {max_retries, sc( non_neg_integer(), @@ -315,7 +316,7 @@ on_query(InstId, {send_message, Msg}, State) -> ClientId = maps:get(clientid, Msg, undefined), on_query( InstId, - {ClientId, Method, {Path, Headers, Body}, Timeout, Retry}, + {undefined, ClientId, Method, {Path, Headers, Body}, Timeout, Retry}, State ) end; @@ -345,19 +346,19 @@ on_query( ClientId = clientid(Msg), on_query( InstId, - {ClientId, Method, {Path, Headers, Body}, Timeout, Retry}, + {ActionId, ClientId, Method, {Path, Headers, Body}, Timeout, Retry}, State ) end; on_query(InstId, {Method, Request}, State) -> %% TODO: Get retry from State - on_query(InstId, {undefined, Method, Request, 5000, _Retry = 2}, State); + on_query(InstId, {undefined, undefined, Method, Request, 5000, _Retry = 2}, State); on_query(InstId, {Method, Request, Timeout}, State) -> %% TODO: Get retry from State - on_query(InstId, {undefined, Method, Request, Timeout, _Retry = 2}, State); + on_query(InstId, {undefined, undefined, Method, Request, Timeout, _Retry = 2}, State); on_query( InstId, - {KeyOrNum, Method, Request, Timeout, Retry}, + {ActionId, KeyOrNum, Method, Request, Timeout, Retry}, #{base_path := BasePath} = State ) -> ?TRACE( @@ -367,10 +368,12 @@ on_query( request => redact_request(Request), note => ?READACT_REQUEST_NOTE, connector => InstId, + action_id => ActionId, state => redact(State) } ), NRequest = formalize_request(Method, BasePath, Request), + trace_rendered_action_template(ActionId, Method, NRequest, Timeout), Worker = resolve_pool_worker(State, KeyOrNum), Result0 = ehttpc:request( Worker, @@ -427,7 +430,7 @@ on_query_async(InstId, {send_message, Msg}, ReplyFunAndArgs, State) -> ClientId = maps:get(clientid, Msg, undefined), on_query_async( InstId, - {ClientId, Method, {Path, Headers, Body}, Timeout}, + {undefined, ClientId, Method, {Path, Headers, Body}, Timeout}, ReplyFunAndArgs, State ) @@ -457,14 +460,14 @@ on_query_async( ClientId = clientid(Msg), on_query_async( InstId, - {ClientId, Method, {Path, Headers, Body}, Timeout}, + {ActionId, ClientId, Method, {Path, Headers, Body}, Timeout}, ReplyFunAndArgs, State ) end; on_query_async( InstId, - {KeyOrNum, Method, Request, Timeout}, + {ActionId, KeyOrNum, Method, Request, Timeout}, ReplyFunAndArgs, #{base_path := BasePath} = State ) -> @@ -480,6 +483,7 @@ on_query_async( } ), NRequest = formalize_request(Method, BasePath, Request), + trace_rendered_action_template(ActionId, Method, NRequest, Timeout), MaxAttempts = maps:get(max_attempts, State, 3), Context = #{ attempt => 1, @@ -499,6 +503,31 @@ on_query_async( ), {ok, Worker}. 
+trace_rendered_action_template(ActionId, Method, NRequest, Timeout) -> + case NRequest of + {Path, Headers} -> + emqx_trace:rendered_action_template( + ActionId, + #{ + path => Path, + method => Method, + headers => emqx_utils_redact:redact_headers(Headers), + timeout => Timeout + } + ); + {Path, Headers, Body} -> + emqx_trace:rendered_action_template( + ActionId, + #{ + path => Path, + method => Method, + headers => emqx_utils_redact:redact_headers(Headers), + timeout => Timeout, + body => Body + } + ) + end. + resolve_pool_worker(State, undefined) -> resolve_pool_worker(State, self()); resolve_pool_worker(#{pool_name := PoolName} = State, Key) -> diff --git a/apps/emqx_bridge_http/src/emqx_bridge_http_schema.erl b/apps/emqx_bridge_http/src/emqx_bridge_http_schema.erl index 43f3d1748..cadbcf0d2 100644 --- a/apps/emqx_bridge_http/src/emqx_bridge_http_schema.erl +++ b/apps/emqx_bridge_http/src/emqx_bridge_http_schema.erl @@ -114,7 +114,7 @@ fields("parameters_opts") -> [ {path, mk( - binary(), + emqx_schema:template(), #{ desc => ?DESC("config_path"), required => false @@ -270,7 +270,8 @@ headers_field() -> <<"content-type">> => <<"application/json">>, <<"keep-alive">> => <<"timeout=5">> }, - desc => ?DESC("config_headers") + desc => ?DESC("config_headers"), + is_template => true } )}. @@ -287,7 +288,7 @@ method_field() -> body_field() -> {body, mk( - binary(), + emqx_schema:template(), #{ default => undefined, desc => ?DESC("config_body") diff --git a/apps/emqx_bridge_http/test/emqx_bridge_http_SUITE.erl b/apps/emqx_bridge_http/test/emqx_bridge_http_SUITE.erl index 3da04012d..9d215d815 100644 --- a/apps/emqx_bridge_http/test/emqx_bridge_http_SUITE.erl +++ b/apps/emqx_bridge_http/test/emqx_bridge_http_SUITE.erl @@ -30,8 +30,8 @@ -include_lib("snabbkaffe/include/snabbkaffe.hrl"). -include_lib("emqx/include/asserts.hrl"). --define(BRIDGE_TYPE, <<"webhook">>). --define(BRIDGE_NAME, atom_to_binary(?MODULE)). +-define(BRIDGE_TYPE, emqx_bridge_http_test_lib:bridge_type()). +-define(BRIDGE_NAME, emqx_bridge_http_test_lib:bridge_name()). all() -> emqx_common_test_helpers:all(?MODULE). 
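%% Sketch of the trace point threaded through above: once the action id is
%% known, the fully rendered request is traced with sensitive headers redacted
%% (argument values assumed for illustration):
%%
%%   trace_rendered_action_template(
%%       <<"action:http:my_action">>, post,
%%       {<<"/path">>, [{<<"authorization">>, <<"Bearer secret">>}], <<"{\"id\":1}">>},
%%       5000
%%   ).
%%   %% emits an `action_template_rendered` entry in the rule trace, with the
%%   %% authorization header masked by emqx_utils_redact:redact_headers/1.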
@@ -73,21 +73,10 @@ suite() -> init_per_testcase(t_bad_bridge_config, Config) -> Config; -init_per_testcase(t_send_async_connection_timeout, Config) -> - HTTPPath = <<"/path">>, - ServerSSLOpts = false, - {ok, {HTTPPort, _Pid}} = emqx_bridge_http_connector_test_server:start_link( - _Port = random, HTTPPath, ServerSSLOpts - ), - ResponseDelayMS = 500, - ok = emqx_bridge_http_connector_test_server:set_handler( - success_http_handler(#{response_delay => ResponseDelayMS}) - ), - [ - {http_server, #{port => HTTPPort, path => HTTPPath}}, - {response_delay_ms, ResponseDelayMS} - | Config - ]; +init_per_testcase(Case, Config) when + Case =:= t_send_async_connection_timeout orelse Case =:= t_send_get_trace_messages +-> + emqx_bridge_http_test_lib:init_http_success_server(Config); init_per_testcase(t_path_not_found, Config) -> HTTPPath = <<"/nonexisting/path">>, ServerSSLOpts = false, @@ -115,7 +104,9 @@ init_per_testcase(t_bridge_probes_header_atoms, Config) -> {ok, {HTTPPort, _Pid}} = emqx_bridge_http_connector_test_server:start_link( _Port = random, HTTPPath, ServerSSLOpts ), - ok = emqx_bridge_http_connector_test_server:set_handler(success_http_handler()), + ok = emqx_bridge_http_connector_test_server:set_handler( + emqx_bridge_http_test_lib:success_http_handler() + ), [{http_server, #{port => HTTPPort, path => HTTPPath}} | Config]; init_per_testcase(_TestCase, Config) -> Server = start_http_server(#{response_delay_ms => 0}), @@ -126,7 +117,8 @@ end_per_testcase(TestCase, _Config) when TestCase =:= t_too_many_requests; TestCase =:= t_rule_action_expired; TestCase =:= t_bridge_probes_header_atoms; - TestCase =:= t_send_async_connection_timeout + TestCase =:= t_send_async_connection_timeout; + TestCase =:= t_send_get_trace_messages -> ok = emqx_bridge_http_connector_test_server:stop(), persistent_term:erase({?MODULE, times_called}), @@ -250,115 +242,8 @@ get_metrics(Name) -> Type = <<"http">>, emqx_bridge:get_metrics(Type, Name). 
-bridge_async_config(#{port := Port} = Config) -> - Type = maps:get(type, Config, ?BRIDGE_TYPE), - Name = maps:get(name, Config, ?BRIDGE_NAME), - Host = maps:get(host, Config, "localhost"), - Path = maps:get(path, Config, ""), - PoolSize = maps:get(pool_size, Config, 1), - QueryMode = maps:get(query_mode, Config, "async"), - ConnectTimeout = maps:get(connect_timeout, Config, "1s"), - RequestTimeout = maps:get(request_timeout, Config, "10s"), - ResumeInterval = maps:get(resume_interval, Config, "1s"), - HealthCheckInterval = maps:get(health_check_interval, Config, "200ms"), - ResourceRequestTTL = maps:get(resource_request_ttl, Config, "infinity"), - LocalTopic = - case maps:find(local_topic, Config) of - {ok, LT} -> - lists:flatten(["local_topic = \"", LT, "\""]); - error -> - "" - end, - ConfigString = io_lib:format( - "bridges.~s.~s {\n" - " url = \"http://~s:~p~s\"\n" - " connect_timeout = \"~p\"\n" - " enable = true\n" - %% local_topic - " ~s\n" - " enable_pipelining = 100\n" - " max_retries = 2\n" - " method = \"post\"\n" - " pool_size = ~p\n" - " pool_type = \"random\"\n" - " request_timeout = \"~s\"\n" - " body = \"${id}\"\n" - " resource_opts {\n" - " inflight_window = 100\n" - " health_check_interval = \"~s\"\n" - " max_buffer_bytes = \"1GB\"\n" - " query_mode = \"~s\"\n" - " request_ttl = \"~p\"\n" - " resume_interval = \"~s\"\n" - " start_after_created = \"true\"\n" - " start_timeout = \"5s\"\n" - " worker_pool_size = \"1\"\n" - " }\n" - " ssl {\n" - " enable = false\n" - " }\n" - "}\n", - [ - Type, - Name, - Host, - Port, - Path, - ConnectTimeout, - LocalTopic, - PoolSize, - RequestTimeout, - HealthCheckInterval, - QueryMode, - ResourceRequestTTL, - ResumeInterval - ] - ), - ct:pal(ConfigString), - parse_and_check(ConfigString, Type, Name). - -parse_and_check(ConfigString, BridgeType, Name) -> - {ok, RawConf} = hocon:binary(ConfigString, #{format => map}), - hocon_tconf:check_plain(emqx_bridge_schema, RawConf, #{required => false, atom_key => false}), - #{<<"bridges">> := #{BridgeType := #{Name := RetConfig}}} = RawConf, - RetConfig. - make_bridge(Config) -> - Type = ?BRIDGE_TYPE, - Name = ?BRIDGE_NAME, - BridgeConfig = bridge_async_config(Config#{ - name => Name, - type => Type - }), - {ok, _} = emqx_bridge:create( - Type, - Name, - BridgeConfig - ), - emqx_bridge_resource:bridge_id(Type, Name). - -success_http_handler() -> - success_http_handler(#{response_delay => 0}). - -success_http_handler(Opts) -> - ResponseDelay = maps:get(response_delay, Opts, 0), - TestPid = self(), - fun(Req0, State) -> - {ok, Body, Req} = cowboy_req:read_body(Req0), - Headers = cowboy_req:headers(Req), - ct:pal("http request received: ~p", [ - #{body => Body, headers => Headers, response_delay => ResponseDelay} - ]), - ResponseDelay > 0 andalso timer:sleep(ResponseDelay), - TestPid ! {http, Headers, Body}, - Rep = cowboy_req:reply( - 200, - #{<<"content-type">> => <<"text/plain">>}, - <<"hello">>, - Req - ), - {ok, Rep, State} - end. + emqx_bridge_http_test_lib:make_bridge(Config). not_found_http_handler() -> TestPid = self(), @@ -452,6 +337,102 @@ t_send_async_connection_timeout(Config) -> receive_request_notifications(MessageIDs, ResponseDelayMS, []), ok. 
+t_send_get_trace_messages(Config) -> + ResponseDelayMS = ?config(response_delay_ms, Config), + #{port := Port, path := Path} = ?config(http_server, Config), + BridgeID = make_bridge(#{ + port => Port, + path => Path, + pool_size => 1, + query_mode => "async", + connect_timeout => integer_to_list(ResponseDelayMS * 2) ++ "ms", + request_timeout => "10s", + resume_interval => "200ms", + health_check_interval => "200ms", + resource_request_ttl => "infinity" + }), + RuleTopic = iolist_to_binary([<<"my_rule_topic/">>, atom_to_binary(?FUNCTION_NAME)]), + SQL = <<"SELECT payload.id as id FROM \"", RuleTopic/binary, "\"">>, + {ok, #{<<"id">> := RuleId}} = + emqx_bridge_testlib:create_rule_and_action_http( + ?BRIDGE_TYPE, + RuleTopic, + Config, + #{sql => SQL} + ), + %% =================================== + %% Create trace for RuleId + %% =================================== + Now = erlang:system_time(second) - 10, + Start = Now, + End = Now + 60, + TraceName = atom_to_binary(?FUNCTION_NAME), + Trace = #{ + name => TraceName, + type => ruleid, + ruleid => RuleId, + start_at => Start, + end_at => End + }, + emqx_trace_SUITE:reload(), + ok = emqx_trace:clear(), + {ok, _} = emqx_trace:create(Trace), + %% =================================== + + ResourceId = emqx_bridge_resource:resource_id(BridgeID), + ?retry( + _Interval0 = 200, + _NAttempts0 = 20, + ?assertMatch({ok, connected}, emqx_resource_manager:health_check(ResourceId)) + ), + ?retry( + _Interval0 = 200, + _NAttempts0 = 20, + ?assertEqual(<<>>, read_rule_trace_file(TraceName, Now)) + ), + Msg = emqx_message:make(RuleTopic, <<"{\"id\": 1}">>), + emqx:publish(Msg), + ?retry( + _Interval = 500, + _NAttempts = 20, + ?assertMatch( + #{ + counters := #{ + 'matched' := 1, + 'actions.failed' := 0, + 'actions.failed.unknown' := 0, + 'actions.success' := 1, + 'actions.total' := 1 + } + }, + emqx_metrics_worker:get_metrics(rule_metrics, RuleId) + ) + ), + + ok = emqx_trace_handler_SUITE:filesync(TraceName, ruleid), + {ok, Bin} = file:read_file(emqx_trace:log_file(TraceName, Now)), + + ?retry( + _Interval0 = 200, + _NAttempts0 = 20, + begin + Bin = read_rule_trace_file(TraceName, Now), + ?assertNotEqual(nomatch, binary:match(Bin, [<<"rule_activated">>])), + ?assertNotEqual(nomatch, binary:match(Bin, [<<"SQL_yielded_result">>])), + ?assertNotEqual(nomatch, binary:match(Bin, [<<"bridge_action">>])), + ?assertNotEqual(nomatch, binary:match(Bin, [<<"action_template_rendered">>])), + ?assertNotEqual(nomatch, binary:match(Bin, [<<"QUERY_ASYNC">>])) + end + ), + emqx_trace:delete(TraceName), + ok. + +read_rule_trace_file(TraceName, From) -> + emqx_trace:check(), + ok = emqx_trace_handler_SUITE:filesync(TraceName, ruleid), + {ok, Bin} = file:read_file(emqx_trace:log_file(TraceName, From)), + Bin. + t_async_free_retries(Config) -> #{port := Port} = ?config(http_server, Config), _BridgeID = make_bridge(#{ @@ -518,7 +499,7 @@ t_async_common_retries(Config) -> ok. 
t_bad_bridge_config(_Config) -> - BridgeConfig = bridge_async_config(#{port => 12345}), + BridgeConfig = emqx_bridge_http_test_lib:bridge_async_config(#{port => 12345}), ?assertMatch( {ok, {{_, 201, _}, _Headers, #{ @@ -540,7 +521,7 @@ t_bad_bridge_config(_Config) -> t_start_stop(Config) -> #{port := Port} = ?config(http_server, Config), - BridgeConfig = bridge_async_config(#{ + BridgeConfig = emqx_bridge_http_test_lib:bridge_async_config(#{ type => ?BRIDGE_TYPE, name => ?BRIDGE_NAME, port => Port @@ -554,7 +535,7 @@ t_path_not_found(Config) -> begin #{port := Port, path := Path} = ?config(http_server, Config), MQTTTopic = <<"t/webhook">>, - BridgeConfig = bridge_async_config(#{ + BridgeConfig = emqx_bridge_http_test_lib:bridge_async_config(#{ type => ?BRIDGE_TYPE, name => ?BRIDGE_NAME, local_topic => MQTTTopic, @@ -593,7 +574,7 @@ t_too_many_requests(Config) -> begin #{port := Port, path := Path} = ?config(http_server, Config), MQTTTopic = <<"t/webhook">>, - BridgeConfig = bridge_async_config(#{ + BridgeConfig = emqx_bridge_http_test_lib:bridge_async_config(#{ type => ?BRIDGE_TYPE, name => ?BRIDGE_NAME, local_topic => MQTTTopic, @@ -633,7 +614,7 @@ t_rule_action_expired(Config) -> ?check_trace( begin RuleTopic = <<"t/webhook/rule">>, - BridgeConfig = bridge_async_config(#{ + BridgeConfig = emqx_bridge_http_test_lib:bridge_async_config(#{ type => ?BRIDGE_TYPE, name => ?BRIDGE_NAME, host => "non.existent.host", @@ -689,7 +670,7 @@ t_bridge_probes_header_atoms(Config) -> ?check_trace( begin LocalTopic = <<"t/local/topic">>, - BridgeConfig0 = bridge_async_config(#{ + BridgeConfig0 = emqx_bridge_http_test_lib:bridge_async_config(#{ type => ?BRIDGE_TYPE, name => ?BRIDGE_NAME, port => Port, diff --git a/apps/emqx_bridge_http/test/emqx_bridge_http_test_lib.erl b/apps/emqx_bridge_http/test/emqx_bridge_http_test_lib.erl new file mode 100644 index 000000000..4959a24c3 --- /dev/null +++ b/apps/emqx_bridge_http/test/emqx_bridge_http_test_lib.erl @@ -0,0 +1,161 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2020-2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_bridge_http_test_lib). + +-export([ + bridge_type/0, + bridge_name/0, + make_bridge/1, + bridge_async_config/1, + init_http_success_server/1, + success_http_handler/0 +]). + +-define(BRIDGE_TYPE, bridge_type()). +-define(BRIDGE_NAME, bridge_name()). + +bridge_type() -> + <<"webhook">>. + +bridge_name() -> + atom_to_binary(?MODULE). + +make_bridge(Config) -> + Type = ?BRIDGE_TYPE, + Name = ?BRIDGE_NAME, + BridgeConfig = bridge_async_config(Config#{ + name => Name, + type => Type + }), + {ok, _} = emqx_bridge:create( + Type, + Name, + BridgeConfig + ), + emqx_bridge_resource:bridge_id(Type, Name). 
+ +bridge_async_config(#{port := Port} = Config) -> + Type = maps:get(type, Config, ?BRIDGE_TYPE), + Name = maps:get(name, Config, ?BRIDGE_NAME), + Host = maps:get(host, Config, "localhost"), + Path = maps:get(path, Config, ""), + PoolSize = maps:get(pool_size, Config, 1), + QueryMode = maps:get(query_mode, Config, "async"), + ConnectTimeout = maps:get(connect_timeout, Config, "1s"), + RequestTimeout = maps:get(request_timeout, Config, "10s"), + ResumeInterval = maps:get(resume_interval, Config, "1s"), + HealthCheckInterval = maps:get(health_check_interval, Config, "200ms"), + ResourceRequestTTL = maps:get(resource_request_ttl, Config, "infinity"), + LocalTopic = + case maps:find(local_topic, Config) of + {ok, LT} -> + lists:flatten(["local_topic = \"", LT, "\""]); + error -> + "" + end, + ConfigString = io_lib:format( + "bridges.~s.~s {\n" + " url = \"http://~s:~p~s\"\n" + " connect_timeout = \"~p\"\n" + " enable = true\n" + %% local_topic + " ~s\n" + " enable_pipelining = 100\n" + " max_retries = 2\n" + " method = \"post\"\n" + " pool_size = ~p\n" + " pool_type = \"random\"\n" + " request_timeout = \"~s\"\n" + " body = \"${id}\"\n" + " resource_opts {\n" + " inflight_window = 100\n" + " health_check_interval = \"~s\"\n" + " max_buffer_bytes = \"1GB\"\n" + " query_mode = \"~s\"\n" + " request_ttl = \"~p\"\n" + " resume_interval = \"~s\"\n" + " start_after_created = \"true\"\n" + " start_timeout = \"5s\"\n" + " worker_pool_size = \"1\"\n" + " }\n" + " ssl {\n" + " enable = false\n" + " }\n" + "}\n", + [ + Type, + Name, + Host, + Port, + Path, + ConnectTimeout, + LocalTopic, + PoolSize, + RequestTimeout, + HealthCheckInterval, + QueryMode, + ResourceRequestTTL, + ResumeInterval + ] + ), + ct:pal(ConfigString), + parse_and_check(ConfigString, Type, Name). + +parse_and_check(ConfigString, BridgeType, Name) -> + {ok, RawConf} = hocon:binary(ConfigString, #{format => map}), + hocon_tconf:check_plain(emqx_bridge_schema, RawConf, #{required => false, atom_key => false}), + #{<<"bridges">> := #{BridgeType := #{Name := RetConfig}}} = RawConf, + RetConfig. + +success_http_handler() -> + success_http_handler(#{response_delay => 0}). + +success_http_handler(Opts) -> + ResponseDelay = maps:get(response_delay, Opts, 0), + TestPid = self(), + fun(Req0, State) -> + {ok, Body, Req} = cowboy_req:read_body(Req0), + Headers = cowboy_req:headers(Req), + ct:pal("http request received: ~p", [ + #{body => Body, headers => Headers, response_delay => ResponseDelay} + ]), + ResponseDelay > 0 andalso timer:sleep(ResponseDelay), + TestPid ! {http, Headers, Body}, + Rep = cowboy_req:reply( + 200, + #{<<"content-type">> => <<"text/plain">>}, + <<"hello">>, + Req + ), + {ok, Rep, State} + end. + +init_http_success_server(Config) -> + HTTPPath = <<"/path">>, + ServerSSLOpts = false, + {ok, {HTTPPort, _Pid}} = emqx_bridge_http_connector_test_server:start_link( + _Port = random, HTTPPath, ServerSSLOpts + ), + ResponseDelayMS = 500, + ok = emqx_bridge_http_connector_test_server:set_handler( + success_http_handler(#{response_delay => ResponseDelayMS}) + ), + [ + {http_server, #{port => HTTPPort, path => HTTPPath}}, + {response_delay_ms, ResponseDelayMS}, + {bridge_name, ?BRIDGE_NAME} + | Config + ]. 
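%% Minimal usage sketch for the helpers above, from a test suite's point of
%% view (suite callback and bindings assumed for illustration):
%%
%%   init_per_testcase(_Case, Config0) ->
%%       Config = emqx_bridge_http_test_lib:init_http_success_server(Config0),
%%       #{port := Port, path := Path} = ?config(http_server, Config),
%%       _BridgeId = emqx_bridge_http_test_lib:make_bridge(#{port => Port, path => Path}),
%%       Config.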
diff --git a/apps/emqx_bridge_influxdb/src/emqx_bridge_influxdb.erl b/apps/emqx_bridge_influxdb/src/emqx_bridge_influxdb.erl index a62effe51..59d36cd5f 100644 --- a/apps/emqx_bridge_influxdb/src/emqx_bridge_influxdb.erl +++ b/apps/emqx_bridge_influxdb/src/emqx_bridge_influxdb.erl @@ -42,7 +42,7 @@ %% api write_syntax_type() -> - typerefl:alias("string", write_syntax()). + typerefl:alias("template", write_syntax()). %% Examples conn_bridge_examples(Method) -> diff --git a/apps/emqx_bridge_influxdb/src/emqx_bridge_influxdb_connector.erl b/apps/emqx_bridge_influxdb/src/emqx_bridge_influxdb_connector.erl index e17b9b3fe..94419c7d9 100644 --- a/apps/emqx_bridge_influxdb/src/emqx_bridge_influxdb_connector.erl +++ b/apps/emqx_bridge_influxdb/src/emqx_bridge_influxdb_connector.erl @@ -59,6 +59,9 @@ -define(DEFAULT_TIMESTAMP_TMPL, "${timestamp}"). +-define(set_tag, set_tag). +-define(set_field, set_field). + -define(IS_HTTP_ERROR(STATUS_CODE), (is_integer(STATUS_CODE) andalso (STATUS_CODE < 200 orelse STATUS_CODE >= 300)) @@ -710,8 +713,8 @@ line_to_point( precision := Precision } = Item ) -> - {_, EncodedTags} = maps:fold(fun maps_config_to_data/3, {Data, #{}}, Tags), - {_, EncodedFields} = maps:fold(fun maps_config_to_data/3, {Data, #{}}, Fields), + {_, EncodedTags, _} = maps:fold(fun maps_config_to_data/3, {Data, #{}, ?set_tag}, Tags), + {_, EncodedFields, _} = maps:fold(fun maps_config_to_data/3, {Data, #{}, ?set_field}, Fields), maps:without([precision], Item#{ measurement => emqx_placeholder:proc_tmpl(Measurement, Data), tags => EncodedTags, @@ -727,34 +730,43 @@ time_unit(ms) -> millisecond; time_unit(us) -> microsecond; time_unit(ns) -> nanosecond. -maps_config_to_data(K, V, {Data, Res}) -> +maps_config_to_data(K, V, {Data, Res, SetType}) -> KTransOptions = #{return => rawlist, var_trans => fun key_filter/1}, VTransOptions = #{return => rawlist, var_trans => fun data_filter/1}, NK = emqx_placeholder:proc_tmpl(K, Data, KTransOptions), NV = proc_quoted(V, Data, VTransOptions), case {NK, NV} of {[undefined], _} -> - {Data, Res}; + {Data, Res, SetType}; %% undefined value in normal format [undefined] or int/uint format [undefined, <<"i">>] {_, [undefined | _]} -> - {Data, Res}; + {Data, Res, SetType}; {_, {quoted, [undefined | _]}} -> - {Data, Res}; + {Data, Res, SetType}; _ -> - {Data, Res#{ - list_to_binary(NK) => value_type(NV, tmpl_type(V)) - }} + NRes = Res#{ + list_to_binary(NK) => value_type(NV, #{ + tmpl_type => tmpl_type(V), set_type => SetType + }) + }, + {Data, NRes, SetType} end. 
+value_type([Number], #{set_type := ?set_tag}) when is_number(Number) ->
+    %% all `tag` values are treated as strings
+    %% See also: https://docs.influxdata.com/influxdb/v2/reference/syntax/line-protocol/#tag-set
+    emqx_utils_conv:bin(Number);
+value_type([Str], #{set_type := ?set_tag}) when is_binary(Str) ->
+    Str;
 value_type({quoted, ValList}, _) ->
     {string_list, ValList};
-value_type([Int, <<"i">>], mixed) when is_integer(Int) ->
+value_type([Int, <<"i">>], #{tmpl_type := mixed}) when is_integer(Int) ->
     {int, Int};
-value_type([UInt, <<"u">>], mixed) when is_integer(UInt) ->
+value_type([UInt, <<"u">>], #{tmpl_type := mixed}) when is_integer(UInt) ->
     {uint, UInt};
 %% write `1`, `1.0`, `-1.0` all as float
 %% see also: https://docs.influxdata.com/influxdb/v2.7/reference/syntax/line-protocol/#float
-value_type([Number], _) when is_number(Number) ->
+value_type([Number], #{set_type := ?set_field}) when is_number(Number) ->
     {float, Number};
 value_type([<<"t">>], _) ->
     't';
@@ -776,9 +788,9 @@ value_type([<<"FALSE">>], _) ->
     'FALSE';
 value_type([<<"False">>], _) ->
     'False';
-value_type([Str], variable) when is_binary(Str) ->
+value_type([Str], #{tmpl_type := variable}) when is_binary(Str) ->
     Str;
-value_type([Str], literal) when is_binary(Str) ->
+value_type([Str], #{tmpl_type := literal, set_type := ?set_field}) when is_binary(Str) ->
     %% if Str is a literal string suffixed with `i` or `u`, we should convert it to int/uint.
     %% otherwise, we should convert it to float.
     NumStr = binary:part(Str, 0, byte_size(Str) - 1),
diff --git a/apps/emqx_bridge_influxdb/test/emqx_bridge_influxdb_SUITE.erl b/apps/emqx_bridge_influxdb/test/emqx_bridge_influxdb_SUITE.erl
index e30e8b361..3d50282ab 100644
--- a/apps/emqx_bridge_influxdb/test/emqx_bridge_influxdb_SUITE.erl
+++ b/apps/emqx_bridge_influxdb/test/emqx_bridge_influxdb_SUITE.erl
@@ -864,6 +864,53 @@ t_any_num_as_float(Config) ->
     TimeReturned = pad_zero(TimeReturned0),
     ?assertEqual(TsStr, TimeReturned).
 
+t_tag_set_use_literal_value(Config) ->
+    QueryMode = ?config(query_mode, Config),
+    Const = erlang:system_time(nanosecond),
+    ConstBin = integer_to_binary(Const),
+    TsStr = iolist_to_binary(
+        calendar:system_time_to_rfc3339(Const, [{unit, nanosecond}, {offset, "Z"}])
+    ),
+    ?assertMatch(
+        {ok, _},
+        create_bridge(
+            Config,
+            #{
+                <<"write_syntax">> =>
+                    <<"mqtt,clientid=${clientid},tag_key1=100,tag_key2=123.4,tag_key3=66i,tag_key4=${payload.float_dp}",
+                        " ",
+                        "field_key1=100.1,field_key2=100i,field_key3=${payload.float_dp},bar=5i",
+                        " ", ConstBin/binary>>
+            }
+        )
+    ),
+    ClientId = emqx_guid:to_hexstr(emqx_guid:gen()),
+    Payload = #{
+        %% with decimal point
+        float_dp => 123.4
+    },
+    SentData = #{
+        <<"clientid">> => ClientId,
+        <<"topic">> => atom_to_binary(?FUNCTION_NAME),
+        <<"payload">> => Payload,
+        <<"timestamp">> => erlang:system_time(millisecond)
+    },
+    case QueryMode of
+        sync ->
+            ?assertMatch({ok, 204, _}, send_message(Config, SentData)),
+            ok;
+        async ->
+            ?assertMatch(ok, send_message(Config, SentData))
+    end,
+    %% sleep is still needed even in sync mode, or we would get an empty result sometimes
+    ct:sleep(1500),
+    PersistedData = query_by_clientid(ClientId, Config),
+    Expected = #{field_key1 => <<"100.1">>, field_key2 => <<"100">>, field_key3 => <<"123.4">>},
+    assert_persisted_data(ClientId, Expected, PersistedData),
+    TimeReturned0 = maps:get(<<"_time">>, maps:get(<<"field_key1">>, PersistedData)),
+    TimeReturned = pad_zero(TimeReturned0),
+    ?assertEqual(TsStr, TimeReturned).
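%% Sketch of the tag/field split above: the same numeric value is coerced to a
%% string when it lands in the tag set, but keeps a numeric line-protocol type
%% in the field set (option maps shaped as maps_config_to_data/3 builds them):
%%
%%   value_type([100], #{tmpl_type => literal, set_type => set_tag}).
%%   %% => <<"100">>
%%   value_type([100], #{tmpl_type => literal, set_type => set_field}).
%%   %% => {float, 100}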
+ t_bad_timestamp(Config) -> InfluxDBType = ?config(influxdb_type, Config), InfluxDBName = ?config(influxdb_name, Config), diff --git a/apps/emqx_bridge_iotdb/include/emqx_bridge_iotdb.hrl b/apps/emqx_bridge_iotdb/include/emqx_bridge_iotdb.hrl index 8ce7bce6d..6cf0c5508 100644 --- a/apps/emqx_bridge_iotdb/include/emqx_bridge_iotdb.hrl +++ b/apps/emqx_bridge_iotdb/include/emqx_bridge_iotdb.hrl @@ -5,6 +5,8 @@ -ifndef(EMQX_BRIDGE_IOTDB_HRL). -define(EMQX_BRIDGE_IOTDB_HRL, true). +-define(VSN_1_3_X, 'v1.3.x'). +-define(VSN_1_2_X, 'v1.2.x'). -define(VSN_1_1_X, 'v1.1.x'). -define(VSN_1_0_X, 'v1.0.x'). -define(VSN_0_13_X, 'v0.13.x'). diff --git a/apps/emqx_bridge_iotdb/src/emqx_bridge_iotdb.erl b/apps/emqx_bridge_iotdb/src/emqx_bridge_iotdb.erl index 134868978..19c1b6320 100644 --- a/apps/emqx_bridge_iotdb/src/emqx_bridge_iotdb.erl +++ b/apps/emqx_bridge_iotdb/src/emqx_bridge_iotdb.erl @@ -66,12 +66,7 @@ fields(action_config) -> ] ); fields(action_resource_opts) -> - lists:filter( - fun({K, _V}) -> - not lists:member(K, unsupported_opts()) - end, - emqx_bridge_v2_schema:action_resource_opts_fields() - ); + emqx_bridge_v2_schema:action_resource_opts_fields(); fields(action_parameters) -> [ {is_aligned, @@ -84,7 +79,7 @@ fields(action_parameters) -> )}, {device_id, mk( - binary(), + emqx_schema:template(), #{ desc => ?DESC("config_device_id") } @@ -114,7 +109,7 @@ fields(action_parameters_data) -> )}, {measurement, mk( - binary(), + emqx_schema:template(), #{ required => true, desc => ?DESC("config_parameters_measurement") @@ -122,7 +117,9 @@ fields(action_parameters_data) -> )}, {data_type, mk( - hoconsc:union([enum([text, boolean, int32, int64, float, double]), binary()]), + hoconsc:union([ + enum([text, boolean, int32, int64, float, double]), emqx_schema:template() + ]), #{ required => true, desc => ?DESC("config_parameters_data_type") @@ -130,7 +127,7 @@ fields(action_parameters_data) -> )}, {value, mk( - binary(), + emqx_schema:template(), #{ required => true, desc => ?DESC("config_parameters_value") @@ -150,7 +147,7 @@ fields("get_bridge_v2") -> fields("config") -> basic_config() ++ request_config(); fields("creation_opts") -> - proplists_without(unsupported_opts(), emqx_resource_schema:fields("creation_opts")); + emqx_resource_schema:fields("creation_opts"); fields(auth_basic) -> [ {username, mk(binary(), #{required => true, desc => ?DESC("config_auth_basic_username")})}, @@ -220,10 +217,10 @@ basic_config() -> )}, {iotdb_version, mk( - hoconsc:enum([?VSN_1_1_X, ?VSN_1_0_X, ?VSN_0_13_X]), + hoconsc:enum([?VSN_1_3_X, ?VSN_1_1_X, ?VSN_1_0_X, ?VSN_0_13_X]), #{ desc => ?DESC("config_iotdb_version"), - default => ?VSN_1_1_X + default => ?VSN_1_3_X } )} ] ++ resource_creation_opts() ++ @@ -268,12 +265,6 @@ resource_creation_opts() -> )} ]. -unsupported_opts() -> - [ - batch_size, - batch_time - ]. 
-
 %%-------------------------------------------------------------------------------------------------
 %% v2 examples
 %%-------------------------------------------------------------------------------------------------
diff --git a/apps/emqx_bridge_iotdb/src/emqx_bridge_iotdb_connector.erl b/apps/emqx_bridge_iotdb/src/emqx_bridge_iotdb_connector.erl
index f68ed02e3..d26b47f73 100644
--- a/apps/emqx_bridge_iotdb/src/emqx_bridge_iotdb_connector.erl
+++ b/apps/emqx_bridge_iotdb/src/emqx_bridge_iotdb_connector.erl
@@ -21,6 +21,8 @@
     on_get_status/2,
     on_query/3,
     on_query_async/4,
+    on_batch_query/3,
+    on_batch_query_async/4,
     on_add_channel/4,
     on_remove_channel/3,
     on_get_channels/1,
@@ -94,7 +96,7 @@ connector_example_values() ->
         name => <<"iotdb_connector">>,
         type => iotdb,
         enable => true,
-        iotdb_version => ?VSN_1_1_X,
+        iotdb_version => ?VSN_1_3_X,
         authentication => #{
             <<"username">> => <<"root">>,
             <<"password">> => <<"******">>
@@ -133,10 +135,10 @@ fields("connection_fields") ->
         )},
         {iotdb_version,
             mk(
-                hoconsc:enum([?VSN_1_1_X, ?VSN_1_0_X, ?VSN_0_13_X]),
+                hoconsc:enum([?VSN_1_3_X, ?VSN_1_1_X, ?VSN_1_0_X, ?VSN_0_13_X]),
                 #{
                     desc => ?DESC(emqx_bridge_iotdb, "config_iotdb_version"),
-                    default => ?VSN_1_1_X
+                    default => ?VSN_1_3_X
                 }
             )},
         {authentication,
@@ -280,8 +282,8 @@ on_query(
         state => emqx_utils:redact(State)
     }),
 
-    case try_render_message(Req, IoTDBVsn, Channels) of
-        {ok, IoTDBPayload} ->
+    case try_render_messages([Req], IoTDBVsn, Channels) of
+        {ok, [IoTDBPayload]} ->
             handle_response(
                 emqx_bridge_http_connector:on_query(
                     InstanceId, {ChannelId, IoTDBPayload}, State
@@ -306,8 +308,8 @@ on_query_async(
         send_message => Req,
         state => emqx_utils:redact(State)
     }),
-    case try_render_message(Req, IoTDBVsn, Channels) of
-        {ok, IoTDBPayload} ->
+    case try_render_messages([Req], IoTDBVsn, Channels) of
+        {ok, [IoTDBPayload]} ->
             ReplyFunAndArgs =
                 {
                     fun(Result) ->
@@ -323,6 +325,71 @@ on_query_async(
             Error
     end.
 
+on_batch_query_async(
+    InstId,
+    Requests,
+    Callback,
+    #{iotdb_version := IoTDBVsn, channels := Channels} = State
+) ->
+    ?tp(iotdb_bridge_on_batch_query_async, #{instance_id => InstId}),
+    [{ChannelId, _Message} | _] = Requests,
+    ?SLOG(debug, #{
+        msg => "iotdb_bridge_on_batch_query_async_called",
+        instance_id => InstId,
+        send_message => Requests,
+        state => emqx_utils:redact(State)
+    }),
+    case try_render_messages(Requests, IoTDBVsn, Channels) of
+        {ok, IoTDBPayloads} ->
+            ReplyFunAndArgs =
+                {
+                    fun(Result) ->
+                        Response = handle_response(Result),
+                        emqx_resource:apply_reply_fun(Callback, Response)
+                    end,
+                    []
+                },
+            lists:map(
+                fun(IoTDBPayload) ->
+                    emqx_bridge_http_connector:on_query_async(
+                        InstId, {ChannelId, IoTDBPayload}, ReplyFunAndArgs, State
+                    )
+                end,
+                IoTDBPayloads
+            );
+        Error ->
+            Error
+    end.
+
+on_batch_query(
+    InstId,
+    [{ChannelId, _Message} | _] = Requests,
+    #{iotdb_version := IoTDBVsn, channels := Channels} = State
+) ->
+    ?tp(iotdb_bridge_on_batch_query, #{instance_id => InstId}),
+    ?SLOG(debug, #{
+        msg => "iotdb_bridge_on_batch_query_called",
+        instance_id => InstId,
+        send_message => Requests,
+        state => emqx_utils:redact(State)
+    }),
+
+    case try_render_messages(Requests, IoTDBVsn, Channels) of
+        {ok, IoTDBPayloads} ->
+            lists:map(
+                fun(IoTDBPayload) ->
+                    handle_response(
+                        emqx_bridge_http_connector:on_query(
+                            InstId, {ChannelId, IoTDBPayload}, State
+                        )
+                    )
+                end,
+                IoTDBPayloads
+            );
+        Error ->
+            Error
+    end.
+
 on_add_channel(
     InstanceId,
     #{iotdb_version := Version, channels := Channels} = OldState0,
@@ -342,6 +409,7 @@ on_add_channel(
     Path =
         case Version of
             ?VSN_1_1_X -> InsertTabletPathV2;
+            ?VSN_1_3_X -> InsertTabletPathV2;
             _ -> InsertTabletPathV1
         end,
@@ -442,14 +510,14 @@ maybe_preproc_tmpl(Value) when is_binary(Value) ->
 maybe_preproc_tmpl(Value) ->
     Value.
 
-proc_data(PreProcessedData, Msg) ->
+proc_data(PreProcessedData, Msg, IoTDBVsn) ->
     NowNS = erlang:system_time(nanosecond),
     Nows = #{
         now_ms => erlang:convert_time_unit(NowNS, nanosecond, millisecond),
         now_us => erlang:convert_time_unit(NowNS, nanosecond, microsecond),
         now_ns => NowNS
     },
-    proc_data(PreProcessedData, Msg, Nows, []).
+    proc_data(PreProcessedData, Msg, Nows, IoTDBVsn, []).
 
 proc_data(
     [
@@ -463,15 +531,16 @@ proc_data(
     ],
     Msg,
     Nows,
+    IotDbVsn,
     Acc
 ) ->
     DataType = list_to_binary(
         string:uppercase(binary_to_list(emqx_placeholder:proc_tmpl(DataType0, Msg)))
     ),
     try
-        proc_data(T, Msg, Nows, [
+        proc_data(T, Msg, Nows, IotDbVsn, [
             #{
-                timestamp => iot_timestamp(TimestampTkn, Msg, Nows),
+                timestamp => iot_timestamp(IotDbVsn, TimestampTkn, Msg, Nows),
                 measurement => emqx_placeholder:proc_tmpl(Measurement, Msg),
                 data_type => DataType,
                 value => proc_value(DataType, ValueTkn, Msg)
@@ -485,23 +554,28 @@ proc_data(
             ?SLOG(debug, #{exception => Error, reason => Reason, stacktrace => Stacktrace}),
             {error, invalid_data}
     end;
-proc_data([], _Msg, _Nows, Acc) ->
+proc_data([], _Msg, _Nows, _IotDbVsn, Acc) ->
     {ok, lists:reverse(Acc)}.
 
-iot_timestamp(Timestamp, _, _) when is_integer(Timestamp) ->
+iot_timestamp(_IotDbVsn, Timestamp, _, _) when is_integer(Timestamp) ->
     Timestamp;
-iot_timestamp(TimestampTkn, Msg, Nows) ->
-    iot_timestamp(emqx_placeholder:proc_tmpl(TimestampTkn, Msg), Nows).
+iot_timestamp(IotDbVsn, TimestampTkn, Msg, Nows) ->
    iot_timestamp(IotDbVsn, emqx_placeholder:proc_tmpl(TimestampTkn, Msg), Nows).
 
-iot_timestamp(<<"now_us">>, #{now_us := NowUs}) ->
+%% IoTDB v1.3.0 and later don't allow writing nanosecond or microsecond timestamps
+iot_timestamp(?VSN_1_3_X, <<"now_us">>, #{now_ms := NowMs}) ->
+    NowMs;
+iot_timestamp(?VSN_1_3_X, <<"now_ns">>, #{now_ms := NowMs}) ->
+    NowMs;
+iot_timestamp(_IotDbVsn, <<"now_us">>, #{now_us := NowUs}) ->
     NowUs;
-iot_timestamp(<<"now_ns">>, #{now_ns := NowNs}) ->
+iot_timestamp(_IotDbVsn, <<"now_ns">>, #{now_ns := NowNs}) ->
     NowNs;
-iot_timestamp(Timestamp, #{now_ms := NowMs}) when
+iot_timestamp(_IotDbVsn, Timestamp, #{now_ms := NowMs}) when
     Timestamp =:= <<"now">>; Timestamp =:= <<"now_ms">>; Timestamp =:= <<>>
 ->
     NowMs;
-iot_timestamp(Timestamp, _) when is_binary(Timestamp) ->
+iot_timestamp(_IotDbVsn, Timestamp, _) when is_binary(Timestamp) ->
     binary_to_integer(Timestamp).
 
 proc_value(<<"TEXT">>, ValueTkn, Msg) ->
@@ -526,6 +600,7 @@ replace_var(Val, _Data) ->
 
 convert_bool(B) when is_boolean(B) -> B;
 convert_bool(null) -> null;
+convert_bool(undefined) -> null;
 convert_bool(1) -> true;
 convert_bool(0) -> false;
 convert_bool(<<"1">>) -> true;
@@ -568,11 +643,10 @@ convert_float(undefined) ->
 make_iotdb_insert_request(DataList, IsAligned, DeviceId, IoTDBVsn) ->
     InitAcc = #{timestamps => [], measurements => [], dtypes => [], values => []},
     Rows = replace_dtypes(aggregate_rows(DataList, InitAcc), IoTDBVsn),
-    {ok,
-        maps:merge(Rows, #{
-            iotdb_field_key(is_aligned, IoTDBVsn) => IsAligned,
-            iotdb_field_key(device_id, IoTDBVsn) => DeviceId
-        })}.
+    maps:merge(Rows, #{
+        iotdb_field_key(is_aligned, IoTDBVsn) => IsAligned,
+        iotdb_field_key(device_id, IoTDBVsn) => DeviceId
+    }).
replace_dtypes(Rows0, IoTDBVsn) -> {Types, Rows} = maps:take(dtypes, Rows0), @@ -632,18 +706,24 @@ insert_value(1, Data, [Value | Values]) -> insert_value(Index, Data, [Value | Values]) -> [[null | Value] | insert_value(Index - 1, Data, Values)]. +iotdb_field_key(is_aligned, ?VSN_1_3_X) -> + <<"is_aligned">>; iotdb_field_key(is_aligned, ?VSN_1_1_X) -> <<"is_aligned">>; iotdb_field_key(is_aligned, ?VSN_1_0_X) -> <<"is_aligned">>; iotdb_field_key(is_aligned, ?VSN_0_13_X) -> <<"isAligned">>; +iotdb_field_key(device_id, ?VSN_1_3_X) -> + <<"device">>; iotdb_field_key(device_id, ?VSN_1_1_X) -> <<"device">>; iotdb_field_key(device_id, ?VSN_1_0_X) -> <<"device">>; iotdb_field_key(device_id, ?VSN_0_13_X) -> <<"deviceId">>; +iotdb_field_key(data_types, ?VSN_1_3_X) -> + <<"data_types">>; iotdb_field_key(data_types, ?VSN_1_1_X) -> <<"data_types">>; iotdb_field_key(data_types, ?VSN_1_0_X) -> @@ -706,14 +786,37 @@ preproc_data_template(DataList) -> DataList ). -try_render_message({ChannelId, Msg}, IoTDBVsn, Channels) -> +try_render_messages([{ChannelId, _} | _] = Msgs, IoTDBVsn, Channels) -> case maps:find(ChannelId, Channels) of {ok, Channel} -> - render_channel_message(Channel, IoTDBVsn, Msg); + case do_render_message(Msgs, Channel, IoTDBVsn, #{}) of + RenderMsgs when is_map(RenderMsgs) -> + {ok, + lists:map( + fun({{DeviceId, IsAligned}, DataList}) -> + make_iotdb_insert_request(DataList, IsAligned, DeviceId, IoTDBVsn) + end, + maps:to_list(RenderMsgs) + )}; + Error -> + Error + end; _ -> {error, {unrecoverable_error, {invalid_channel_id, ChannelId}}} end. +do_render_message([], _Channel, _IoTDBVsn, Acc) -> + Acc; +do_render_message([{_, Msg} | Msgs], Channel, IoTDBVsn, Acc) -> + case render_channel_message(Channel, IoTDBVsn, Msg) of + {ok, NewDataList, DeviceId, IsAligned} -> + Fun = fun(V) -> NewDataList ++ V end, + Acc1 = maps:update_with({DeviceId, IsAligned}, Fun, NewDataList, Acc), + do_render_message(Msgs, Channel, IoTDBVsn, Acc1); + Error -> + Error + end. + render_channel_message(#{is_aligned := IsAligned} = Channel, IoTDBVsn, Message) -> Payloads = to_list(parse_payload(get_payload(Message))), case device_id(Message, Payloads, Channel) of @@ -724,9 +827,9 @@ render_channel_message(#{is_aligned := IsAligned} = Channel, IoTDBVsn, Message) [] -> {error, invalid_template}; DataTemplate -> - case proc_data(DataTemplate, Message) of + case proc_data(DataTemplate, Message, IoTDBVsn) of {ok, DataList} -> - make_iotdb_insert_request(DataList, IsAligned, DeviceId, IoTDBVsn); + {ok, DataList, DeviceId, IsAligned}; Error -> Error end diff --git a/apps/emqx_bridge_iotdb/test/emqx_bridge_iotdb_impl_SUITE.erl b/apps/emqx_bridge_iotdb/test/emqx_bridge_iotdb_impl_SUITE.erl index 693f16d05..d5661e2fe 100644 --- a/apps/emqx_bridge_iotdb/test/emqx_bridge_iotdb_impl_SUITE.erl +++ b/apps/emqx_bridge_iotdb/test/emqx_bridge_iotdb_impl_SUITE.erl @@ -20,14 +20,16 @@ all() -> [ - {group, plain}, + {group, iotdb110}, + {group, iotdb130}, {group, legacy} ]. groups() -> AllTCs = emqx_common_test_helpers:all(?MODULE), [ - {plain, AllTCs}, + {iotdb110, AllTCs}, + {iotdb130, AllTCs}, {legacy, AllTCs} ]. @@ -37,10 +39,15 @@ init_per_suite(Config) -> end_per_suite(Config) -> emqx_bridge_v2_testlib:end_per_suite(Config). 
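%% Sketch of the grouping performed by do_render_message/4 above: rendered rows
%% accumulate per {DeviceId, IsAligned} pair, so one batch of messages fans out
%% into one insert-tablet request per device (device id and row terms assumed
%% for illustration):
%%
%%   Acc0 = #{},
%%   Acc1 = maps:update_with({<<"root.sg.d1">>, true}, fun(V) -> [row1] ++ V end, [row1], Acc0),
%%   maps:update_with({<<"root.sg.d1">>, true}, fun(V) -> [row2] ++ V end, [row2], Acc1).
%%   %% => #{{<<"root.sg.d1">>, true} => [row2, row1]}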
-init_per_group(plain = Type, Config0) ->
+init_per_group(Type, Config0) when Type =:= iotdb110 orelse Type =:= iotdb130 ->
     Host = os:getenv("IOTDB_PLAIN_HOST", "toxiproxy.emqx.net"),
-    Port = list_to_integer(os:getenv("IOTDB_PLAIN_PORT", "18080")),
-    ProxyName = "iotdb",
+    ProxyName = atom_to_list(Type),
+    {IotDbVersion, DefaultPort} =
+        case Type of
+            iotdb110 -> {?VSN_1_1_X, "18080"};
+            iotdb130 -> {?VSN_1_3_X, "28080"}
+        end,
+    Port = list_to_integer(os:getenv("IOTDB_PLAIN_PORT", DefaultPort)),
     case emqx_common_test_helpers:is_tcp_server_available(Host, Port) of
         true ->
             Config = emqx_bridge_v2_testlib:init_per_group(Type, ?BRIDGE_TYPE_BIN, Config0),
@@ -48,7 +55,7 @@ init_per_group(plain = Type, Config0) ->
                 {bridge_host, Host},
                 {bridge_port, Port},
                 {proxy_name, ProxyName},
-                {iotdb_version, ?VSN_1_1_X},
+                {iotdb_version, IotDbVersion},
                 {iotdb_rest_prefix, <<"/rest/v2/">>}
                 | Config
             ];
@@ -87,7 +94,8 @@ init_per_group(_Group, Config) ->
     Config.
 
 end_per_group(Group, Config) when
-    Group =:= plain;
+    Group =:= iotdb110;
+    Group =:= iotdb130;
     Group =:= legacy
 ->
     emqx_bridge_v2_testlib:end_per_group(Config),
@@ -245,7 +253,9 @@ iotdb_query(Config, Query) ->
     iotdb_request(Config, Path, Body, Opts).
 
 is_success_check({ok, 200, _, Body}) ->
-    ?assert(is_code(200, emqx_utils_json:decode(Body))).
+    ?assert(is_code(200, emqx_utils_json:decode(Body)));
+is_success_check(Other) ->
+    throw(Other).
 
 is_code(Code, #{<<"code">> := Code}) -> true;
 is_code(_, _) -> false.
@@ -359,89 +369,96 @@ t_async_query(Config) ->
 
 t_sync_query_aggregated(Config) ->
     DeviceId = iotdb_device(Config),
+    MS = erlang:system_time(millisecond) - 5000,
     Payload = [
-        make_iotdb_payload(DeviceId, "temp", "INT32", "36", 1685112026290),
-        make_iotdb_payload(DeviceId, "temp", "INT32", 37, 1685112026291),
-        make_iotdb_payload(DeviceId, "temp", "INT32", 38.7, 1685112026292),
-        make_iotdb_payload(DeviceId, "temp", "INT32", "39", <<"1685112026293">>),
-        make_iotdb_payload(DeviceId, "temp", "INT64", "36", 1685112026294),
-        make_iotdb_payload(DeviceId, "temp", "INT64", 36, 1685112026295),
-        make_iotdb_payload(DeviceId, "temp", "INT64", 36.7, 1685112026296),
-        %% implicit 'now()' timestamp
-        make_iotdb_payload(DeviceId, "temp", "INT32", "40"),
+        make_iotdb_payload(DeviceId, "temp", "INT32", "36", MS - 7000),
+        make_iotdb_payload(DeviceId, "temp", "INT32", 37, MS - 6000),
+        make_iotdb_payload(DeviceId, "temp", "INT64", 38.7, MS - 5000),
+        make_iotdb_payload(DeviceId, "temp", "INT64", "39", integer_to_binary(MS - 4000)),
+        make_iotdb_payload(DeviceId, "temp", "INT64", "34", MS - 3000),
+        make_iotdb_payload(DeviceId, "temp", "INT32", 33.7, MS - 2000),
+        make_iotdb_payload(DeviceId, "temp", "INT32", 32, MS - 1000),
        %% [FIXME] neither nanoseconds nor microseconds seem to be supported by IoTDB
         (make_iotdb_payload(DeviceId, "temp", "INT32", "41"))#{timestamp => <<"now_us">>},
-        (make_iotdb_payload(DeviceId, "temp", "INT32", "42"))#{timestamp => <<"now_ns">>},
 
-        make_iotdb_payload(DeviceId, "weight", "FLOAT", "87.3", 1685112026290),
-        make_iotdb_payload(DeviceId, "weight", "FLOAT", 87.3, 1685112026291),
-        make_iotdb_payload(DeviceId, "weight", "FLOAT", 87, 1685112026292),
-        make_iotdb_payload(DeviceId, "weight", "DOUBLE", "87.3", 1685112026293),
-        make_iotdb_payload(DeviceId, "weight", "DOUBLE", 87.3, 1685112026294),
-        make_iotdb_payload(DeviceId, "weight", "DOUBLE", 87, 1685112026295),
+        make_iotdb_payload(DeviceId, "weight", "FLOAT", "87.3", MS - 6000),
+        make_iotdb_payload(DeviceId, "weight", "FLOAT", 87.3, MS - 5000),
+
make_iotdb_payload(DeviceId, "weight", "FLOAT", 87, MS - 4000), + make_iotdb_payload(DeviceId, "weight", "DOUBLE", "87.3", MS - 3000), + make_iotdb_payload(DeviceId, "weight", "DOUBLE", 87.3, MS - 2000), + make_iotdb_payload(DeviceId, "weight", "DOUBLE", 87, MS - 1000), - make_iotdb_payload(DeviceId, "charged", "BOOLEAN", "1", 1685112026300), - make_iotdb_payload(DeviceId, "floated", "BOOLEAN", 1, 1685112026300), - make_iotdb_payload(DeviceId, "started", "BOOLEAN", true, 1685112026300), - make_iotdb_payload(DeviceId, "stoked", "BOOLEAN", "true", 1685112026300), - make_iotdb_payload(DeviceId, "enriched", "BOOLEAN", "TRUE", 1685112026300), - make_iotdb_payload(DeviceId, "gutted", "BOOLEAN", "True", 1685112026300), - make_iotdb_payload(DeviceId, "drained", "BOOLEAN", "0", 1685112026300), - make_iotdb_payload(DeviceId, "toasted", "BOOLEAN", 0, 1685112026300), - make_iotdb_payload(DeviceId, "uncharted", "BOOLEAN", false, 1685112026300), - make_iotdb_payload(DeviceId, "dazzled", "BOOLEAN", "false", 1685112026300), - make_iotdb_payload(DeviceId, "unplugged", "BOOLEAN", "FALSE", 1685112026300), - make_iotdb_payload(DeviceId, "unraveled", "BOOLEAN", "False", 1685112026300), - make_iotdb_payload(DeviceId, "undecided", "BOOLEAN", null, 1685112026300), + make_iotdb_payload(DeviceId, "charged", "BOOLEAN", "1", MS + 1000), + make_iotdb_payload(DeviceId, "floated", "BOOLEAN", 1, MS + 1000), + make_iotdb_payload(DeviceId, "started", "BOOLEAN", true, MS + 1000), + make_iotdb_payload(DeviceId, "stoked", "BOOLEAN", "true", MS + 1000), + make_iotdb_payload(DeviceId, "enriched", "BOOLEAN", "TRUE", MS + 1000), + make_iotdb_payload(DeviceId, "gutted", "BOOLEAN", "True", MS + 1000), + make_iotdb_payload(DeviceId, "drained", "BOOLEAN", "0", MS + 1000), + make_iotdb_payload(DeviceId, "toasted", "BOOLEAN", 0, MS + 1000), + make_iotdb_payload(DeviceId, "uncharted", "BOOLEAN", false, MS + 1000), + make_iotdb_payload(DeviceId, "dazzled", "BOOLEAN", "false", MS + 1000), + make_iotdb_payload(DeviceId, "unplugged", "BOOLEAN", "FALSE", MS + 1000), + make_iotdb_payload(DeviceId, "unraveled", "BOOLEAN", "False", MS + 1000), + make_iotdb_payload(DeviceId, "undecided", "BOOLEAN", null, MS + 1000), - make_iotdb_payload(DeviceId, "foo", "TEXT", "bar", 1685112026300) + make_iotdb_payload(DeviceId, "foo", "TEXT", "bar", MS + 1000) ], MakeMessageFun = make_message_fun(iotdb_topic(Config), Payload), ok = emqx_bridge_v2_testlib:t_sync_query( Config, MakeMessageFun, fun is_success_check/1, iotdb_bridge_on_query ), - %% check temp - QueryTemp = <<"select temp from ", DeviceId/binary>>, - {ok, {{_, 200, _}, _, ResultTemp}} = iotdb_query(Config, QueryTemp), - ?assertMatch( - #{<<"values">> := [[36, 37, 38, 39, 36, 36, 36, 40, 41, 42]]}, - emqx_utils_json:decode(ResultTemp) - ), + Time = integer_to_binary(MS - 20000), %% check weight - QueryWeight = <<"select weight from ", DeviceId/binary>>, + QueryWeight = <<"select weight from ", DeviceId/binary, " where time > ", Time/binary>>, {ok, {{_, 200, _}, _, ResultWeight}} = iotdb_query(Config, QueryWeight), ?assertMatch( #{<<"values">> := [[87.3, 87.3, 87.0, 87.3, 87.3, 87.0]]}, emqx_utils_json:decode(ResultWeight) ), - %% check rest ts = 1685112026300 - QueryRest = <<"select * from ", DeviceId/binary, " where time = 1685112026300">>, - {ok, {{_, 200, _}, _, ResultRest}} = iotdb_query(Config, QueryRest), - #{<<"values">> := Values, <<"expressions">> := Expressions} = emqx_utils_json:decode( - ResultRest - ), - Results = maps:from_list(lists:zipwith(fun(K, [V]) -> {K, V} end, Expressions, 
Values)), - Exp = #{ - exp(DeviceId, "charged") => true, - exp(DeviceId, "floated") => true, - exp(DeviceId, "started") => true, - exp(DeviceId, "stoked") => true, - exp(DeviceId, "enriched") => true, - exp(DeviceId, "gutted") => true, - exp(DeviceId, "drained") => false, - exp(DeviceId, "toasted") => false, - exp(DeviceId, "uncharted") => false, - exp(DeviceId, "dazzled") => false, - exp(DeviceId, "unplugged") => false, - exp(DeviceId, "unraveled") => false, - exp(DeviceId, "undecided") => null, - exp(DeviceId, "foo") => <<"bar">>, - exp(DeviceId, "temp") => null, - exp(DeviceId, "weight") => null - }, - ?assertEqual(Exp, Results), + %% [FIXME] https://github.com/apache/iotdb/issues/12375 + %% null values don't seem to be supported by IoTDB insertTablet as of 1.3.0 + case ?config(iotdb_version, Config) of + ?VSN_1_3_X -> + skip; + _ -> + %% check rest ts = MS + 1000 + CheckTime = integer_to_binary(MS + 1000), + QueryRest = <<"select * from ", DeviceId/binary, " where time = ", CheckTime/binary>>, + {ok, {{_, 200, _}, _, ResultRest}} = iotdb_query(Config, QueryRest), + #{<<"values">> := Values, <<"expressions">> := Expressions} = emqx_utils_json:decode( + ResultRest + ), + Results = maps:from_list(lists:zipwith(fun(K, [V]) -> {K, V} end, Expressions, Values)), + Exp = #{ + exp(DeviceId, "charged") => true, + exp(DeviceId, "floated") => true, + exp(DeviceId, "started") => true, + exp(DeviceId, "stoked") => true, + exp(DeviceId, "enriched") => true, + exp(DeviceId, "gutted") => true, + exp(DeviceId, "drained") => false, + exp(DeviceId, "toasted") => false, + exp(DeviceId, "uncharted") => false, + exp(DeviceId, "dazzled") => false, + exp(DeviceId, "unplugged") => false, + exp(DeviceId, "unraveled") => false, + exp(DeviceId, "undecided") => null, + exp(DeviceId, "foo") => <<"bar">>, + exp(DeviceId, "temp") => null, + exp(DeviceId, "weight") => null + }, + ?assertEqual(Exp, Results), + %% check temp + QueryTemp = <<"select temp from ", DeviceId/binary, " where time > ", Time/binary>>, + {ok, {{_, 200, _}, _, ResultTemp}} = iotdb_query(Config, QueryTemp), + ?assertMatch( + #{<<"values">> := [[36, 37, 38, 39, 34, 33, 32, 41]]}, + emqx_utils_json:decode(ResultTemp) + ) + end, ok. 
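%% A sketch of the timestamp-window pattern used above: anchoring payload
%% timestamps to the current clock lets the `where time > ...' queries select
%% exactly the rows written by this run. The helper name `window_query' is
%% hypothetical, not part of this suite:
%%
%%   window_query(DeviceId, Field, WindowMs) ->
%%       Since = integer_to_binary(erlang:system_time(millisecond) - WindowMs),
%%       <<"select ", Field/binary, " from ", DeviceId/binary,
%%           " where time > ", Since/binary>>.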
exp(Dev, M0) -> diff --git a/apps/emqx_bridge_kafka/src/emqx_bridge_kafka.erl b/apps/emqx_bridge_kafka/src/emqx_bridge_kafka.erl index ff9d19c0d..83bc33266 100644 --- a/apps/emqx_bridge_kafka/src/emqx_bridge_kafka.erl +++ b/apps/emqx_bridge_kafka/src/emqx_bridge_kafka.erl @@ -389,7 +389,7 @@ fields(producer_kafka_opts) -> )}, {kafka_headers, mk( - binary(), + emqx_schema:template(), #{ required => false, validator => fun kafka_header_validator/1, @@ -462,12 +462,12 @@ fields(producer_kafka_ext_headers) -> [ {kafka_ext_header_key, mk( - binary(), + emqx_schema:template(), #{required => true, desc => ?DESC(producer_kafka_ext_header_key)} )}, {kafka_ext_header_value, mk( - binary(), + emqx_schema:template(), #{ required => true, validator => fun kafka_ext_header_value_validator/1, @@ -477,11 +477,20 @@ fields(producer_kafka_ext_headers) -> ]; fields(kafka_message) -> [ - {key, mk(string(), #{default => <<"${.clientid}">>, desc => ?DESC(kafka_message_key)})}, - {value, mk(string(), #{default => <<"${.}">>, desc => ?DESC(kafka_message_value)})}, + {key, + mk(emqx_schema:template(), #{ + default => <<"${.clientid}">>, + desc => ?DESC(kafka_message_key) + })}, + {value, + mk(emqx_schema:template(), #{ + default => <<"${.}">>, + desc => ?DESC(kafka_message_value) + })}, {timestamp, - mk(string(), #{ - default => <<"${.timestamp}">>, desc => ?DESC(kafka_message_timestamp) + mk(emqx_schema:template(), #{ + default => <<"${.timestamp}">>, + desc => ?DESC(kafka_message_timestamp) })} ]; fields(producer_buffer) -> @@ -536,8 +545,11 @@ fields(consumer_topic_mapping) -> {qos, mk(emqx_schema:qos(), #{default => 0, desc => ?DESC(consumer_mqtt_qos)})}, {payload_template, mk( - string(), - #{default => <<"${.}">>, desc => ?DESC(consumer_mqtt_payload)} + emqx_schema:template(), + #{ + default => <<"${.}">>, + desc => ?DESC(consumer_mqtt_payload) + } )} ]; fields(consumer_kafka_opts) -> @@ -744,8 +756,8 @@ producer_strategy_key_validator( producer_strategy_key_validator(emqx_utils_maps:binary_key_map(Conf)); producer_strategy_key_validator(#{ <<"partition_strategy">> := key_dispatch, - <<"message">> := #{<<"key">> := ""} -}) -> + <<"message">> := #{<<"key">> := Key} +}) when Key =:= "" orelse Key =:= <<>> -> {error, "Message key cannot be empty when `key_dispatch` strategy is used"}; producer_strategy_key_validator(_) -> ok. diff --git a/apps/emqx_bridge_kafka/test/emqx_bridge_kafka_tests.erl b/apps/emqx_bridge_kafka/test/emqx_bridge_kafka_tests.erl index 2f20099ae..54f6f9efc 100644 --- a/apps/emqx_bridge_kafka/test/emqx_bridge_kafka_tests.erl +++ b/apps/emqx_bridge_kafka/test/emqx_bridge_kafka_tests.erl @@ -357,7 +357,7 @@ kafka_consumer_hocon() -> %% assert compatibility bridge_schema_json_test() -> - JSON = iolist_to_binary(emqx_conf:bridge_schema_json()), + JSON = iolist_to_binary(emqx_dashboard_schema_api:bridge_schema_json()), Map = emqx_utils_json:decode(JSON), Path = [<<"components">>, <<"schemas">>, <<"bridge_kafka.post_producer">>, <<"properties">>], ?assertMatch(#{<<"kafka">> := _}, emqx_utils_maps:deep_get(Path, Map)). 
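A pattern repeated throughout this patch: schema fields whose values may contain `${...}` placeholders move from `binary()` or `string()` to `emqx_schema:template()`, so documentation generation and the dashboard (via the `is_template` property added to `emqx_dashboard_swagger` later in this patch) can distinguish template fields from plain strings. A minimal sketch of declaring such a field, following the conventions above (the struct name `example_parameters` and the description are illustrative):

fields(example_parameters) ->
    [
        {payload_template,
            hoconsc:mk(
                emqx_schema:template(),
                #{
                    default => <<"${.}">>,
                    desc => <<"Template rendered against each message">>
                }
            )}
    ].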
diff --git a/apps/emqx_bridge_kinesis/src/emqx_bridge_kinesis.erl b/apps/emqx_bridge_kinesis/src/emqx_bridge_kinesis.erl index 3c22e41e2..40849a29d 100644 --- a/apps/emqx_bridge_kinesis/src/emqx_bridge_kinesis.erl +++ b/apps/emqx_bridge_kinesis/src/emqx_bridge_kinesis.erl @@ -150,7 +150,7 @@ fields(producer) -> [ {payload_template, sc( - binary(), + emqx_schema:template(), #{ default => <<"${.}">>, desc => ?DESC("payload_template") diff --git a/apps/emqx_bridge_mongodb/src/emqx_bridge_mongodb.erl b/apps/emqx_bridge_mongodb/src/emqx_bridge_mongodb.erl index c81df1334..593bf6ff8 100644 --- a/apps/emqx_bridge_mongodb/src/emqx_bridge_mongodb.erl +++ b/apps/emqx_bridge_mongodb/src/emqx_bridge_mongodb.erl @@ -44,8 +44,10 @@ roots() -> []. fields("config") -> [ {enable, mk(boolean(), #{desc => ?DESC("enable"), default => true})}, - {collection, mk(binary(), #{desc => ?DESC("collection"), default => <<"mqtt">>})}, - {payload_template, mk(binary(), #{required => false, desc => ?DESC("payload_template")})}, + {collection, + mk(emqx_schema:template(), #{desc => ?DESC("collection"), default => <<"mqtt">>})}, + {payload_template, + mk(emqx_schema:template(), #{required => false, desc => ?DESC("payload_template")})}, {resource_opts, mk( ref(?MODULE, "creation_opts"), diff --git a/apps/emqx_bridge_mqtt/src/emqx_bridge_mqtt_connector.erl b/apps/emqx_bridge_mqtt/src/emqx_bridge_mqtt_connector.erl index 0b8f71a7f..900f6143f 100644 --- a/apps/emqx_bridge_mqtt/src/emqx_bridge_mqtt_connector.erl +++ b/apps/emqx_bridge_mqtt/src/emqx_bridge_mqtt_connector.erl @@ -450,7 +450,6 @@ connect(Options) -> options => emqx_utils:redact(Options) }), Name = proplists:get_value(name, Options), - WorkerId = proplists:get_value(ecpool_worker_id, Options), ClientOpts = proplists:get_value(client_opts, Options), - case emqtt:start_link(mk_client_opts(Name, WorkerId, ClientOpts)) of + case emqtt:start_link(mk_client_opts(Name, ClientOpts)) of {ok, Pid} -> diff --git a/apps/emqx_bridge_mqtt/src/emqx_bridge_mqtt_connector_schema.erl b/apps/emqx_bridge_mqtt/src/emqx_bridge_mqtt_connector_schema.erl index 7103e53ee..bc2939c24 100644 --- a/apps/emqx_bridge_mqtt/src/emqx_bridge_mqtt_connector_schema.erl +++ b/apps/emqx_bridge_mqtt/src/emqx_bridge_mqtt_connector_schema.erl @@ -200,7 +200,7 @@ fields("ingress_local") -> [ {topic, mk( - binary(), + emqx_schema:template(), #{ validator => fun emqx_schema:non_empty_string/1, desc => ?DESC("ingress_local_topic"), @@ -217,7 +217,7 @@ )}, {retain, mk( - hoconsc:union([boolean(), binary()]), + hoconsc:union([boolean(), emqx_schema:template()]), #{ default => <<"${retain}">>, desc => ?DESC("retain") @@ -225,7 +225,7 @@ )}, {payload, mk( - binary(), + emqx_schema:template(), #{ default => undefined, desc => ?DESC("payload") @@ -268,7 +268,7 @@ fields("egress_remote") -> [ {topic, mk( - binary(), + emqx_schema:template(), #{ required => true, validator => fun emqx_schema:non_empty_string/1, @@ -286,7 +286,7 @@ )}, {retain, mk( - hoconsc:union([boolean(), binary()]), + hoconsc:union([boolean(), emqx_schema:template()]), #{ required => false, default => false, @@ -295,7 +295,7 @@ )}, {payload, mk( - binary(), + emqx_schema:template(), #{ default => undefined, desc => ?DESC("payload") @@ -344,7 +344,7 @@ desc(_) -> undefined. qos() -> - hoconsc:union([emqx_schema:qos(), binary()]). + hoconsc:union([emqx_schema:qos(), emqx_schema:template()]). 
parse_server(Str) -> #{hostname := Host, port := Port} = emqx_schema:parse_server(Str, ?MQTT_HOST_OPTS), diff --git a/apps/emqx_bridge_mysql/src/emqx_bridge_mysql.erl b/apps/emqx_bridge_mysql/src/emqx_bridge_mysql.erl index ee7487760..24b11b930 100644 --- a/apps/emqx_bridge_mysql/src/emqx_bridge_mysql.erl +++ b/apps/emqx_bridge_mysql/src/emqx_bridge_mysql.erl @@ -117,7 +117,7 @@ fields("config") -> {enable, mk(boolean(), #{desc => ?DESC("config_enable"), default => true})}, {sql, mk( - binary(), + emqx_schema:template(), #{desc => ?DESC("sql_template"), default => ?DEFAULT_SQL, format => <<"sql">>} )}, {local_topic, diff --git a/apps/emqx_bridge_mysql/test/emqx_bridge_mysql_SUITE.erl b/apps/emqx_bridge_mysql/test/emqx_bridge_mysql_SUITE.erl index 8b719de9a..9ad2fbc5a 100644 --- a/apps/emqx_bridge_mysql/test/emqx_bridge_mysql_SUITE.erl +++ b/apps/emqx_bridge_mysql/test/emqx_bridge_mysql_SUITE.erl @@ -517,7 +517,6 @@ t_write_failure(Config) -> ok end, fun(Trace0) -> - ct:pal("trace: ~p", [Trace0]), Trace = ?of_kind(buffer_worker_flush_nack, Trace0), ?assertMatch([#{result := {error, _}} | _], Trace), [#{result := {error, Error}} | _] = Trace, diff --git a/apps/emqx_bridge_opents/src/emqx_bridge_opents.erl b/apps/emqx_bridge_opents/src/emqx_bridge_opents.erl index d38ed8eb4..25c0ce88d 100644 --- a/apps/emqx_bridge_opents/src/emqx_bridge_opents.erl +++ b/apps/emqx_bridge_opents/src/emqx_bridge_opents.erl @@ -146,7 +146,7 @@ fields(action_parameters_data) -> [ {timestamp, mk( - binary(), + emqx_schema:template(), #{ desc => ?DESC("config_parameters_timestamp"), required => false @@ -154,7 +154,7 @@ fields(action_parameters_data) -> )}, {metric, mk( - binary(), + emqx_schema:template(), #{ required => true, desc => ?DESC("config_parameters_metric") @@ -162,7 +162,7 @@ fields(action_parameters_data) -> )}, {tags, mk( - hoconsc:union([map(), binary()]), + hoconsc:union([map(), emqx_schema:template()]), #{ required => true, desc => ?DESC("config_parameters_tags"), @@ -188,7 +188,7 @@ fields(action_parameters_data) -> )}, {value, mk( - hoconsc:union([integer(), float(), binary()]), + hoconsc:union([integer(), float(), emqx_schema:template()]), #{ required => true, desc => ?DESC("config_parameters_value") diff --git a/apps/emqx_bridge_oracle/src/emqx_bridge_oracle.erl b/apps/emqx_bridge_oracle/src/emqx_bridge_oracle.erl index fb485c16b..c3b4160ab 100644 --- a/apps/emqx_bridge_oracle/src/emqx_bridge_oracle.erl +++ b/apps/emqx_bridge_oracle/src/emqx_bridge_oracle.erl @@ -158,7 +158,7 @@ fields(action_parameters) -> [ {sql, hoconsc:mk( - binary(), + emqx_schema:template(), #{desc => ?DESC("sql_template"), default => ?DEFAULT_SQL, format => <<"sql">>} )} ]; @@ -177,7 +177,7 @@ fields("config") -> )}, {sql, hoconsc:mk( - binary(), + emqx_schema:template(), #{desc => ?DESC("sql_template"), default => ?DEFAULT_SQL, format => <<"sql">>} )}, {local_topic, diff --git a/apps/emqx_bridge_pgsql/src/emqx_bridge_pgsql.erl b/apps/emqx_bridge_pgsql/src/emqx_bridge_pgsql.erl index 7d02e8cca..5a0b9eb5b 100644 --- a/apps/emqx_bridge_pgsql/src/emqx_bridge_pgsql.erl +++ b/apps/emqx_bridge_pgsql/src/emqx_bridge_pgsql.erl @@ -61,7 +61,7 @@ fields(action_parameters) -> [ {sql, hoconsc:mk( - binary(), + emqx_schema:template(), #{desc => ?DESC("sql_template"), default => default_sql(), format => <<"sql">>} )} ]; diff --git a/apps/emqx_bridge_pgsql/test/emqx_bridge_pgsql_SUITE.erl b/apps/emqx_bridge_pgsql/test/emqx_bridge_pgsql_SUITE.erl index 3e9428c88..f4917f387 100644 --- 
a/apps/emqx_bridge_pgsql/test/emqx_bridge_pgsql_SUITE.erl +++ b/apps/emqx_bridge_pgsql/test/emqx_bridge_pgsql_SUITE.erl @@ -520,7 +520,6 @@ t_write_failure(Config) -> ) end), fun(Trace0) -> - ct:pal("trace: ~p", [Trace0]), Trace = ?of_kind(buffer_worker_flush_nack, Trace0), ?assertMatch([#{result := {error, _}} | _], Trace), [#{result := {error, Error}} | _] = Trace, diff --git a/apps/emqx_bridge_pulsar/src/emqx_bridge_pulsar_pubsub_schema.erl b/apps/emqx_bridge_pulsar/src/emqx_bridge_pulsar_pubsub_schema.erl index ccf985ba8..dff62843e 100644 --- a/apps/emqx_bridge_pulsar/src/emqx_bridge_pulsar_pubsub_schema.erl +++ b/apps/emqx_bridge_pulsar/src/emqx_bridge_pulsar_pubsub_schema.erl @@ -51,12 +51,12 @@ fields(action_parameters) -> fields(producer_pulsar_message) -> [ {key, - ?HOCON(string(), #{ + ?HOCON(emqx_schema:template(), #{ default => <<"${.clientid}">>, desc => ?DESC("producer_key_template") })}, {value, - ?HOCON(string(), #{ + ?HOCON(emqx_schema:template(), #{ default => <<"${.}">>, desc => ?DESC("producer_value_template") })} diff --git a/apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_connector_SUITE.erl b/apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_connector_SUITE.erl index b3c351da0..cd54e2194 100644 --- a/apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_connector_SUITE.erl +++ b/apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_connector_SUITE.erl @@ -1235,7 +1235,7 @@ t_resilience(Config) -> after 1_000 -> ct:fail("producer didn't stop!") end, Consumed = lists:flatmap( - fun(_) -> receive_consumed(5_000) end, lists:seq(1, NumProduced) + fun(_) -> receive_consumed(10_000) end, lists:seq(1, NumProduced) ), ?assertEqual(NumProduced, length(Consumed)), ExpectedPayloads = lists:map(fun integer_to_binary/1, lists:seq(1, NumProduced)), diff --git a/apps/emqx_bridge_rabbitmq/src/emqx_bridge_rabbitmq_pubsub_schema.erl b/apps/emqx_bridge_rabbitmq/src/emqx_bridge_rabbitmq_pubsub_schema.erl index 9a9741226..b0c254fc4 100644 --- a/apps/emqx_bridge_rabbitmq/src/emqx_bridge_rabbitmq_pubsub_schema.erl +++ b/apps/emqx_bridge_rabbitmq/src/emqx_bridge_rabbitmq_pubsub_schema.erl @@ -99,7 +99,7 @@ fields(action_parameters) -> )}, {payload_template, hoconsc:mk( - binary(), + emqx_schema:template(), #{ default => <<"">>, desc => ?DESC(?CONNECTOR_SCHEMA, "payload_template") diff --git a/apps/emqx_bridge_rabbitmq/test/emqx_bridge_rabbitmq_test_utils.erl b/apps/emqx_bridge_rabbitmq/test/emqx_bridge_rabbitmq_test_utils.erl index 47df47976..2110a0520 100644 --- a/apps/emqx_bridge_rabbitmq/test/emqx_bridge_rabbitmq_test_utils.erl +++ b/apps/emqx_bridge_rabbitmq/test/emqx_bridge_rabbitmq_test_utils.erl @@ -52,7 +52,7 @@ init_per_group(_Group, Config) -> common_init_per_group(Opts) -> emqx_common_test_helpers:render_and_load_app_config(emqx_conf), ok = emqx_common_test_helpers:start_apps([ - emqx_conf, emqx_bridge, emqx_bridge_rabbitmq, emqx_rule_engine + emqx_conf, emqx_bridge, emqx_bridge_rabbitmq, emqx_rule_engine, emqx_modules ]), ok = emqx_connector_test_helpers:start_apps([emqx_resource]), {ok, _} = application:ensure_all_started(emqx_connector), @@ -116,7 +116,9 @@ end_per_group(_Group, Config) -> } = get_channel_connection(Config), amqp_channel:call(Channel, #'queue.purge'{queue = rabbit_mq_queue()}), emqx_mgmt_api_test_util:end_suite(), - ok = emqx_common_test_helpers:stop_apps([emqx_conf, emqx_bridge_rabbitmq, emqx_rule_engine]), + ok = emqx_common_test_helpers:stop_apps([ + emqx_conf, emqx_bridge_rabbitmq, emqx_rule_engine, emqx_modules + ]), ok = 
emqx_connector_test_helpers:stop_apps([emqx_resource]), _ = application:stop(emqx_connector), _ = application:stop(emqx_bridge), diff --git a/apps/emqx_bridge_redis/src/emqx_bridge_redis.erl b/apps/emqx_bridge_redis/src/emqx_bridge_redis.erl index c80f9ead1..c9b2a35b9 100644 --- a/apps/emqx_bridge_redis/src/emqx_bridge_redis.erl +++ b/apps/emqx_bridge_redis/src/emqx_bridge_redis.erl @@ -211,7 +211,7 @@ desc(_) -> undefined. command_template(type) -> - list(binary()); + hoconsc:array(emqx_schema:template()); command_template(required) -> true; command_template(validator) -> diff --git a/apps/emqx_bridge_rocketmq/src/emqx_bridge_rocketmq.erl b/apps/emqx_bridge_rocketmq/src/emqx_bridge_rocketmq.erl index 589719486..f7e6d9b57 100644 --- a/apps/emqx_bridge_rocketmq/src/emqx_bridge_rocketmq.erl +++ b/apps/emqx_bridge_rocketmq/src/emqx_bridge_rocketmq.erl @@ -162,8 +162,13 @@ fields(action_parameters) -> [ {template, mk( - binary(), + emqx_schema:template(), #{desc => ?DESC("template"), default => ?DEFAULT_TEMPLATE} + )}, + {strategy, + mk( + hoconsc:union([roundrobin, binary()]), + #{desc => ?DESC("strategy"), default => roundrobin} )} ] ++ emqx_bridge_rocketmq_connector:fields(config), lists:foldl( @@ -173,6 +178,7 @@ fields(action_parameters) -> Parameters, [ servers, + namespace, pool_size, auto_reconnect, access_key, @@ -205,17 +211,21 @@ fields("config") -> {enable, mk(boolean(), #{desc => ?DESC("config_enable"), default => true})}, {template, mk( - binary(), + emqx_schema:template(), #{desc => ?DESC("template"), default => ?DEFAULT_TEMPLATE} )}, {local_topic, mk( binary(), #{desc => ?DESC("local_topic"), required => false} + )}, + {strategy, + mk( + hoconsc:union([roundrobin, binary()]), + #{desc => ?DESC("strategy"), default => roundrobin} )} ] ++ emqx_resource_schema:fields("resource_opts") ++ - (emqx_bridge_rocketmq_connector:fields(config) -- - emqx_connector_schema_lib:prepare_statement_fields()); + emqx_bridge_rocketmq_connector:fields(config); fields("post") -> [type_field(), name_field() | fields("config")]; fields("put") -> diff --git a/apps/emqx_bridge_rocketmq/src/emqx_bridge_rocketmq_connector.erl b/apps/emqx_bridge_rocketmq/src/emqx_bridge_rocketmq_connector.erl index 1af520a93..f9b4ec5d4 100644 --- a/apps/emqx_bridge_rocketmq/src/emqx_bridge_rocketmq_connector.erl +++ b/apps/emqx_bridge_rocketmq/src/emqx_bridge_rocketmq_connector.erl @@ -45,9 +45,14 @@ roots() -> fields(config) -> [ {servers, servers()}, - {topic, + {namespace, mk( binary(), + #{required => false, desc => ?DESC(namespace)} + )}, + {topic, + mk( + emqx_schema:template(), #{default => <<"TopicTest">>, desc => ?DESC(topic)} )}, {access_key, @@ -107,7 +112,7 @@ on_start( ), ClientId = client_id(InstanceId), ACLInfo = acl_info(AccessKey, SecretKey, SecurityToken), - ClientCfg = #{acl_info => ACLInfo}, + ClientCfg = namespace(#{acl_info => ACLInfo}, Config), State = #{ client_id => ClientId, @@ -156,10 +161,12 @@ create_channel_state( TopicTks = emqx_placeholder:preproc_tmpl(Topic), ProducerOpts = make_producer_opts(Conf, ACLInfo), Templates = parse_template(Conf), + DispatchStrategy = parse_dispatch_strategy(Conf), State = #{ topic => Topic, topic_tokens => TopicTks, templates => Templates, + dispatch_strategy => DispatchStrategy, sync_timeout => SyncTimeout, acl_info => ACLInfo, producers_opts => ProducerOpts @@ -202,7 +209,7 @@ on_stop(InstanceId, _State) -> ({_, client_id, ClientId}) -> destory_producers_map(ClientId), ok = rocketmq:stop_and_delete_supervised_client(ClientId); - ({_, _Topic, Producer}) -> + 
({_, _ProducerGroup, Producer}) -> _ = rocketmq:stop_and_delete_supervised_producers(Producer) end, emqx_resource:get_allocated_resources_list(InstanceId) @@ -250,15 +257,16 @@ do_query( #{ topic_tokens := TopicTks, templates := Templates, + dispatch_strategy := DispatchStrategy, sync_timeout := RequestTimeout, producers_opts := ProducerOpts } = maps:get(ChannelId, Channels), TopicKey = get_topic_key(Query, TopicTks), - Data = apply_template(Query, Templates), + Data = apply_template(Query, Templates, DispatchStrategy), Result = safe_do_produce( - InstanceId, QueryFunc, ClientId, TopicKey, Data, ProducerOpts, RequestTimeout + ChannelId, InstanceId, QueryFunc, ClientId, TopicKey, Data, ProducerOpts, RequestTimeout ), case Result of {error, Reason} -> @@ -284,9 +292,11 @@ do_query( get_channel_id({ChannelId, _}) -> ChannelId; get_channel_id([{ChannelId, _} | _]) -> ChannelId. -safe_do_produce(InstanceId, QueryFunc, ClientId, TopicKey, Data, ProducerOpts, RequestTimeout) -> +safe_do_produce( + ChannelId, InstanceId, QueryFunc, ClientId, TopicKey, Data, ProducerOpts, RequestTimeout +) -> try - Producers = get_producers(InstanceId, ClientId, TopicKey, ProducerOpts), + Producers = get_producers(ChannelId, InstanceId, ClientId, TopicKey, ProducerOpts), produce(InstanceId, QueryFunc, Producers, Data, RequestTimeout) catch _Type:Reason -> @@ -315,24 +325,57 @@ parse_template([{Key, H} | T], Templates) -> parse_template([], Templates) -> Templates. +%% Returns a function that generates the produce context +parse_dispatch_strategy(#{strategy := roundrobin}) -> + fun(_) -> + #{} + end; +parse_dispatch_strategy(#{strategy := Template}) -> + Tokens = emqx_placeholder:preproc_tmpl(Template), + fun(Msg) -> + #{ + key => + case emqx_placeholder:proc_tmpl(Tokens, Msg) of + <<"undefined">> -> + %% Since the key may be absent on some kinds of events (ex: + %% `topic' is absent in `client.disconnected'), and this key is + %% used for routing, we generate a random key when it's absent to + %% better distribute the load, effectively making it `random' + %% dispatch if the key is absent and we are using `key_dispatch'. + %% Otherwise, it'll be deterministic. + emqx_guid:gen(); + Key -> + Key + end + } + end. + get_topic_key({_, Msg}, TopicTks) -> emqx_placeholder:proc_tmpl(TopicTks, Msg); get_topic_key([Query | _], TopicTks) -> get_topic_key(Query, TopicTks). -apply_template({Key, Msg} = _Req, Templates) -> +%% Returns the message data and its context: +%% {binary(), rocketmq_producers:produce_context()} +apply_template({Key, Msg} = _Req, Templates, DispatchStrategy) -> + { + case maps:get(Key, Templates, undefined) of + undefined -> + emqx_utils_json:encode(Msg); + Template -> + emqx_placeholder:proc_tmpl(Template, Msg) + end, + DispatchStrategy(Msg) + }; +apply_template([{Key, _} | _] = Reqs, Templates, DispatchStrategy) -> case maps:get(Key, Templates, undefined) of undefined -> - emqx_utils_json:encode(Msg); + [{emqx_utils_json:encode(Msg), DispatchStrategy(Msg)} || {_, Msg} <- Reqs]; Template -> - emqx_placeholder:proc_tmpl(Template, Msg) - end; -apply_template([{Key, _} | _] = Reqs, Templates) -> - case maps:get(Key, Templates, undefined) of - undefined -> - [emqx_utils_json:encode(Msg) || {_, Msg} <- Reqs]; - Template -> - [emqx_placeholder:proc_tmpl(Template, Msg) || {_, Msg} <- Reqs] + [ + {emqx_placeholder:proc_tmpl(Template, Msg), DispatchStrategy(Msg)} + || {_, Msg} <- Reqs + ] end. 
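%% For illustration, the strategy function built above behaves like this
%% (template syntax per emqx_placeholder; the message maps are hypothetical):
%%
%%   CtxFun = parse_dispatch_strategy(#{strategy => <<"${clientid}">>}),
%%   #{key := <<"c1">>} = CtxFun(#{clientid => <<"c1">>}),
%%   %% An absent placeholder renders as <<"undefined">>, so a random key is
%%   %% generated to keep the load spread:
%%   #{key := _RandomKey} = CtxFun(#{}),
%%   %% `roundrobin' yields an empty produce context:
%%   #{} = (parse_dispatch_strategy(#{strategy => roundrobin}))(#{}).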
client_id(ResourceId) -> @@ -377,6 +420,10 @@ acl_info(AccessKey, SecretKey, SecurityToken) when is_binary(AccessKey) -> acl_info(_, _, _) -> #{}. +namespace(ClientCfg, Config) -> + Namespace = maps:get(namespace, Config, <<>>), + ClientCfg#{namespace => Namespace}. + create_producers_map(ClientId) -> _ = ets:new(ClientId, [public, named_table, {read_concurrency, true}]), ok. @@ -391,16 +438,21 @@ destory_producers_map(ClientId) -> ets:delete(Tid) end. -get_producers(InstanceId, ClientId, Topic, ProducerOpts) -> - case ets:lookup(ClientId, Topic) of +get_producers(ChannelId, InstanceId, ClientId, Topic, ProducerOpts) -> + %% The topic needs to be included in the name since we can have multiple + %% topics per channel due to templating. + ProducerGroup = iolist_to_binary([ChannelId, "_", Topic]), + case ets:lookup(ClientId, ProducerGroup) of [{_, Producers}] -> Producers; _ -> - ProducerGroup = iolist_to_binary([atom_to_list(ClientId), "_", Topic]), + %% TODO: the name needs to be an atom but this may cause an atom leak so we + %% should figure out a way to avoid this + ProducerOpts2 = ProducerOpts#{name => binary_to_atom(ProducerGroup)}, {ok, Producers} = rocketmq:ensure_supervised_producers( - ClientId, ProducerGroup, Topic, ProducerOpts + ClientId, ProducerGroup, Topic, ProducerOpts2 ), - ok = emqx_resource:allocate_resource(InstanceId, Topic, Producers), - ets:insert(ClientId, {Topic, Producers}), + ok = emqx_resource:allocate_resource(InstanceId, ProducerGroup, Producers), + ets:insert(ClientId, {ProducerGroup, Producers}), Producers end. diff --git a/apps/emqx_bridge_rocketmq/test/emqx_bridge_rocketmq_SUITE.erl b/apps/emqx_bridge_rocketmq/test/emqx_bridge_rocketmq_SUITE.erl index a056ae3d2..7af6c7eea 100644 --- a/apps/emqx_bridge_rocketmq/test/emqx_bridge_rocketmq_SUITE.erl +++ b/apps/emqx_bridge_rocketmq/test/emqx_bridge_rocketmq_SUITE.erl @@ -263,6 +263,60 @@ t_setup_via_http_api_and_publish(Config) -> ), ok. +t_setup_two_actions_via_http_api_and_publish(Config) -> + BridgeType = ?GET_CONFIG(rocketmq_bridge_type, Config), + Name = ?GET_CONFIG(rocketmq_name, Config), + RocketMQConf = ?GET_CONFIG(rocketmq_config, Config), + RocketMQConf2 = RocketMQConf#{ + <<"name">> => Name, + <<"type">> => BridgeType + }, + ?assertMatch( + {ok, _}, + create_bridge_http(RocketMQConf2) + ), + {ok, #{raw_config := ActionConf}} = emqx_bridge_v2:lookup(actions, BridgeType, Name), + Topic2 = <<"Topic2">>, + ActionConf2 = emqx_utils_maps:deep_force_put( + [<<"parameters">>, <<"topic">>], ActionConf, Topic2 + ), + Action2Name = atom_to_binary(?FUNCTION_NAME), + {ok, _} = emqx_bridge_v2:create(BridgeType, Action2Name, ActionConf2), + SentData = #{payload => ?PAYLOAD}, + ?check_trace( + begin + ?wait_async_action( + ?assertEqual(ok, send_message(Config, SentData)), + #{?snk_kind := rocketmq_connector_query_return}, + 10_000 + ), + ok + end, + fun(Trace0) -> + Trace = ?of_kind(rocketmq_connector_query_return, Trace0), + ?assertMatch([#{result := ok}], Trace), + ok + end + ), + Config2 = proplists:delete(rocketmq_name, Config), + Config3 = [{rocketmq_name, Action2Name} | Config2], + ?check_trace( + begin + ?wait_async_action( + ?assertEqual(ok, send_message(Config3, SentData)), + #{?snk_kind := rocketmq_connector_query_return}, + 10_000 + ), + ok + end, + fun(Trace0) -> + Trace = ?of_kind(rocketmq_connector_query_return, Trace0), + ?assertMatch([#{result := ok}], Trace), + ok + end + ), + ok. 
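%% The test above relies on the producer-group naming introduced in
%% get_producers/5: producers are cached per {channel, topic}, so two actions
%% sharing one connector never reuse each other's producers. A sketch of the
%% resulting cache key (the channel id is hypothetical):
%%
%%   ProducerGroup = iolist_to_binary([<<"action:rocketmq:a1">>, "_", <<"Topic2">>]),
%%   %% => <<"action:rocketmq:a1_Topic2">>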
+ t_get_status(Config) -> ?assertMatch( {ok, _}, diff --git a/apps/emqx_bridge_s3/src/emqx_bridge_s3.erl b/apps/emqx_bridge_s3/src/emqx_bridge_s3.erl index 5d7e176e3..79cc560d2 100644 --- a/apps/emqx_bridge_s3/src/emqx_bridge_s3.erl +++ b/apps/emqx_bridge_s3/src/emqx_bridge_s3.erl @@ -77,7 +77,7 @@ fields(s3_upload_parameters) -> [ {content, hoconsc:mk( - string(), + emqx_schema:template(), #{ required => false, default => <<"${.}">>, diff --git a/apps/emqx_bridge_sqlserver/src/emqx_bridge_sqlserver.erl b/apps/emqx_bridge_sqlserver/src/emqx_bridge_sqlserver.erl index e9df1fdb6..af66b8a88 100644 --- a/apps/emqx_bridge_sqlserver/src/emqx_bridge_sqlserver.erl +++ b/apps/emqx_bridge_sqlserver/src/emqx_bridge_sqlserver.erl @@ -192,7 +192,7 @@ fields(action_parameters) -> [ {sql, mk( - binary(), + emqx_schema:template(), #{desc => ?DESC("sql_template"), default => ?DEFAULT_SQL, format => <<"sql">>} )} ]; diff --git a/apps/emqx_bridge_syskeeper/src/emqx_bridge_syskeeper.erl b/apps/emqx_bridge_syskeeper/src/emqx_bridge_syskeeper.erl index 9ac0efe8a..547562f26 100644 --- a/apps/emqx_bridge_syskeeper/src/emqx_bridge_syskeeper.erl +++ b/apps/emqx_bridge_syskeeper/src/emqx_bridge_syskeeper.erl @@ -112,7 +112,7 @@ fields("parameters") -> [ {target_topic, mk( - binary(), + emqx_schema:template(), #{desc => ?DESC("target_topic"), default => <<"${topic}">>} )}, {target_qos, @@ -122,7 +122,7 @@ )}, {template, mk( - binary(), + emqx_schema:template(), #{desc => ?DESC("template"), default => <<"${payload}">>} )} ]; diff --git a/apps/emqx_bridge_tdengine/src/emqx_bridge_tdengine.erl b/apps/emqx_bridge_tdengine/src/emqx_bridge_tdengine.erl index 6e71da87e..f086f00dc 100644 --- a/apps/emqx_bridge_tdengine/src/emqx_bridge_tdengine.erl +++ b/apps/emqx_bridge_tdengine/src/emqx_bridge_tdengine.erl @@ -83,7 +83,7 @@ fields("config") -> {enable, mk(boolean(), #{desc => ?DESC("config_enable"), default => true})}, {sql, mk( - binary(), + emqx_schema:template(), #{ desc => ?DESC("sql_template"), default => ?DEFAULT_SQL, @@ -125,7 +125,7 @@ fields(action_parameters) -> {database, fun emqx_connector_schema_lib:database/1}, {sql, mk( - binary(), + emqx_schema:template(), #{ desc => ?DESC("sql_template"), default => ?DEFAULT_SQL, diff --git a/apps/emqx_conf/src/emqx_cluster_rpc.erl b/apps/emqx_conf/src/emqx_cluster_rpc.erl index 39d471ed9..37f052f56 100644 --- a/apps/emqx_conf/src/emqx_cluster_rpc.erl +++ b/apps/emqx_conf/src/emqx_cluster_rpc.erl @@ -224,6 +224,7 @@ reset() -> gen_server:call(?MODULE, reset). status() -> transaction(fun ?MODULE:trans_status/0, []). +%% DO NOT delete this on_leave_clean/0; it's used via RPC by nodes running versions before v5.6.0. on_leave_clean() -> on_leave_clean(node()). @@ -367,7 +368,7 @@ handle_call({fast_forward_to_commit, ToTnxId}, _From, State) -> NodeId = do_fast_forward_to_commit(ToTnxId, State), {reply, NodeId, State, catch_up(State)}; handle_call(on_leave, _From, State) -> - {atomic, ok} = transaction(fun ?MODULE:on_leave_clean/0, []), + {atomic, ok} = transaction(fun ?MODULE:on_leave_clean/1, [node()]), {reply, ok, State#{is_leaving := true}}; handle_call(_, _From, State) -> {reply, ok, State, catch_up(State)}. diff --git a/apps/emqx_conf/src/emqx_conf.erl b/apps/emqx_conf/src/emqx_conf.erl index 122998eeb..23dda6b02 100644 --- a/apps/emqx_conf/src/emqx_conf.erl +++ b/apps/emqx_conf/src/emqx_conf.erl @@ -31,13 +31,6 @@ -export([dump_schema/2, reformat_schema_dump/2]). -export([schema_module/0]).
-%% TODO: move to emqx_dashboard when we stop building api schema at build time --export([ - hotconf_schema_json/0, - bridge_schema_json/0, - hocon_schema_to_spec/2 -]). - %% for rpc -export([get_node_and_config/1]). @@ -311,12 +304,22 @@ gen_flat_doc(RootNames, #{full_name := FullName, fields := Fields} = S, DescReso false -> ok end, - #{ - text => short_name(FullName), - hash => format_hash(FullName), - doc => maps:get(desc, S, <<"">>), - fields => format_fields(Fields, DescResolver) - }. + try + #{ + text => short_name(FullName), + hash => format_hash(FullName), + doc => maps:get(desc, S, <<"">>), + fields => format_fields(Fields, DescResolver) + } + catch + throw:Reason -> + io:format( + standard_error, + "failed_to_build_doc for ~s:~n~p~n", + [FullName, Reason] + ), + error(failed_to_build_doc) + end. format_fields(Fields, DescResolver) -> [format_field(F, DescResolver) || F <- Fields]. @@ -456,17 +459,6 @@ warn_bad_namespace(Namespace) -> ok end. -%% TODO: move this function to emqx_dashboard when we stop generating this JSON at build time. -hotconf_schema_json() -> - SchemaInfo = #{title => <<"EMQX Hot Conf API Schema">>, version => <<"0.1.0">>}, - gen_api_schema_json_iodata(emqx_mgmt_api_configs, SchemaInfo). - -%% TODO: move this function to emqx_dashboard when we stop generating this JSON at build time. -bridge_schema_json() -> - Version = <<"0.1.0">>, - SchemaInfo = #{title => <<"EMQX Data Bridge API Schema">>, version => Version}, - gen_api_schema_json_iodata(emqx_bridge_api, SchemaInfo). - %% @doc return the root schema module. -spec schema_module() -> module(). schema_module() -> @@ -506,57 +498,6 @@ make_desc_resolver(Lang) -> unicode:characters_to_binary(Desc) end. -gen_api_schema_json_iodata(SchemaMod, SchemaInfo) -> - emqx_dashboard_swagger:gen_api_schema_json_iodata( - SchemaMod, - SchemaInfo, - fun ?MODULE:hocon_schema_to_spec/2 - ). - --define(TO_REF(_N_, _F_), iolist_to_binary([to_bin(_N_), ".", to_bin(_F_)])). --define(TO_COMPONENTS_SCHEMA(_M_, _F_), - iolist_to_binary([ - <<"#/components/schemas/">>, - ?TO_REF(emqx_dashboard_swagger:namespace(_M_), _F_) - ]) -). - -hocon_schema_to_spec(?R_REF(Module, StructName), _LocalModule) -> - {#{<<"$ref">> => ?TO_COMPONENTS_SCHEMA(Module, StructName)}, [{Module, StructName}]}; -hocon_schema_to_spec(?REF(StructName), LocalModule) -> - {#{<<"$ref">> => ?TO_COMPONENTS_SCHEMA(LocalModule, StructName)}, [{LocalModule, StructName}]}; -hocon_schema_to_spec(Type, LocalModule) when ?IS_TYPEREFL(Type) -> - {typename_to_spec(typerefl:name(Type), LocalModule), []}; -hocon_schema_to_spec(?ARRAY(Item), LocalModule) -> - {Schema, Refs} = hocon_schema_to_spec(Item, LocalModule), - {#{type => array, items => Schema}, Refs}; -hocon_schema_to_spec(?ENUM(Items), _LocalModule) -> - {#{type => enum, symbols => Items}, []}; -hocon_schema_to_spec(?MAP(Name, Type), LocalModule) -> - {Schema, SubRefs} = hocon_schema_to_spec(Type, LocalModule), - { - #{ - <<"type">> => object, - <<"properties">> => #{<<"$", (to_bin(Name))/binary>> => Schema} - }, - SubRefs - }; -hocon_schema_to_spec(?UNION(Types, _DisplayName), LocalModule) -> - {OneOf, Refs} = lists:foldl( - fun(Type, {Acc, RefsAcc}) -> - {Schema, SubRefs} = hocon_schema_to_spec(Type, LocalModule), - {[Schema | Acc], SubRefs ++ RefsAcc} - end, - {[], []}, - hoconsc:union_members(Types) - ), - {#{<<"oneOf">> => OneOf}, Refs}; -hocon_schema_to_spec(Atom, _LocalModule) when is_atom(Atom) -> - {#{type => enum, symbols => [Atom]}, []}. 
- -typename_to_spec(TypeStr, Module) -> - emqx_conf_schema_types:readable_dashboard(Module, TypeStr). - join_format(Snippets) -> case [S || S <- Snippets, S =/= undefined] of [] -> diff --git a/apps/emqx_conf/src/emqx_conf_schema_types.erl b/apps/emqx_conf/src/emqx_conf_schema_types.erl index f530ee872..bcc9c1469 100644 --- a/apps/emqx_conf/src/emqx_conf_schema_types.erl +++ b/apps/emqx_conf/src/emqx_conf_schema_types.erl @@ -33,8 +33,19 @@ readable(Module, TypeStr) when is_list(TypeStr) -> %% Module is ignored so far as all types are distinguished by their names readable(TypeStr) catch - throw:unknown_type -> - fail(#{reason => unknown_type, type => TypeStr, module => Module}) + throw:Reason -> + throw(#{ + reason => Reason, + type => TypeStr, + module => Module + }); + error:Reason:Stacktrace -> + throw(#{ + reason => Reason, + stacktrace => Stacktrace, + type => TypeStr, + module => Module + }) end. readable_swagger(Module, TypeStr) -> @@ -49,22 +60,28 @@ readable_docgen(Module, TypeStr) -> get_readable(Module, TypeStr, Flavor) -> Map = readable(Module, TypeStr), case maps:get(Flavor, Map, undefined) of - undefined -> fail(#{reason => unknown_type, module => Module, type => TypeStr}); + undefined -> throw(#{reason => unknown_type, module => Module, type => TypeStr}); Value -> Value end. -%% Fail the build or test. Production code should never get here. --spec fail(_) -> no_return(). -fail(Reason) -> - io:format(standard_error, "ERROR: ~p~n", [Reason]), - error(Reason). - readable("boolean()") -> #{ swagger => #{type => boolean}, dashboard => #{type => boolean}, docgen => #{type => "Boolean"} }; +readable("template()") -> + #{ + swagger => #{type => string}, + dashboard => #{type => string, is_template => true}, + docgen => #{type => "String", desc => ?DESC(template)} + }; +readable("template_str()") -> + #{ + swagger => #{type => string}, + dashboard => #{type => string, is_template => true}, + docgen => #{type => "String", desc => ?DESC(template)} + }; readable("binary()") -> #{ swagger => #{type => string}, diff --git a/apps/emqx_connector/src/emqx_connector_info.erl b/apps/emqx_connector/src/emqx_connector_info.erl index 766f34168..e87c2ad7e 100644 --- a/apps/emqx_connector/src/emqx_connector_info.erl +++ b/apps/emqx_connector/src/emqx_connector_info.erl @@ -31,6 +31,9 @@ -export([clean_cache/0]). +%% For tests +-export([hard_coded_test_connector_info_modules/0]). + %% The type name for the connector -callback type_name() -> atom(). @@ -117,8 +120,13 @@ hard_coded_connector_info_modules_common() -> emqx_bridge_mqtt_pubsub_connector_info ]. +%% This exists so that it can be mocked for test cases +hard_coded_test_connector_info_modules() -> []. + hard_coded_connector_info_modules() -> - hard_coded_connector_info_modules_common() ++ hard_coded_connector_info_modules_ee(). + hard_coded_connector_info_modules_common() ++ + hard_coded_connector_info_modules_ee() ++ + ?MODULE:hard_coded_test_connector_info_modules(). %% -------------------------------------------------------------------- %% Atom macros to avoid typos diff --git a/apps/emqx_dashboard/include/emqx_dashboard.hrl b/apps/emqx_dashboard/include/emqx_dashboard.hrl index 13458b4b4..40f2ba2b3 100644 --- a/apps/emqx_dashboard/include/emqx_dashboard.hrl +++ b/apps/emqx_dashboard/include/emqx_dashboard.hrl @@ -67,7 +67,8 @@ %, sent_bytes validation_succeeded, validation_failed, - dropped + dropped, + persisted ]). 
-define(GAUGE_SAMPLER_LIST, [ @@ -87,7 +88,8 @@ sent => sent_msg_rate, validation_succeeded => validation_succeeded_rate, validation_failed => validation_failed_rate, - dropped => dropped_msg_rate + dropped => dropped_msg_rate, + persisted => persisted_rate }). -define(CURRENT_SAMPLE_NON_RATE, diff --git a/apps/emqx_dashboard/src/emqx_dashboard_monitor.erl b/apps/emqx_dashboard/src/emqx_dashboard_monitor.erl index 6a9e868dd..fe0476e6d 100644 --- a/apps/emqx_dashboard/src/emqx_dashboard_monitor.erl +++ b/apps/emqx_dashboard/src/emqx_dashboard_monitor.erl @@ -428,7 +428,8 @@ stats(sent) -> emqx_metrics:val('messages.sent'); stats(sent_bytes) -> emqx_metrics:val('bytes.sent'); stats(validation_succeeded) -> emqx_metrics:val('messages.validation_succeeded'); stats(validation_failed) -> emqx_metrics:val('messages.validation_failed'); -stats(dropped) -> emqx_metrics:val('messages.dropped'). +stats(dropped) -> emqx_metrics:val('messages.dropped'); +stats(persisted) -> emqx_metrics:val('messages.persisted'). %% ------------------------------------------------------------------------------------------------- %% Retained && License Quota diff --git a/apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl b/apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl index 3ffadc1b2..1b6773b87 100644 --- a/apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl +++ b/apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl @@ -192,6 +192,8 @@ swagger_desc(validation_succeeded) -> swagger_desc_format("Message validations succeeded "); swagger_desc(validation_failed) -> swagger_desc_format("Message validations failed "); +swagger_desc(persisted) -> + swagger_desc_format("Messages saved to the durable storage "); swagger_desc(subscriptions) -> <<"Subscriptions at the time of sampling.", ?APPROXIMATE_DESC>>; swagger_desc(topics) -> @@ -218,6 +220,8 @@ swagger_desc(validation_succeeded_rate) -> swagger_desc_format("Message validations succeeded ", per); swagger_desc(validation_failed_rate) -> swagger_desc_format("Message validations failed ", per); +swagger_desc(persisted_rate) -> + swagger_desc_format("Messages saved to the durable storage ", per); swagger_desc(retained_msg_count) -> <<"Retained messages count at the time of sampling.", ?APPROXIMATE_DESC>>; swagger_desc(shared_subscriptions) -> diff --git a/apps/emqx_dashboard/src/emqx_dashboard_schema_api.erl b/apps/emqx_dashboard/src/emqx_dashboard_schema_api.erl index 9b5c45e71..632c8b8d4 100644 --- a/apps/emqx_dashboard/src/emqx_dashboard_schema_api.erl +++ b/apps/emqx_dashboard/src/emqx_dashboard_schema_api.erl @@ -30,9 +30,22 @@ -export([get_schema/2]). +%% for test +-export([bridge_schema_json/0]). + -define(TAGS, [<<"dashboard">>]). -define(BAD_REQUEST, 'BAD_REQUEST'). +-define(TO_REF(_N_, _F_), iolist_to_binary([to_bin(_N_), ".", to_bin(_F_)])). +-define(TO_COMPONENTS_SCHEMA(_M_, _F_), + iolist_to_binary([ + <<"#/components/schemas/">>, + ?TO_REF(emqx_dashboard_swagger:namespace(_M_), _F_) + ]) +). + +-define(SCHEMA_VERSION, <<"0.2.0">>). + %%-------------------------------------------------------------------- %% minirest API and schema %%-------------------------------------------------------------------- @@ -77,26 +90,86 @@ get_schema(get, _) -> {400, ?BAD_REQUEST, <<"unknown">>}. gen_schema(hotconf) -> - emqx_conf:hotconf_schema_json(); + hotconf_schema_json(); gen_schema(bridges) -> - emqx_conf:bridge_schema_json(); + bridge_schema_json(); gen_schema(actions) -> actions_schema_json(); gen_schema(connectors) -> connectors_schema_json(). 
+hotconf_schema_json() -> + SchemaInfo = #{ + title => <<"Hot Conf Schema">>, + version => ?SCHEMA_VERSION + }, + gen_api_schema_json_iodata(emqx_mgmt_api_configs, SchemaInfo). + +bridge_schema_json() -> + SchemaInfo = #{ + title => <<"Data Bridge Schema">>, + version => ?SCHEMA_VERSION + }, + gen_api_schema_json_iodata(emqx_bridge_api, SchemaInfo). + actions_schema_json() -> - SchemaInfo = #{title => <<"EMQX Data Actions API Schema">>, version => <<"0.1.0">>}, - %% Note: this will be moved to `emqx_actions' application in the future. + SchemaInfo = #{ + title => <<"Actions and Sources Schema">>, + version => ?SCHEMA_VERSION + }, gen_api_schema_json_iodata(emqx_bridge_v2_api, SchemaInfo). connectors_schema_json() -> - SchemaInfo = #{title => <<"EMQX Connectors Schema">>, version => <<"0.1.0">>}, + SchemaInfo = #{ + title => <<"Connectors Schema">>, + version => ?SCHEMA_VERSION + }, gen_api_schema_json_iodata(emqx_connector_api, SchemaInfo). gen_api_schema_json_iodata(SchemaMod, SchemaInfo) -> emqx_dashboard_swagger:gen_api_schema_json_iodata( SchemaMod, SchemaInfo, - fun emqx_conf:hocon_schema_to_spec/2 + fun hocon_schema_to_spec/2 ). + +hocon_schema_to_spec(?R_REF(Module, StructName), _LocalModule) -> + {#{<<"$ref">> => ?TO_COMPONENTS_SCHEMA(Module, StructName)}, [{Module, StructName}]}; +hocon_schema_to_spec(?REF(StructName), LocalModule) -> + {#{<<"$ref">> => ?TO_COMPONENTS_SCHEMA(LocalModule, StructName)}, [{LocalModule, StructName}]}; +hocon_schema_to_spec(Type, LocalModule) when ?IS_TYPEREFL(Type) -> + {typename_to_spec(typerefl:name(Type), LocalModule), []}; +hocon_schema_to_spec(?ARRAY(Item), LocalModule) -> + {Schema, Refs} = hocon_schema_to_spec(Item, LocalModule), + {#{type => array, items => Schema}, Refs}; +hocon_schema_to_spec(?ENUM(Items), _LocalModule) -> + {#{type => enum, symbols => Items}, []}; +hocon_schema_to_spec(?MAP(Name, Type), LocalModule) -> + {Schema, SubRefs} = hocon_schema_to_spec(Type, LocalModule), + { + #{ + <<"type">> => object, + <<"properties">> => #{<<"$", (to_bin(Name))/binary>> => Schema} + }, + SubRefs + }; +hocon_schema_to_spec(?UNION(Types, _DisplayName), LocalModule) -> + {OneOf, Refs} = lists:foldl( + fun(Type, {Acc, RefsAcc}) -> + {Schema, SubRefs} = hocon_schema_to_spec(Type, LocalModule), + {[Schema | Acc], SubRefs ++ RefsAcc} + end, + {[], []}, + hoconsc:union_members(Types) + ), + {#{<<"oneOf">> => OneOf}, Refs}; +hocon_schema_to_spec(Atom, _LocalModule) when is_atom(Atom) -> + {#{type => enum, symbols => [Atom]}, []}. + +typename_to_spec(TypeStr, Module) -> + emqx_conf_schema_types:readable_dashboard(Module, TypeStr). + +to_bin(List) when is_list(List) -> iolist_to_binary(List); +to_bin(Boolean) when is_boolean(Boolean) -> Boolean; +to_bin(Atom) when is_atom(Atom) -> atom_to_binary(Atom, utf8); +to_bin(X) -> X. diff --git a/apps/emqx_dashboard/src/emqx_dashboard_swagger.erl b/apps/emqx_dashboard/src/emqx_dashboard_swagger.erl index a6038bcb7..4ada5994c 100644 --- a/apps/emqx_dashboard/src/emqx_dashboard_swagger.erl +++ b/apps/emqx_dashboard/src/emqx_dashboard_swagger.erl @@ -57,7 +57,11 @@ allowEmptyValue, deprecated, minimum, - maximum + maximum, + %% is_template is a type property, + %% but exceptions are made to allow it as a field property as well, + %% for example, for HTTP headers (which is a map type) + is_template ]). -define(INIT_SCHEMA, #{ @@ -81,7 +85,7 @@ ]) ). --define(SPECIAL_LANG_MSGID, <<"$msgid">>). +-define(NO_I18N, undefined). -define(MAX_ROW_LIMIT, 10000). -define(DEFAULT_ROW, 100). 
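%% The `is_template' entry added to the field-property whitelist above allows
%% a schema to mark a field as a template even when that cannot be expressed
%% on the type, as with map-typed fields such as HTTP headers. A hedged sketch
%% of the field-level override (field name and description are illustrative):
%%
%%   {headers,
%%       hoconsc:mk(map(), #{
%%           default => #{},
%%           is_template => true,
%%           desc => <<"Headers whose values may contain ${var} placeholders">>
%%       })}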
@@ -164,6 +168,14 @@ fields(limit) -> ]), Meta = #{in => query, desc => Desc, default => ?DEFAULT_ROW, example => 50}, [{limit, hoconsc:mk(range(1, ?MAX_ROW_LIMIT), Meta)}]; +fields(cursor) -> + Desc = <<"Opaque value representing the current iteration state.">>, + Meta = #{default => none, in => query, desc => Desc}, + [{cursor, hoconsc:mk(hoconsc:union([none, binary()]), Meta)}]; +fields(cursor_response) -> + Desc = <<"Opaque value representing the current iteration state.">>, + Meta = #{desc => Desc, required => false}, + [{cursor, hoconsc:mk(binary(), Meta)}]; fields(count) -> Desc = << "Total number of records matching the query.
" @@ -197,6 +209,8 @@ fields(start) -> [{start, hoconsc:mk(hoconsc:union([none, binary()]), Meta)}]; fields(meta) -> fields(page) ++ fields(limit) ++ fields(count) ++ fields(hasnext); +fields(meta_with_cursor) -> + fields(count) ++ fields(hasnext) ++ fields(cursor_response); fields(continuation_meta) -> fields(start) ++ fields(position). @@ -257,7 +271,7 @@ gen_api_schema_json_iodata(SchemaMod, SchemaInfo, Converter) -> SchemaMod, #{ schema_converter => Converter, - i18n_lang => ?SPECIAL_LANG_MSGID + i18n_lang => ?NO_I18N } ), ApiSpec = lists:foldl( @@ -642,19 +656,6 @@ trans_required(Spec, true, _) -> Spec#{required => true}; trans_required(Spec, _, path) -> Spec#{required => true}; trans_required(Spec, _, _) -> Spec. -trans_desc(Init, Hocon, Func, Name, Options) -> - Spec0 = trans_description(Init, Hocon, Options), - case Func =:= fun hocon_schema_to_spec/2 of - true -> - Spec0; - false -> - Spec1 = trans_label(Spec0, Hocon, Name, Options), - case Spec1 of - #{description := _} -> Spec1; - _ -> Spec1#{description => <>} - end - end. - trans_description(Spec, Hocon, Options) -> Desc = case desc_struct(Hocon) of @@ -662,10 +663,10 @@ trans_description(Spec, Hocon, Options) -> ?DESC(_, _) = Struct -> get_i18n(<<"desc">>, Struct, undefined, Options); Text -> to_bin(Text) end, - case Desc of - undefined -> + case Desc =:= undefined of + true -> Spec; - Desc -> + false -> Desc1 = binary:replace(Desc, [<<"\n">>], <<"
">>, [global]), Spec#{description => Desc1} end. @@ -673,8 +674,8 @@ trans_description(Spec, Hocon, Options) -> get_i18n(Tag, ?DESC(Namespace, Id), Default, Options) -> Lang = get_lang(Options), case Lang of - ?SPECIAL_LANG_MSGID -> - make_msgid(Namespace, Id, Tag); + ?NO_I18N -> + undefined; _ -> get_i18n_text(Lang, Namespace, Id, Tag, Default) end. @@ -687,27 +688,11 @@ get_i18n_text(Lang, Namespace, Id, Tag, Default) -> Text end. -%% Format:$msgid:Namespace.Id.Tag -%% e.g. $msgid:emqx_schema.key.desc -%% $msgid:emqx_schema.key.label -%% if needed, the consumer of this schema JSON can use this msgid to -%% resolve the text in the i18n database. -make_msgid(Namespace, Id, Tag) -> - iolist_to_binary(["$msgid:", to_bin(Namespace), ".", to_bin(Id), ".", Tag]). - %% So far i18n_lang in options is only used at build time. %% At runtime, it's still the global config which controls the language. get_lang(#{i18n_lang := Lang}) -> Lang; get_lang(_) -> emqx:get_config([dashboard, i18n_lang]). -trans_label(Spec, Hocon, Default, Options) -> - Label = - case desc_struct(Hocon) of - ?DESC(_, _) = Struct -> get_i18n(<<"label">>, Struct, Default, Options); - _ -> Default - end, - Spec#{label => Label}. - desc_struct(Hocon) -> R = case hocon_schema:field_schema(Hocon, desc) of @@ -765,7 +750,7 @@ response(Status, #{content := _} = Content, {Acc, RefsAcc, Module, Options}) -> response(Status, ?REF(StructName), {Acc, RefsAcc, Module, Options}) -> response(Status, ?R_REF(Module, StructName), {Acc, RefsAcc, Module, Options}); response(Status, ?R_REF(_Mod, _Name) = RRef, {Acc, RefsAcc, Module, Options}) -> - SchemaToSpec = schema_converter(Options), + SchemaToSpec = get_schema_converter(Options), {Spec, Refs} = SchemaToSpec(RRef, Module), Content = content(Spec), { @@ -903,7 +888,7 @@ parse_object(PropList = [_ | _], Module, Options) when is_list(PropList) -> parse_object(Other, Module, Options) -> erlang:throw( {error, #{ - msg => <<"Object only supports not empty proplists">>, + msg => <<"Object only supports non-empty fields list">>, args => Other, module => Module, options => Options @@ -943,10 +928,10 @@ parse_object_loop([{Name, Hocon} | Rest], Module, Options, Props, Required, Refs true -> HoconType = hocon_schema:field_schema(Hocon, type), Init0 = init_prop([default | ?DEFAULT_FIELDS], #{}, Hocon), - SchemaToSpec = schema_converter(Options), + SchemaToSpec = get_schema_converter(Options), Init = maps:remove( summary, - trans_desc(Init0, Hocon, SchemaToSpec, NameBin, Options) + trans_description(Init0, Hocon, Options) ), {Prop, Refs1} = SchemaToSpec(HoconType, Module), NewRequiredAcc = @@ -995,7 +980,7 @@ to_ref(Mod, StructName, Acc, RefsAcc) -> Ref = #{<<"$ref">> => ?TO_COMPONENTS_PARAM(Mod, StructName)}, {[Ref | Acc], [{Mod, StructName, parameter} | RefsAcc]}. -schema_converter(Options) -> +get_schema_converter(Options) -> maps:get(schema_converter, Options, fun hocon_schema_to_spec/2). 
hocon_error_msg(Reason) -> diff --git a/apps/emqx_dashboard/test/emqx_swagger_requestBody_SUITE.erl b/apps/emqx_dashboard/test/emqx_swagger_requestBody_SUITE.erl index 917c24ffa..0f2448480 100644 --- a/apps/emqx_dashboard/test/emqx_swagger_requestBody_SUITE.erl +++ b/apps/emqx_dashboard/test/emqx_swagger_requestBody_SUITE.erl @@ -359,7 +359,7 @@ t_bad_ref(_Config) -> Refs = [{?MODULE, bad_ref}], Fields = fields(bad_ref), ?assertThrow( - {error, #{msg := <<"Object only supports not empty proplists">>, args := Fields}}, + {error, #{msg := <<"Object only supports non-empty fields list">>, args := Fields}}, validate(Path, Spec, Refs) ), ok. diff --git a/apps/emqx_dashboard/test/emqx_swagger_response_SUITE.erl b/apps/emqx_dashboard/test/emqx_swagger_response_SUITE.erl index 6fa3dbd3d..e9397f643 100644 --- a/apps/emqx_dashboard/test/emqx_swagger_response_SUITE.erl +++ b/apps/emqx_dashboard/test/emqx_swagger_response_SUITE.erl @@ -189,7 +189,7 @@ t_nest_object(_Config) -> t_empty(_Config) -> ?assertThrow( {error, #{ - msg := <<"Object only supports not empty proplists">>, + msg := <<"Object only supports non-empty fields list">>, args := [], module := ?MODULE }}, @@ -273,7 +273,7 @@ t_bad_ref(_Config) -> ?assertThrow( {error, #{ module := ?MODULE, - msg := <<"Object only supports not empty proplists">> + msg := <<"Object only supports non-empty fields list">> }}, validate(Path, Object, ExpectRefs) ), diff --git a/apps/emqx_durable_storage/README.md b/apps/emqx_durable_storage/README.md index f613085bb..f67cc3e24 100644 --- a/apps/emqx_durable_storage/README.md +++ b/apps/emqx_durable_storage/README.md @@ -13,7 +13,7 @@ This makes the storage disk requirements very predictable: only the number of _p DS _backend_ is a callback module that implements `emqx_ds` behavior. -EMQX repository contains the "builtin" backend, implemented in `emqx_ds_replication_layer` module, that uses RocksDB as the main storage. +EMQX repository contains the "builtin" backend, implemented in `emqx_ds_replication_layer` module, that uses the Raft algorithm for data replication and RocksDB as the main storage. Note that builtin backend introduces the concept of **site** to alleviate the problem of changing node names. Site IDs are persistent, and they are randomly generated at the first startup of the node. @@ -64,6 +64,10 @@ Messages are organized in the following hierarchy: The consumer of the messages can replay the stream using an _iterator_. +## Saving messages to the durable storage + +`emqx_ds` provides the `store_batch/3` function, which saves a list of MQTT messages to the durable storage. + ## Message replay All the API functions in EMQX DS are batch-oriented. @@ -95,10 +99,10 @@ Consumption of messages is done in several stages: # Limitation -- Builtin backend currently doesn't replicate data across different sites - There is no local cache of messages, which may result in transferring the same data multiple times # Documentation links + TBD # Usage @@ -120,9 +124,24 @@ The following application environment variables are available: - `emqx_durable_storage.egress_flush_interval`: period at which the batches of messages are committed to the durable storage. +Runtime settings for durable storages can be modified via the CLI as well as the REST API. +The following CLI commands are available: + +- `emqx ctl ds info` — get a quick overview of the durable storage state +- `emqx ctl ds set_replicas ...` — update the list of replicas for a durable storage. 
+- `emqx ctl ds join <DS> <Site>` — add a replica of durable storage `<DS>` on the site `<Site>` +- `emqx ctl ds leave <DS> <Site>` — remove a replica of a durable storage `<DS>` from the site `<Site>` + # HTTP APIs -None +The following REST APIs are available for managing the builtin durable storages: + +- `/ds/sites` — list known sites. +- `/ds/sites/:site` — get information about the site (its status, current EMQX node name managing the site, etc.) +- `/ds/storages` — list durable storages +- `/ds/storages/:ds` — get information about the durable storage and its shards +- `/ds/storages/:ds/replicas` — list or update sites that contain replicas of a durable storage +- `/ds/storages/:ds/replicas/:site` — add or remove a replica of the durable storage on the site # Other TBD diff --git a/apps/emqx_durable_storage/include/emqx_ds.hrl b/apps/emqx_durable_storage/include/emqx_ds.hrl index f24605175..cc7a7431f 100644 --- a/apps/emqx_durable_storage/include/emqx_ds.hrl +++ b/apps/emqx_durable_storage/include/emqx_ds.hrl @@ -13,7 +13,7 @@ %% See the License for the specific language governing permissions and %% limitations under the License. %%-------------------------------------------------------------------- --ifndef(EMQX_DS_HRL_HRL). --define(EMQX_DS_HRL_HRL, true). +-ifndef(EMQX_DS_HRL). +-define(EMQX_DS_HRL, true). -endif. diff --git a/apps/emqx_durable_storage/include/emqx_ds_metrics.hrl b/apps/emqx_durable_storage/include/emqx_ds_metrics.hrl new file mode 100644 index 000000000..0a82a6682 --- /dev/null +++ b/apps/emqx_durable_storage/include/emqx_ds_metrics.hrl @@ -0,0 +1,49 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2023-2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- +-ifndef(EMQX_DS_METRICS_HRL). +-define(EMQX_DS_METRICS_HRL, true). + +%%%% Egress metrics: + +%% Number of successfully flushed batches: +-define(DS_EGRESS_BATCHES, emqx_ds_egress_batches). +%% Number of batch flush retries: +-define(DS_EGRESS_BATCHES_RETRY, emqx_ds_egress_batches_retry). +%% Number of batches that weren't flushed due to unrecoverable errors: +-define(DS_EGRESS_BATCHES_FAILED, emqx_ds_egress_batches_failed). +%% Total number of messages that were successfully committed to the storage: +-define(DS_EGRESS_MESSAGES, emqx_ds_egress_messages). +%% Total size of payloads that were successfully committed to the storage: +-define(DS_EGRESS_BYTES, emqx_ds_egress_bytes). +%% Sliding average of flush time (microseconds): +-define(DS_EGRESS_FLUSH_TIME, emqx_ds_egress_flush_time). + +%%%% Storage layer metrics: +-define(DS_STORE_BATCH_TIME, emqx_ds_store_batch_time). +-define(DS_BUILTIN_NEXT_TIME, emqx_ds_builtin_next_time). + +%%% LTS Storage counters: + +%% This counter is incremented when the iterator seeks to the next interval: +-define(DS_LTS_SEEK_COUNTER, emqx_ds_storage_bitfield_lts_counter_seek). 
+%% This counter is incremented when the iterator proceeds to the next +%% key within the interval (this is the best-case scenario): +-define(DS_LTS_NEXT_COUNTER, emqx_ds_storage_bitfield_lts_counter_next). +%% This counter is incremented when the key passes bitmask check, but +%% the value is rejected by the subsequent post-processing: +-define(DS_LTS_COLLISION_COUNTER, emqx_ds_storage_bitfield_lts_counter_collision). + +-endif. diff --git a/apps/emqx_durable_storage/src/emqx_ds_builtin_db_sup.erl b/apps/emqx_durable_storage/src/emqx_ds_builtin_db_sup.erl index a93a94168..ef1600500 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_builtin_db_sup.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_builtin_db_sup.erl @@ -21,8 +21,17 @@ -behaviour(supervisor). %% API: --export([start_db/2, start_shard/1, start_egress/1, stop_shard/1, ensure_shard/1, ensure_egress/1]). --export([which_shards/1]). +-export([ + start_db/2, + start_shard/1, + start_egress/1, + stop_shard/1, + terminate_storage/1, + restart_storage/1, + ensure_shard/1, + ensure_egress/1 +]). +-export([which_dbs/0, which_shards/1]). %% behaviour callbacks: -export([init/1]). @@ -64,12 +73,22 @@ start_shard({DB, Shard}) -> start_egress({DB, Shard}) -> supervisor:start_child(?via(#?egress_sup{db = DB}), egress_spec(DB, Shard)). --spec stop_shard(emqx_ds_storage_layer:shard_id()) -> ok | {error, _}. -stop_shard(Shard = {DB, _}) -> +-spec stop_shard(emqx_ds_storage_layer:shard_id()) -> ok. +stop_shard({DB, Shard}) -> Sup = ?via(#?shards_sup{db = DB}), ok = supervisor:terminate_child(Sup, Shard), ok = supervisor:delete_child(Sup, Shard). +-spec terminate_storage(emqx_ds_storage_layer:shard_id()) -> ok | {error, _Reason}. +terminate_storage({DB, Shard}) -> + Sup = ?via(#?shard_sup{db = DB, shard = Shard}), + supervisor:terminate_child(Sup, {Shard, storage}). + +-spec restart_storage(emqx_ds_storage_layer:shard_id()) -> {ok, _Child} | {error, _Reason}. +restart_storage({DB, Shard}) -> + Sup = ?via(#?shard_sup{db = DB, shard = Shard}), + supervisor:restart_child(Sup, {Shard, storage}). + -spec ensure_shard(emqx_ds_storage_layer:shard_id()) -> ok | {error, _Reason}. ensure_shard(Shard) -> @@ -85,6 +104,13 @@ ensure_egress(Shard) -> which_shards(DB) -> supervisor:which_children(?via(#?shards_sup{db = DB})). +%% @doc Return the list of builtin DS databases that are currently +%% active on the node. +-spec which_dbs() -> [emqx_ds:db()]. +which_dbs() -> + Key = {n, l, #?db_sup{_ = '_', db = '$1'}}, + gproc:select({local, names}, [{{Key, '_', '_'}, [], ['$1']}]). 
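%% Usage sketch for the discovery helper above (output depends on which
%% durable storages are open on the node):
%%
%%   lists:foreach(
%%       fun(DB) -> io:format("builtin DS database: ~p~n", [DB]) end,
%%       emqx_ds_builtin_db_sup:which_dbs()
%%   ).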
+ %%================================================================================ %% behaviour callbacks %%================================================================================ @@ -92,12 +118,13 @@ which_shards(DB) -> init({#?db_sup{db = DB}, DefaultOpts}) -> %% Spec for the top-level supervisor for the database: logger:notice("Starting DS DB ~p", [DB]), + emqx_ds_builtin_metrics:init_for_db(DB), Opts = emqx_ds_replication_layer_meta:open_db(DB, DefaultOpts), ok = start_ra_system(DB, Opts), Children = [ sup_spec(#?shards_sup{db = DB}, []), sup_spec(#?egress_sup{db = DB}, []), - shard_allocator_spec(DB, Opts) + shard_allocator_spec(DB) ], SupFlags = #{ strategy => one_for_all, @@ -129,7 +156,7 @@ init({#?shard_sup{db = DB, shard = Shard}, _}) -> intensity => 10, period => 100 }, - Opts = emqx_ds_replication_layer_meta:get_options(DB), + Opts = emqx_ds_replication_layer_meta:db_config(DB), Children = [ shard_storage_spec(DB, Shard, Opts), shard_replication_spec(DB, Shard, Opts) @@ -185,7 +212,7 @@ sup_spec(Id, Options) -> shard_spec(DB, Shard) -> #{ - id => {shard, Shard}, + id => Shard, start => {?MODULE, start_link_sup, [#?shard_sup{db = DB, shard = Shard}, []]}, shutdown => infinity, restart => permanent, @@ -205,14 +232,15 @@ shard_replication_spec(DB, Shard, Opts) -> #{ id => {Shard, replication}, start => {emqx_ds_replication_layer_shard, start_link, [DB, Shard, Opts]}, - restart => transient, + shutdown => 10_000, + restart => permanent, type => worker }. -shard_allocator_spec(DB, Opts) -> +shard_allocator_spec(DB) -> #{ id => shard_allocator, - start => {emqx_ds_replication_shard_allocator, start_link, [DB, Opts]}, + start => {emqx_ds_replication_shard_allocator, start_link, [DB]}, restart => permanent, type => worker }. diff --git a/apps/emqx_durable_storage/src/emqx_ds_builtin_metrics.erl b/apps/emqx_durable_storage/src/emqx_ds_builtin_metrics.erl new file mode 100644 index 000000000..ce984db57 --- /dev/null +++ b/apps/emqx_durable_storage/src/emqx_ds_builtin_metrics.erl @@ -0,0 +1,299 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2023-2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- +-module(emqx_ds_builtin_metrics). + +%% DS-facing API: +-export([child_spec/0, init_for_db/1, shard_metric_id/2, init_for_shard/1]). + +%% Prometheus-facing API: +-export([prometheus_meta/0, prometheus_collect/1]). + +-export([ + inc_egress_batches/1, + inc_egress_batches_retry/1, + inc_egress_batches_failed/1, + inc_egress_messages/2, + inc_egress_bytes/2, + + observe_egress_flush_time/2, + + observe_store_batch_time/2, + + observe_next_time/2, + + inc_lts_seek_counter/2, + inc_lts_next_counter/2, + inc_lts_collision_counter/2 +]). + +%% behavior callbacks: +-export([]). + +%% internal exports: +-export([]). + +-export_type([shard_metrics_id/0]). + +-include("emqx_ds_metrics.hrl"). 
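+
+%% Shard-level metrics are keyed by a composite id produced by
+%% `shard_metric_id/2' below; a sketch of the expected shape (the DB
+%% name is illustrative):
+%% ```
+%% shard_metric_id(emqx_persistent_message, <<"1">>)
+%% %% => <<"emqx_persistent_message/1">>
+%% '''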
+
+%%================================================================================
+%% Type declarations
+%%================================================================================
+
+-define(WORKER, ?MODULE).
+
+-define(STORAGE_LAYER_METRICS, [
+    {slide, ?DS_STORE_BATCH_TIME},
+    {counter, ?DS_LTS_SEEK_COUNTER},
+    {counter, ?DS_LTS_NEXT_COUNTER},
+    {counter, ?DS_LTS_COLLISION_COUNTER}
+]).
+
+-define(FETCH_METRICS, [
+    {slide, ?DS_BUILTIN_NEXT_TIME}
+]).
+
+-define(DB_METRICS, ?STORAGE_LAYER_METRICS ++ ?FETCH_METRICS).
+
+-define(EGRESS_METRICS, [
+    {counter, ?DS_EGRESS_BATCHES},
+    {counter, ?DS_EGRESS_BATCHES_RETRY},
+    {counter, ?DS_EGRESS_BATCHES_FAILED},
+    {counter, ?DS_EGRESS_MESSAGES},
+    {counter, ?DS_EGRESS_BYTES},
+    {slide, ?DS_EGRESS_FLUSH_TIME}
+]).
+
+-define(SHARD_METRICS, ?EGRESS_METRICS).
+
+-type shard_metrics_id() :: binary().
+
+-elvis([{elvis_style, dont_repeat_yourself, disable}]).
+
+%%================================================================================
+%% API functions
+%%================================================================================
+
+-spec child_spec() -> supervisor:child_spec().
+child_spec() ->
+    emqx_metrics_worker:child_spec(?WORKER).
+
+%% @doc Initialize metrics that are global for a DS database
+-spec init_for_db(emqx_ds:db()) -> ok.
+init_for_db(DB) ->
+    emqx_metrics_worker:create_metrics(?WORKER, DB, ?DB_METRICS, []).
+
+-spec shard_metric_id(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) -> shard_metrics_id().
+shard_metric_id(DB, ShardId) ->
+    iolist_to_binary([atom_to_list(DB), $/, ShardId]).
+
+%% @doc Initialize metrics that are specific for the shard.
+-spec init_for_shard(shard_metrics_id()) -> ok.
+init_for_shard(ShardId) ->
+    emqx_metrics_worker:create_metrics(?WORKER, ShardId, ?SHARD_METRICS, []).
+
+%% @doc Increase the number of successfully flushed batches
+-spec inc_egress_batches(shard_metrics_id()) -> ok.
+inc_egress_batches(Id) ->
+    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_EGRESS_BATCHES).
+
+%% @doc Increase the number of times the egress worker had to retry
+%% flushing a batch
+-spec inc_egress_batches_retry(shard_metrics_id()) -> ok.
+inc_egress_batches_retry(Id) ->
+    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_EGRESS_BATCHES_RETRY).
+
+%% @doc Increase the number of times the egress worker encountered an
+%% unrecoverable error while trying to flush a batch
+-spec inc_egress_batches_failed(shard_metrics_id()) -> ok.
+inc_egress_batches_failed(Id) ->
+    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_EGRESS_BATCHES_FAILED).
+
+%% @doc Increase the number of messages successfully saved to the shard
+-spec inc_egress_messages(shard_metrics_id(), non_neg_integer()) -> ok.
+inc_egress_messages(Id, NMessages) ->
+    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_EGRESS_MESSAGES, NMessages).
+
+%% @doc Increase the number of payload bytes successfully saved to the shard
+-spec inc_egress_bytes(shard_metrics_id(), non_neg_integer()) -> ok.
+inc_egress_bytes(Id, NBytes) ->
+    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_EGRESS_BYTES, NBytes).
+
+%% @doc Add a sample of elapsed time spent flushing the egress to the
+%% Raft log (in microseconds)
+-spec observe_egress_flush_time(shard_metrics_id(), non_neg_integer()) -> ok.
+observe_egress_flush_time(Id, FlushTime) ->
+    catch emqx_metrics_worker:observe(?WORKER, Id, ?DS_EGRESS_FLUSH_TIME, FlushTime).
+
+-spec observe_store_batch_time(emqx_ds_storage_layer:shard_id(), non_neg_integer()) -> ok.
+observe_store_batch_time({DB, _}, StoreTime) ->
+    catch emqx_metrics_worker:observe(?WORKER, DB, ?DS_STORE_BATCH_TIME, StoreTime).
+
+%% @doc Add a sample of elapsed time spent waiting for a batch in
+%% `emqx_ds_replication_layer:next'
+-spec observe_next_time(emqx_ds:db(), non_neg_integer()) -> ok.
+observe_next_time(DB, NextTime) ->
+    catch emqx_metrics_worker:observe(?WORKER, DB, ?DS_BUILTIN_NEXT_TIME, NextTime).
+
+-spec inc_lts_seek_counter(emqx_ds_storage_layer:shard_id(), non_neg_integer()) -> ok.
+inc_lts_seek_counter({DB, _}, Inc) ->
+    catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_LTS_SEEK_COUNTER, Inc).
+
+-spec inc_lts_next_counter(emqx_ds_storage_layer:shard_id(), non_neg_integer()) -> ok.
+inc_lts_next_counter({DB, _}, Inc) ->
+    catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_LTS_NEXT_COUNTER, Inc).
+
+-spec inc_lts_collision_counter(emqx_ds_storage_layer:shard_id(), non_neg_integer()) -> ok.
+inc_lts_collision_counter({DB, _}, Inc) ->
+    catch emqx_metrics_worker:inc(?WORKER, DB, ?DS_LTS_COLLISION_COUNTER, Inc).
+
+prometheus_meta() ->
+    lists:map(
+        fun
+            ({counter, A}) ->
+                {A, counter, A};
+            ({slide, A}) ->
+                {A, counter, A}
+        end,
+        ?DB_METRICS ++ ?SHARD_METRICS
+    ).
+
+prometheus_collect(NodeOrAggr) ->
+    maps:merge(prometheus_per_db(NodeOrAggr), prometheus_per_shard(NodeOrAggr)).
+
+prometheus_per_db(NodeOrAggr) ->
+    lists:foldl(
+        fun(DB, Acc) ->
+            prometheus_per_db(NodeOrAggr, DB, Acc)
+        end,
+        #{},
+        emqx_ds_builtin_db_sup:which_dbs()
+    ).
+
+%% This function returns the data in the following format:
+%% ```
+%% #{emqx_ds_store_batch_time =>
+%%       [{[{db, emqx_persistent_message}], 42}],
+%%   ...
+%% '''
+%%
+%% If `NodeOrAggr' = `node' then node name is appended to the list of
+%% labels.
+prometheus_per_db(NodeOrAggr, DB, Acc0) ->
+    Labels = [
+        {db, DB}
+        | case NodeOrAggr of
+            node -> [];
+            _ -> [{node, node()}]
+        end
+    ],
+    #{counters := CC, slides := SS} = emqx_metrics_worker:get_metrics(?WORKER, DB),
+    %% Collect counters:
+    Acc1 = maps:fold(
+        fun(MetricId, Value, Acc1) ->
+            append_to_key(MetricId, {Labels, Value}, Acc1)
+        end,
+        Acc0,
+        CC
+    ),
+    %% Collect slides:
+    maps:fold(
+        fun(MetricId, Value, Acc2) ->
+            Acc3 = append_to_key(MetricId, slide_value(current, Value, Labels), Acc2),
+            append_to_key(MetricId, slide_value(last5m, Value, Labels), Acc3)
+        end,
+        Acc1,
+        SS
+    ).
+
+%% This function returns the data in the following format:
+%% ```
+%% #{emqx_ds_egress_batches =>
+%%       [{[{db,emqx_persistent_message},{shard,<<"1">>}],99408},
+%%        {[{db,emqx_persistent_message},{shard,<<"0">>}],99409}],
+%%   emqx_ds_egress_batches_retry =>
+%%       [{[{db,emqx_persistent_message},{shard,<<"1">>}],0},
+%%        {[{db,emqx_persistent_message},{shard,<<"0">>}],0}],
+%%   emqx_ds_egress_messages =>
+%%       ...
+%%  }
+%% '''
+%%
+%% If `NodeOrAggr' = `node' then node name is appended to the list of
+%% labels.
+prometheus_per_shard(NodeOrAggr) ->
+    lists:foldl(
+        fun(DB, Acc0) ->
+            lists:foldl(
+                fun(Shard, Acc) ->
+                    prometheus_per_shard(NodeOrAggr, DB, Shard, Acc)
+                end,
+                Acc0,
+                emqx_ds_replication_layer_meta:shards(DB)
+            )
+        end,
+        #{},
+        emqx_ds_builtin_db_sup:which_dbs()
+    ).
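+
+%% Usage sketch: per the label-building clauses above,
+%% `prometheus_collect(node)' omits the `{node, node()}' label, while
+%% any other argument adds it to every label set:
+%% ```
+%% emqx_ds_builtin_metrics:prometheus_collect(node)
+%% '''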
+
+prometheus_per_shard(NodeOrAggr, DB, Shard, Acc0) ->
+    Labels = [
+        {db, DB},
+        {shard, Shard}
+        | case NodeOrAggr of
+            node -> [];
+            _ -> [{node, node()}]
+        end
+    ],
+    #{counters := CC, slides := SS} = emqx_metrics_worker:get_metrics(
+        ?WORKER, shard_metric_id(DB, Shard)
+    ),
+    %% Collect counters:
+    Acc1 = maps:fold(
+        fun(MetricId, Value, Acc1) ->
+            append_to_key(MetricId, {Labels, Value}, Acc1)
+        end,
+        Acc0,
+        CC
+    ),
+    %% Collect slides:
+    maps:fold(
+        fun(MetricId, Value, Acc2) ->
+            Acc3 = append_to_key(MetricId, slide_value(current, Value, Labels), Acc2),
+            append_to_key(MetricId, slide_value(last5m, Value, Labels), Acc3)
+        end,
+        Acc1,
+        SS
+    ).
+
+-spec append_to_key(K, V, #{K => [V]}) -> #{K => [V]}.
+append_to_key(Key, Value, Map) ->
+    maps:update_with(
+        Key,
+        fun(L) ->
+            [Value | L]
+        end,
+        [Value],
+        Map
+    ).
+
+slide_value(Interval, Value, Labels0) ->
+    Labels = [{interval, Interval} | Labels0],
+    {Labels, maps:get(Interval, Value, 0)}.
+
+%%================================================================================
+%% Internal functions
+%%================================================================================
diff --git a/apps/emqx_durable_storage/src/emqx_ds_builtin_sup.erl b/apps/emqx_durable_storage/src/emqx_ds_builtin_sup.erl
index 50ed18de1..45e81bdc9 100644
--- a/apps/emqx_durable_storage/src/emqx_ds_builtin_sup.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds_builtin_sup.erl
@@ -81,6 +81,7 @@ stop_db(DB) ->
 %% Chidren are attached dynamically to this one.
 init(?top) ->
     %% Children:
+    MetricsWorker = emqx_ds_builtin_metrics:child_spec(),
     MetadataServer = #{
         id => metadata_server,
         start => {emqx_ds_replication_layer_meta, start_link, []},
@@ -102,7 +103,7 @@ init(?top) ->
         period => 1,
         auto_shutdown => never
     },
-    {ok, {SupFlags, [MetadataServer, DBsSup]}};
+    {ok, {SupFlags, [MetricsWorker, MetadataServer, DBsSup]}};
 init(?databases) ->
     %% Children are added dynamically:
     SupFlags = #{
diff --git a/apps/emqx_durable_storage/src/emqx_ds_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_lts.erl
index 6ebfc820d..bd7cb3826 100644
--- a/apps/emqx_durable_storage/src/emqx_ds_lts.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds_lts.erl
@@ -263,12 +263,14 @@ trie_insert(#trie{trie = Trie, stats = Stats, persist = Persist}, State, Token,
     end.
 
 -spec get_id_for_key(trie(), state(), edge()) -> static_key().
-get_id_for_key(#trie{static_key_size = Size}, _State, _Token) ->
+get_id_for_key(#trie{static_key_size = Size}, State, Token) when Size =< 32 ->
     %% Requirements for the return value:
     %%
     %% It should be globally unique for the `{State, Token}` pair. Other
     %% than that, there's no requirements. The return value doesn't even
     %% have to be deterministic, since the states are saved in the trie.
+    %% Yet, it helps a lot if it is, so that applying the same sequence
+    %% of topics to different tries will result in the same trie state.
     %%
     %% The generated value becomes the ID of the topic in the durable
     %% storage. Its size should be relatively small to reduce the
@@ -277,7 +279,7 @@ get_id_for_key(#trie{static_key_size = Size}, _State, _Token) ->
     %% If we want to impress computer science crowd, sorry, I mean to
     %% minimize storage requirements, we can even employ Huffman coding
     %% based on the frequency of messages.
-    <<Int:(Size * 8)>> = crypto:strong_rand_bytes(Size),
+    <<Int:(Size * 8), _/bytes>> = crypto:hash(sha256, term_to_binary([State | Token])),
     Int.
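+%% A sketch of the property gained here: since the static key is now a
+%% truncated hash of `[State | Token]', two independent tries that
+%% observe the same sequence of topics derive identical static keys:
+%% ```
+%% crypto:hash(sha256, term_to_binary([State | Token]))
+%% '''
+%% always yields the same prefix for the same `{State, Token}' pair,
+%% which would not hold for the random bytes used previously.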
%% erlfmt-ignore diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl index 72f142b8f..61126c164 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl @@ -43,7 +43,6 @@ -export([ %% RPC Targets: do_drop_db_v1/1, - do_store_batch_v1/4, do_get_streams_v1/4, do_get_streams_v2/4, do_make_iterator_v2/5, @@ -53,11 +52,11 @@ do_get_delete_streams_v4/4, do_make_delete_iterator_v4/5, do_delete_next_v4/5, - %% Unused: - do_drop_generation_v3/3, %% Obsolete: + do_store_batch_v1/4, do_make_iterator_v1/5, do_add_generation_v2/1, + do_drop_generation_v3/3, %% Egress API: ra_store_batch/3 @@ -65,7 +64,9 @@ -export([ init/1, - apply/3 + apply/3, + + snapshot_module/0 ]). -export_type([ @@ -80,6 +81,10 @@ batch/0 ]). +-export_type([ + ra_state/0 +]). + -include_lib("emqx_utils/include/emqx_message.hrl"). -include("emqx_ds_replication_layer.hrl"). @@ -133,6 +138,8 @@ -type message_id() :: emqx_ds:message_id(). +%% TODO: this type is obsolete and is kept only for compatibility with +%% BPAPIs. Remove it when emqx_ds_proto_v4 is gone (EMQX 5.6) -type batch() :: #{ ?tag := ?BATCH, ?batch_messages := [emqx_types:message()] @@ -140,6 +147,20 @@ -type generation_rank() :: {shard_id(), term()}. +%% Core state of the replication, i.e. the state of ra machine. +-type ra_state() :: #{ + db_shard := {emqx_ds:db(), shard_id()}, + latest := timestamp_us() +}. + +%% Command. Each command is an entry in the replication log. +-type ra_command() :: #{ + ?tag := ?BATCH | add_generation | update_config | drop_generation, + _ => _ +}. + +-type timestamp_us() :: non_neg_integer(). + %%================================================================================ %% API functions %%================================================================================ @@ -168,8 +189,7 @@ add_generation(DB) -> -spec update_db_config(emqx_ds:db(), builtin_db_opts()) -> ok | {error, _}. update_db_config(DB, CreateOpts) -> - ok = emqx_ds_replication_layer_meta:update_db_config(DB, CreateOpts), - Opts = emqx_ds_replication_layer_meta:get_options(DB), + Opts = #{} = emqx_ds_replication_layer_meta:update_db_config(DB, CreateOpts), foreach_shard( DB, fun(Shard) -> ok = ra_update_config(DB, Shard, Opts) end @@ -181,12 +201,19 @@ list_generations_with_lifetimes(DB) -> Shards = list_shards(DB), lists:foldl( fun(Shard, GensAcc) -> + case ra_list_generations_with_lifetimes(DB, Shard) of + Gens = #{} -> + ok; + {error, _Class, _Reason} -> + %% TODO: log error + Gens = #{} + end, maps:fold( fun(GenId, Data, AccInner) -> AccInner#{{Shard, GenId} => Data} end, GensAcc, - ra_list_generations_with_lifetimes(DB, Shard) + Gens ) end, #{}, @@ -221,14 +248,13 @@ get_streams(DB, TopicFilter, StartTime) -> Shards = list_shards(DB), lists:flatmap( fun(Shard) -> - Streams = - try - ra_get_streams(DB, Shard, TopicFilter, StartTime) - catch - error:{erpc, _} -> - %% TODO: log? - [] - end, + case ra_get_streams(DB, Shard, TopicFilter, StartTime) of + Streams when is_list(Streams) -> + ok; + {error, _Class, _Reason} -> + %% TODO: log error + Streams = [] + end, lists:map( fun({RankY, StorageLayerStream}) -> RankX = Shard, @@ -262,14 +288,11 @@ get_delete_streams(DB, TopicFilter, StartTime) -> emqx_ds:make_iterator_result(iterator()). 
 make_iterator(DB, Stream, TopicFilter, StartTime) ->
     ?stream_v2(Shard, StorageStream) = Stream,
-    try ra_make_iterator(DB, Shard, StorageStream, TopicFilter, StartTime) of
+    case ra_make_iterator(DB, Shard, StorageStream, TopicFilter, StartTime) of
         {ok, Iter} ->
             {ok, #{?tag => ?IT, ?shard => Shard, ?enc => Iter}};
         Error = {error, _, _} ->
             Error
-    catch
-        error:RPCError = {erpc, _} ->
-            {error, recoverable, RPCError}
     end.
 
 -spec make_delete_iterator(emqx_ds:db(), delete_stream(), emqx_ds:topic_filter(), emqx_ds:time()) ->
@@ -279,22 +302,19 @@ make_delete_iterator(DB, Stream, TopicFilter, StartTime) ->
     case ra_make_delete_iterator(DB, Shard, StorageStream, TopicFilter, StartTime) of
         {ok, Iter} ->
             {ok, #{?tag => ?DELETE_IT, ?shard => Shard, ?enc => Iter}};
-        Err = {error, _} ->
-            Err
+        Error = {error, _, _} ->
+            Error
     end.
 
 -spec update_iterator(emqx_ds:db(), iterator(), emqx_ds:message_key()) ->
     emqx_ds:make_iterator_result(iterator()).
 update_iterator(DB, OldIter, DSKey) ->
     #{?tag := ?IT, ?shard := Shard, ?enc := StorageIter} = OldIter,
-    try ra_update_iterator(DB, Shard, StorageIter, DSKey) of
+    case ra_update_iterator(DB, Shard, StorageIter, DSKey) of
         {ok, Iter} ->
             {ok, #{?tag => ?IT, ?shard => Shard, ?enc => Iter}};
         Error = {error, _, _} ->
             Error
-    catch
-        error:RPCError = {erpc, _} ->
-            {error, recoverable, RPCError}
     end.
 
 -spec next(emqx_ds:db(), iterator(), pos_integer()) -> emqx_ds:next_result(iterator()).
@@ -308,16 +328,16 @@ next(DB, Iter0, BatchSize) ->
     %%
     %% This kind of trickery should be probably done here in the
     %% replication layer. Or, perhaps, in the logic layer.
-    case ra_next(DB, Shard, StorageIter0, BatchSize) of
+    T0 = erlang:monotonic_time(microsecond),
+    Result = ra_next(DB, Shard, StorageIter0, BatchSize),
+    T1 = erlang:monotonic_time(microsecond),
+    emqx_ds_builtin_metrics:observe_next_time(DB, T1 - T0),
+    case Result of
         {ok, StorageIter, Batch} ->
             Iter = Iter0#{?enc := StorageIter},
             {ok, Iter, Batch};
-        Ok = {ok, _} ->
-            Ok;
-        Error = {error, _, _} ->
-            Error;
-        RPCError = {badrpc, _} ->
-            {error, recoverable, RPCError}
+        Other ->
+            Other
     end.
 
 -spec delete_next(emqx_ds:db(), delete_iterator(), emqx_ds:delete_selector(), pos_integer()) ->
@@ -354,6 +374,19 @@ foreach_shard(DB, Fun) ->
 %% Internal exports (RPC targets)
 %%================================================================================
 
+%% NOTE
+%% Target node may still be in the process of starting up when RPCs arrive; it's
+%% good to have them handled gracefully.
+%% TODO
+%% There's a possibility of a race condition: storage may shut down right after we
+%% ask for its status.
+-define(IF_STORAGE_RUNNING(SHARDID, EXPR),
+    case emqx_ds_storage_layer:shard_info(SHARDID, status) of
+        running -> EXPR;
+        down -> {error, recoverable, storage_down}
+    end
+).
+
 -spec do_drop_db_v1(emqx_ds:db()) -> ok | {error, _}.
 do_drop_db_v1(DB) ->
     MyShards = emqx_ds_replication_layer_meta:my_shards(DB),
@@ -371,10 +404,9 @@ do_drop_db_v1(DB) ->
     batch(),
     emqx_ds:message_store_opts()
 ) ->
-    emqx_ds:store_batch_result().
-do_store_batch_v1(DB, Shard, #{?tag := ?BATCH, ?batch_messages := Messages}, Options) ->
-    Batch = [{emqx_message:timestamp(Message), Message} || Message <- Messages],
-    emqx_ds_storage_layer:store_batch({DB, Shard}, Batch, Options).
+    no_return().
+do_store_batch_v1(_DB, _Shard, _Batch, _Options) ->
+    error(obsolete_api).
 
 %% Remove me in EMQX 5.6
 -dialyzer({nowarn_function, do_get_streams_v1/4}).
@@ -386,11 +418,18 @@ do_get_streams_v1(_DB, _Shard, _TopicFilter, _StartTime) ->
     error(obsolete_api).
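+%% The guarded RPC targets below all follow the same shape; a sketch of
+%% what `?IF_STORAGE_RUNNING' expands to for one of them:
+%% ```
+%% case emqx_ds_storage_layer:shard_info(ShardId, status) of
+%%     running -> emqx_ds_storage_layer:next(ShardId, Iter, BatchSize);
+%%     down -> {error, recoverable, storage_down}
+%% end
+%% '''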
-spec do_get_streams_v2( - emqx_ds:db(), emqx_ds_replication_layer:shard_id(), emqx_ds:topic_filter(), emqx_ds:time() + emqx_ds:db(), + emqx_ds_replication_layer:shard_id(), + emqx_ds:topic_filter(), + emqx_ds:time() ) -> - [{integer(), emqx_ds_storage_layer:stream()}]. + [{integer(), emqx_ds_storage_layer:stream()}] | emqx_ds:error(storage_down). do_get_streams_v2(DB, Shard, TopicFilter, StartTime) -> - emqx_ds_storage_layer:get_streams({DB, Shard}, TopicFilter, StartTime). + ShardId = {DB, Shard}, + ?IF_STORAGE_RUNNING( + ShardId, + emqx_ds_storage_layer:get_streams(ShardId, TopicFilter, StartTime) + ). -dialyzer({nowarn_function, do_make_iterator_v1/5}). -spec do_make_iterator_v1( @@ -413,7 +452,11 @@ do_make_iterator_v1(_DB, _Shard, _Stream, _TopicFilter, _StartTime) -> ) -> emqx_ds:make_iterator_result(emqx_ds_storage_layer:iterator()). do_make_iterator_v2(DB, Shard, Stream, TopicFilter, StartTime) -> - emqx_ds_storage_layer:make_iterator({DB, Shard}, Stream, TopicFilter, StartTime). + ShardId = {DB, Shard}, + ?IF_STORAGE_RUNNING( + ShardId, + emqx_ds_storage_layer:make_iterator(ShardId, Stream, TopicFilter, StartTime) + ). -spec do_make_delete_iterator_v4( emqx_ds:db(), @@ -434,9 +477,7 @@ do_make_delete_iterator_v4(DB, Shard, Stream, TopicFilter, StartTime) -> ) -> emqx_ds:make_iterator_result(emqx_ds_storage_layer:iterator()). do_update_iterator_v2(DB, Shard, OldIter, DSKey) -> - emqx_ds_storage_layer:update_iterator( - {DB, Shard}, OldIter, DSKey - ). + emqx_ds_storage_layer:update_iterator({DB, Shard}, OldIter, DSKey). -spec do_next_v1( emqx_ds:db(), @@ -446,7 +487,11 @@ do_update_iterator_v2(DB, Shard, OldIter, DSKey) -> ) -> emqx_ds:next_result(emqx_ds_storage_layer:iterator()). do_next_v1(DB, Shard, Iter, BatchSize) -> - emqx_ds_storage_layer:next({DB, Shard}, Iter, BatchSize). + ShardId = {DB, Shard}, + ?IF_STORAGE_RUNNING( + ShardId, + emqx_ds_storage_layer:next(ShardId, Iter, BatchSize) + ). -spec do_delete_next_v4( emqx_ds:db(), @@ -464,14 +509,19 @@ do_add_generation_v2(_DB) -> error(obsolete_api). -spec do_list_generations_with_lifetimes_v3(emqx_ds:db(), shard_id()) -> - #{emqx_ds:ds_specific_generation_rank() => emqx_ds:generation_info()}. -do_list_generations_with_lifetimes_v3(DB, ShardId) -> - emqx_ds_storage_layer:list_generations_with_lifetimes({DB, ShardId}). + #{emqx_ds:ds_specific_generation_rank() => emqx_ds:generation_info()} + | emqx_ds:error(storage_down). +do_list_generations_with_lifetimes_v3(DB, Shard) -> + ShardId = {DB, Shard}, + ?IF_STORAGE_RUNNING( + ShardId, + emqx_ds_storage_layer:list_generations_with_lifetimes(ShardId) + ). -spec do_drop_generation_v3(emqx_ds:db(), shard_id(), emqx_ds_storage_layer:gen_id()) -> - ok | {error, _}. -do_drop_generation_v3(DB, ShardId, GenId) -> - emqx_ds_storage_layer:drop_generation({DB, ShardId}, GenId). + no_return(). +do_drop_generation_v3(_DB, _ShardId, _GenId) -> + error(obsolete_api). -spec do_get_delete_streams_v4( emqx_ds:db(), emqx_ds_replication_layer:shard_id(), emqx_ds:topic_filter(), emqx_ds:time() @@ -491,6 +541,17 @@ list_nodes() -> %% Too large for normal operation, need better backpressure mechanism. -define(RA_TIMEOUT, 60 * 1000). +-define(SAFERPC(EXPR), + try + EXPR + catch + error:RPCError = {erpc, _} -> + {error, recoverable, RPCError} + end +). + +-spec ra_store_batch(emqx_ds:db(), emqx_ds_replication_layer:shard_id(), [emqx_types:message()]) -> + ok | {timeout, _} | {error, recoverable | unrecoverable, _Err} | _Err. 
ra_store_batch(DB, Shard, Messages) -> Command = #{ ?tag => ?BATCH, @@ -544,28 +605,34 @@ ra_drop_generation(DB, Shard, GenId) -> ra_get_streams(DB, Shard, TopicFilter, Time) -> {_, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), TimestampUs = timestamp_to_timeus(Time), - emqx_ds_proto_v4:get_streams(Node, DB, Shard, TopicFilter, TimestampUs). + ?SAFERPC(emqx_ds_proto_v4:get_streams(Node, DB, Shard, TopicFilter, TimestampUs)). ra_get_delete_streams(DB, Shard, TopicFilter, Time) -> {_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), - emqx_ds_proto_v4:get_delete_streams(Node, DB, Shard, TopicFilter, Time). + ?SAFERPC(emqx_ds_proto_v4:get_delete_streams(Node, DB, Shard, TopicFilter, Time)). ra_make_iterator(DB, Shard, Stream, TopicFilter, StartTime) -> {_, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), - TimestampUs = timestamp_to_timeus(StartTime), - emqx_ds_proto_v4:make_iterator(Node, DB, Shard, Stream, TopicFilter, TimestampUs). + TimeUs = timestamp_to_timeus(StartTime), + ?SAFERPC(emqx_ds_proto_v4:make_iterator(Node, DB, Shard, Stream, TopicFilter, TimeUs)). ra_make_delete_iterator(DB, Shard, Stream, TopicFilter, StartTime) -> {_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), - emqx_ds_proto_v4:make_delete_iterator(Node, DB, Shard, Stream, TopicFilter, StartTime). + TimeUs = timestamp_to_timeus(StartTime), + ?SAFERPC(emqx_ds_proto_v4:make_delete_iterator(Node, DB, Shard, Stream, TopicFilter, TimeUs)). ra_update_iterator(DB, Shard, Iter, DSKey) -> {_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), - emqx_ds_proto_v4:update_iterator(Node, DB, Shard, Iter, DSKey). + ?SAFERPC(emqx_ds_proto_v4:update_iterator(Node, DB, Shard, Iter, DSKey)). ra_next(DB, Shard, Iter, BatchSize) -> {_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), - emqx_ds_proto_v4:next(Node, DB, Shard, Iter, BatchSize). + case emqx_ds_proto_v4:next(Node, DB, Shard, Iter, BatchSize) of + RPCError = {badrpc, _} -> + {error, recoverable, RPCError}; + Other -> + Other + end. ra_delete_next(DB, Shard, Iter, Selector, BatchSize) -> {_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), @@ -573,25 +640,32 @@ ra_delete_next(DB, Shard, Iter, Selector, BatchSize) -> ra_list_generations_with_lifetimes(DB, Shard) -> {_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), - Gens = emqx_ds_proto_v4:list_generations_with_lifetimes(Node, DB, Shard), - maps:map( - fun(_GenId, Data = #{since := Since, until := Until}) -> - Data#{ - since := timeus_to_timestamp(Since), - until := emqx_maybe:apply(fun timeus_to_timestamp/1, Until) - } - end, - Gens - ). + case ?SAFERPC(emqx_ds_proto_v4:list_generations_with_lifetimes(Node, DB, Shard)) of + Gens = #{} -> + maps:map( + fun(_GenId, Data = #{since := Since, until := Until}) -> + Data#{ + since := timeus_to_timestamp(Since), + until := emqx_maybe:apply(fun timeus_to_timestamp/1, Until) + } + end, + Gens + ); + Error -> + Error + end. ra_drop_shard(DB, Shard) -> ra:delete_cluster(emqx_ds_replication_layer_shard:shard_servers(DB, Shard), ?RA_TIMEOUT). %% +-spec init(_Args :: map()) -> ra_state(). init(#{db := DB, shard := Shard}) -> #{db_shard => {DB, Shard}, latest => 0}. +-spec apply(ra_machine:command_meta_data(), ra_command(), ra_state()) -> + {ra_state(), _Reply, _Effects}. 
 apply(
     #{index := RaftIdx},
     #{
@@ -671,3 +745,6 @@ timestamp_to_timeus(TimestampMs) ->
 
 timeus_to_timestamp(TimestampUs) ->
     TimestampUs div 1000.
+
+snapshot_module() ->
+    emqx_ds_replication_snapshot.
diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_egress.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_egress.erl
index 128aeb380..9201ccf04 100644
--- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_egress.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_egress.erl
@@ -40,6 +40,7 @@
 
 -export_type([]).
 
+-include_lib("emqx_utils/include/emqx_message.hrl").
 -include_lib("snabbkaffe/include/trace.hrl").
 
 %%================================================================================
@@ -49,8 +50,13 @@
 -define(via(DB, Shard), {via, gproc, {n, l, {?MODULE, DB, Shard}}}).
 -define(flush, flush).
 
--record(enqueue_req, {message :: emqx_types:message(), sync :: boolean()}).
--record(enqueue_atomic_req, {batch :: [emqx_types:message()], sync :: boolean()}).
+-record(enqueue_req, {
+    messages :: [emqx_types:message()],
+    sync :: boolean(),
+    atomic :: boolean(),
+    n_messages :: non_neg_integer(),
+    payload_bytes :: non_neg_integer()
+}).
 
 %%================================================================================
 %% API functions
@@ -61,44 +67,32 @@ start_link(DB, Shard) ->
     gen_server:start_link(?via(DB, Shard), ?MODULE, [DB, Shard], []).
 
 -spec store_batch(emqx_ds:db(), [emqx_types:message()], emqx_ds:message_store_opts()) ->
-    ok.
+    emqx_ds:store_batch_result().
 store_batch(DB, Messages, Opts) ->
     Sync = maps:get(sync, Opts, true),
-    case maps:get(atomic, Opts, false) of
-        false ->
-            lists:foreach(
-                fun(Message) ->
-                    Shard = emqx_ds_replication_layer:shard_of_message(DB, Message, clientid),
-                    gen_server:call(
-                        ?via(DB, Shard),
-                        #enqueue_req{
-                            message = Message,
-                            sync = Sync
-                        },
-                        infinity
-                    )
-                end,
-                Messages
+    Atomic = maps:get(atomic, Opts, false),
+    %% Usually we expect all messages in the batch to go into a
+    %% single shard, so this function is optimized for the happy case.
+    case shards_of_batch(DB, Messages) of
+        [{Shard, {NMsgs, NBytes}}] ->
+            %% Happy case:
+            enqueue_call_or_cast(
+                ?via(DB, Shard),
+                #enqueue_req{
+                    messages = Messages,
+                    sync = Sync,
+                    atomic = Atomic,
+                    n_messages = NMsgs,
+                    payload_bytes = NBytes
+                }
            );
-        true ->
-            maps:foreach(
-                fun(Shard, Batch) ->
-                    gen_server:call(
-                        ?via(DB, Shard),
-                        #enqueue_atomic_req{
-                            batch = Batch,
-                            sync = Sync
-                        },
-                        infinity
-                    )
-                end,
-                maps:groups_from_list(
-                    fun(Message) ->
-                        emqx_ds_replication_layer:shard_of_message(DB, Message, clientid)
-                    end,
-                    Messages
-                )
-            )
+        [_, _ | _] when Atomic ->
+            %% It's impossible to commit a batch to multiple shards
+            %% atomically
+            {error, unrecoverable, atomic_commit_to_multiple_shards};
+        _Shards ->
+            %% Use a slower implementation for the unlikely case:
+            repackage_messages(DB, Messages, Sync)
     end.
 
 %%================================================================================
@@ -108,35 +102,65 @@ store_batch(DB, Messages, Opts) ->
 -record(s, {
     db :: emqx_ds:db(),
     shard :: emqx_ds_replication_layer:shard_id(),
+    metrics_id :: emqx_ds_builtin_metrics:shard_metrics_id(),
+    n_retries = 0 :: non_neg_integer(),
+    %% FIXME: Currently max_retries is always 0, because replication
+    %% layer doesn't guarantee idempotency. Retrying would create
+    %% duplicate messages.
+    max_retries = 0 :: non_neg_integer(),
     n = 0 :: non_neg_integer(),
-    tref :: reference(),
-    batch = [] :: [emqx_types:message()],
+    n_bytes = 0 :: non_neg_integer(),
+    tref :: undefined | reference(),
+    queue :: queue:queue(emqx_types:message()),
     pending_replies = [] :: [gen_server:from()]
 }).
 
 init([DB, Shard]) ->
     process_flag(trap_exit, true),
     process_flag(message_queue_data, off_heap),
+    logger:update_process_metadata(#{domain => [emqx, ds, egress, DB]}),
+    MetricsId = emqx_ds_builtin_metrics:shard_metric_id(DB, Shard),
+    ok = emqx_ds_builtin_metrics:init_for_shard(MetricsId),
     S = #s{
         db = DB,
         shard = Shard,
-        tref = start_timer()
+        metrics_id = MetricsId,
+        queue = queue:new()
     },
     {ok, S}.
 
-handle_call(#enqueue_req{message = Msg, sync = Sync}, From, S) ->
-    do_enqueue(From, Sync, Msg, S);
-handle_call(#enqueue_atomic_req{batch = Batch, sync = Sync}, From, S) ->
-    Len = length(Batch),
-    do_enqueue(From, Sync, {atomic, Len, Batch}, S);
+handle_call(
+    #enqueue_req{
+        messages = Msgs,
+        sync = Sync,
+        atomic = Atomic,
+        n_messages = NMsgs,
+        payload_bytes = NBytes
+    },
+    From,
+    S0 = #s{pending_replies = Replies0}
+) ->
+    S = S0#s{pending_replies = [From | Replies0]},
+    {noreply, enqueue(Sync, Atomic, Msgs, NMsgs, NBytes, S)};
 handle_call(_Call, _From, S) ->
     {reply, {error, unknown_call}, S}.
 
+handle_cast(
+    #enqueue_req{
+        messages = Msgs,
+        sync = Sync,
+        atomic = Atomic,
+        n_messages = NMsgs,
+        payload_bytes = NBytes
+    },
+    S
+) ->
+    {noreply, enqueue(Sync, Atomic, Msgs, NMsgs, NBytes, S)};
 handle_cast(_Cast, S) ->
     {noreply, S}.
 
 handle_info(?flush, S) ->
-    {noreply, do_flush(S)};
+    {noreply, flush(S)};
 handle_info(_Info, S) ->
     {noreply, S}.
 
@@ -151,80 +175,215 @@ terminate(_Reason, _S) ->
 %% Internal functions
 %%================================================================================
 
+enqueue(
+    Sync,
+    Atomic,
+    Msgs,
+    BatchSize,
+    BatchBytes,
+    S0 = #s{n = NMsgs0, n_bytes = NBytes0, queue = Q0}
+) ->
+    %% At this point we don't split the batches, even when they aren't
+    %% atomic. It wouldn't win us anything in terms of memory, and
+    %% EMQX currently feeds data to DS in very small batches, so
+    %% granularity should be fine enough.
+    NMax = application:get_env(emqx_durable_storage, egress_batch_size, 1000),
+    NBytesMax = application:get_env(emqx_durable_storage, egress_batch_bytes, infinity),
+    NMsgs = NMsgs0 + BatchSize,
+    NBytes = NBytes0 + BatchBytes,
+    case (NMsgs >= NMax orelse NBytes >= NBytesMax) andalso (NMsgs0 > 0) of
+        true ->
+            %% Adding this batch would cause the buffer to overflow. Flush
+            %% it now, and retry:
+            S1 = flush(S0),
+            enqueue(Sync, Atomic, Msgs, BatchSize, BatchBytes, S1);
+        false ->
+            %% Either the buffer is empty or the new batch fits into it;
+            %% enqueue the batch in its entirety:
+            Q1 = lists:foldl(fun queue:in/2, Q0, Msgs),
+            S1 = S0#s{n = NMsgs, n_bytes = NBytes, queue = Q1},
+            case NMsgs >= NMax orelse NBytes >= NBytesMax of
+                true ->
+                    flush(S1);
+                false ->
+                    ensure_timer(S1)
+            end
+    end.
+
 -define(COOLDOWN_MIN, 1000).
 -define(COOLDOWN_MAX, 5000).
 
-do_flush(S = #s{batch = []}) ->
-    S#s{tref = start_timer()};
+flush(S) ->
+    do_flush(cancel_timer(S)).
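+%% Buffer limits and the flush interval come from the application
+%% environment; a configuration sketch (the values shown are the
+%% defaults used by the `application:get_env/3' calls above and below):
+%% ```
+%% {emqx_durable_storage, [
+%%     {egress_batch_size, 1000},
+%%     {egress_batch_bytes, infinity},
+%%     {egress_flush_interval, 100}
+%% ]}
+%% '''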
+
+do_flush(S0 = #s{n = 0}) ->
+    S0;
 do_flush(
-    S = #s{batch = Messages, pending_replies = Replies, db = DB, shard = Shard}
+    S = #s{
+        queue = Q,
+        pending_replies = Replies,
+        db = DB,
+        shard = Shard,
+        metrics_id = Metrics,
+        n_retries = Retries,
+        max_retries = MaxRetries
+    }
 ) ->
-    case emqx_ds_replication_layer:ra_store_batch(DB, Shard, lists:reverse(Messages)) of
+    Messages = queue:to_list(Q),
+    T0 = erlang:monotonic_time(microsecond),
+    Result = emqx_ds_replication_layer:ra_store_batch(DB, Shard, Messages),
+    T1 = erlang:monotonic_time(microsecond),
+    emqx_ds_builtin_metrics:observe_egress_flush_time(Metrics, T1 - T0),
+    case Result of
         ok ->
+            emqx_ds_builtin_metrics:inc_egress_batches(Metrics),
+            emqx_ds_builtin_metrics:inc_egress_messages(Metrics, S#s.n),
+            emqx_ds_builtin_metrics:inc_egress_bytes(Metrics, S#s.n_bytes),
             ?tp(
                 emqx_ds_replication_layer_egress_flush,
                 #{db => DB, shard => Shard, batch => Messages}
             ),
             lists:foreach(fun(From) -> gen_server:reply(From, ok) end, Replies),
-            true = erlang:garbage_collect(),
-            ok;
-        Error ->
-            true = erlang:garbage_collect(),
+            erlang:garbage_collect(),
+            S#s{
+                n = 0,
+                n_bytes = 0,
+                queue = queue:new(),
+                pending_replies = []
+            };
+        {timeout, ServerId} when Retries < MaxRetries ->
+            %% Note: this is a hot loop, so we report error messages
+            %% with `debug' level to avoid flooding the logs. Instead,
+            %% error detection must rely on the metrics. Debug
+            %% logging can be enabled for the particular egress server
+            %% via logger domain.
             ?tp(
-                warning,
-                emqx_ds_replication_layer_egress_flush_failed,
-                #{db => DB, shard => Shard, reason => Error}
+                debug,
+                emqx_ds_replication_layer_egress_flush_retry,
+                #{db => DB, shard => Shard, reason => timeout, server_id => ServerId}
             ),
-            Cooldown = ?COOLDOWN_MIN + rand:uniform(?COOLDOWN_MAX - ?COOLDOWN_MIN),
-            ok = timer:sleep(Cooldown),
-            %% Since we drop the entire batch here, we at least reply callers with an
-            %% error so they don't hang indefinitely in the `gen_server' call with
-            %% `infinity' timeout.
-            lists:foreach(fun(From) -> gen_server:reply(From, {error, Error}) end, Replies)
-    end,
-    S#s{
-        n = 0,
-        batch = [],
-        pending_replies = [],
-        tref = start_timer()
-    }.
+            %% Retry sending the batch:
+            emqx_ds_builtin_metrics:inc_egress_batches_retry(Metrics),
+            erlang:garbage_collect(),
+            %% We block the gen_server until the next retry.
+            BlockTime = ?COOLDOWN_MIN + rand:uniform(?COOLDOWN_MAX - ?COOLDOWN_MIN),
+            timer:sleep(BlockTime),
+            S#s{n_retries = Retries + 1};
+        Err ->
+            ?tp(
+                debug,
+                emqx_ds_replication_layer_egress_flush_failed,
+                #{db => DB, shard => Shard, error => Err}
+            ),
+            emqx_ds_builtin_metrics:inc_egress_batches_failed(Metrics),
+            Reply =
+                case Err of
+                    {error, _, _} -> Err;
+                    {timeout, ServerId} -> {error, recoverable, {timeout, ServerId}};
+                    _ -> {error, unrecoverable, Err}
+                end,
+            lists:foreach(
+                fun(From) -> gen_server:reply(From, Reply) end, Replies
+            ),
+            erlang:garbage_collect(),
+            S#s{
+                n = 0,
+                n_bytes = 0,
+                queue = queue:new(),
+                pending_replies = [],
+                n_retries = 0
+            }
+    end.
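+%% Callers of `store_batch/3' observe these outcomes as `ok' on success
+%% and `{error, recoverable | unrecoverable, _}' otherwise; when a batch
+%% spans several shards, the per-shard results are merged by
+%% `compose_errors/2' below, e.g. (a sketch):
+%% ```
+%% compose_errors(ok, {error, recoverable, timeout})
+%% %% => {error, recoverable, timeout}
+%% '''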
-do_enqueue(From, Sync, MsgOrBatch, S0 = #s{n = N, batch = Batch, pending_replies = Replies}) ->
-    NMax = application:get_env(emqx_durable_storage, egress_batch_size, 1000),
-    S1 =
-        case MsgOrBatch of
-            {atomic, NumMsgs, Msgs} ->
-                S0#s{n = N + NumMsgs, batch = Msgs ++ Batch};
-            Msg ->
-                S0#s{n = N + 1, batch = [Msg | Batch]}
-        end,
-    %% TODO: later we may want to delay the reply until the message is
-    %% replicated, but it requies changes to the PUBACK/PUBREC flow to
-    %% allow for async replies. For now, we ack when the message is
-    %% _buffered_ rather than stored.
-    %%
-    %% Otherwise, the client would freeze for at least flush interval,
-    %% or until the buffer is filled.
-    S2 =
-        case Sync of
-            true ->
-                S1#s{pending_replies = [From | Replies]};
-            false ->
-                gen_server:reply(From, ok),
-                S1
-        end,
-    S =
-        case N >= NMax of
-            true ->
-                _ = erlang:cancel_timer(S2#s.tref),
-                do_flush(S2);
-            false ->
-                S2
-        end,
-    %% TODO: add a backpressure mechanism for the server to avoid
-    %% building a long message queue.
-    {noreply, S}.
+-spec shards_of_batch(emqx_ds:db(), [emqx_types:message()]) ->
+    [{emqx_ds_replication_layer:shard_id(), {NMessages, NBytes}}]
+when
+    NMessages :: non_neg_integer(),
+    NBytes :: non_neg_integer().
+shards_of_batch(DB, Messages) ->
+    maps:to_list(
+        lists:foldl(
+            fun(Message, Acc) ->
+                %% TODO: sharding strategy must be part of the DS DB schema:
+                Shard = emqx_ds_replication_layer:shard_of_message(DB, Message, clientid),
+                Size = payload_size(Message),
+                maps:update_with(
+                    Shard,
+                    fun({N, S}) ->
+                        {N + 1, S + Size}
+                    end,
+                    {1, Size},
+                    Acc
+                )
+            end,
+            #{},
+            Messages
+        )
+    ).
 
-start_timer() ->
+repackage_messages(DB, Messages, Sync) ->
+    Batches = lists:foldl(
+        fun(Message, Acc) ->
+            Shard = emqx_ds_replication_layer:shard_of_message(DB, Message, clientid),
+            Size = payload_size(Message),
+            maps:update_with(
+                Shard,
+                fun({N, S, Msgs}) ->
+                    {N + 1, S + Size, [Message | Msgs]}
+                end,
+                {1, Size, [Message]},
+                Acc
+            )
+        end,
+        #{},
+        Messages
+    ),
+    maps:fold(
+        fun(Shard, {NMsgs, ByteSize, RevMessages}, ErrAcc) ->
+            Err = enqueue_call_or_cast(
+                ?via(DB, Shard),
+                #enqueue_req{
+                    messages = lists:reverse(RevMessages),
+                    sync = Sync,
+                    atomic = false,
+                    n_messages = NMsgs,
+                    payload_bytes = ByteSize
+                }
+            ),
+            compose_errors(ErrAcc, Err)
+        end,
+        ok,
+        Batches
+    ).
+
+enqueue_call_or_cast(To, Req = #enqueue_req{sync = true}) ->
+    gen_server:call(To, Req, infinity);
+enqueue_call_or_cast(To, Req = #enqueue_req{sync = false}) ->
+    gen_server:cast(To, Req).
+
+compose_errors(ErrAcc, ok) ->
+    ErrAcc;
+compose_errors(ok, Err) ->
+    Err;
+compose_errors({error, recoverable, _}, {error, unrecoverable, Err}) ->
+    {error, unrecoverable, Err};
+compose_errors(ErrAcc, _Err) ->
+    ErrAcc.
+
+ensure_timer(S = #s{tref = undefined}) ->
     Interval = application:get_env(emqx_durable_storage, egress_flush_interval, 100),
-    erlang:send_after(Interval, self(), ?flush).
+    Tref = erlang:send_after(Interval, self(), ?flush),
+    S#s{tref = Tref};
+ensure_timer(S) ->
+    S.
+
+cancel_timer(S = #s{tref = undefined}) ->
+    S;
+cancel_timer(S = #s{tref = TRef}) ->
+    _ = erlang:cancel_timer(TRef),
+    S#s{tref = undefined}.
+
+%% @doc Return the approximate size of the MQTT message (it doesn't take
+%% everything into account, e.g. headers and extras)
+payload_size(#message{payload = P, topic = T}) ->
+    size(P) + size(T).
diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_meta.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_meta.erl index f84863c03..97d4e7412 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_meta.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_meta.erl @@ -29,32 +29,62 @@ -export([ shards/1, my_shards/1, - allocate_shards/2, + shard_info/2, + allocate_shards/1, replica_set/2, sites/0, node/1, - open_db/2, - get_options/1, - update_db_config/2, - drop_db/1, this_site/0, print_status/0 ]). +%% DB API: +-export([ + open_db/2, + db_config/1, + update_db_config/2, + drop_db/1 +]). + +%% Site / shard allocation: +-export([ + join_db_site/2, + leave_db_site/2, + assign_db_sites/2, + replica_set_transitions/2, + update_replica_set/3, + db_sites/1, + target_set/2 +]). + +%% Subscriptions to changes: +-export([ + subscribe/2, + unsubscribe/1 +]). + %% gen_server -export([start_link/0, init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]). %% internal exports: -export([ open_db_trans/2, - allocate_shards_trans/2, + allocate_shards_trans/1, + assign_db_sites_trans/2, + modify_db_sites_trans/2, + update_replica_set_trans/3, update_db_config_trans/2, drop_db_trans/1, claim_site/2, n_shards/1 ]). --export_type([site/0]). +-export_type([ + site/0, + transition/0, + subscription_event/0, + update_cluster_result/0 +]). -include_lib("stdlib/include/qlc.hrl"). -include_lib("stdlib/include/ms_transform.hrl"). @@ -86,15 +116,34 @@ -record(?SHARD_TAB, { shard :: {emqx_ds:db(), emqx_ds_replication_layer:shard_id()}, + %% Sites that currently contain the data: + replica_set :: [site()], %% Sites that should contain the data when the cluster is in the %% stable state (no nodes are being added or removed from it): - replica_set :: [site()], + target_set :: [site()] | undefined, misc = #{} :: map() }). %% Persistent ID of the node (independent from the IP/FQDN): -type site() :: binary(). +%% Membership transition of shard's replica set: +-type transition() :: {add | del, site()}. + +-type update_cluster_result() :: + ok + | {error, {nonexistent_db, emqx_ds:db()}} + | {error, {nonexistent_sites, [site()]}} + | {error, {too_few_sites, [site()]}} + | {error, _}. + +%% Subject of the subscription: +-type subject() :: emqx_ds:db(). + +%% Event for the subscription: +-type subscription_event() :: + {changed, {shard, emqx_ds:db(), emqx_ds_replication_layer:shard_id()}}. + %% Peristent term key: -define(emqx_ds_builtin_site, emqx_ds_builtin_site). @@ -156,17 +205,36 @@ start_link() -> -spec shards(emqx_ds:db()) -> [emqx_ds_replication_layer:shard_id()]. shards(DB) -> - filter_shards(DB). + Recs = mnesia:dirty_match_object(?SHARD_TAB, ?SHARD_PAT({DB, '_'})), + [Shard || #?SHARD_TAB{shard = {_, Shard}} <- Recs]. + +-spec shard_info(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) -> + #{replica_set := #{site() => #{status => up | joining}}} + | undefined. +shard_info(DB, Shard) -> + case mnesia:dirty_read(?SHARD_TAB, {DB, Shard}) of + [] -> + undefined; + [#?SHARD_TAB{replica_set = Replicas}] -> + ReplicaSet = maps:from_list([ + begin + %% TODO: + ReplInfo = #{status => up}, + {I, ReplInfo} + end + || I <- Replicas + ]), + #{replica_set => ReplicaSet} + end. -spec my_shards(emqx_ds:db()) -> [emqx_ds_replication_layer:shard_id()]. my_shards(DB) -> Site = this_site(), - filter_shards(DB, fun(#?SHARD_TAB{replica_set = ReplicaSet}) -> - lists:member(Site, ReplicaSet) - end). 
+ Recs = mnesia:dirty_match_object(?SHARD_TAB, ?SHARD_PAT({DB, '_'})), + [Shard || #?SHARD_TAB{shard = {_, Shard}, replica_set = RS} <- Recs, lists:member(Site, RS)]. -allocate_shards(DB, Opts) -> - case mria:transaction(?SHARD, fun ?MODULE:allocate_shards_trans/2, [DB, Opts]) of +allocate_shards(DB) -> + case mria:transaction(?SHARD, fun ?MODULE:allocate_shards_trans/1, [DB]) of {atomic, Shards} -> {ok, Shards}; {aborted, {shards_already_allocated, Shards}} -> @@ -175,16 +243,6 @@ allocate_shards(DB, Opts) -> {error, #{reason => insufficient_sites_online, needed => Needed, sites => Sites}} end. --spec replica_set(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) -> - {ok, [site()]} | {error, _}. -replica_set(DB, Shard) -> - case mnesia:dirty_read(?SHARD_TAB, {DB, Shard}) of - [#?SHARD_TAB{replica_set = ReplicaSet}] -> - {ok, ReplicaSet}; - [] -> - {error, no_shard} - end. - -spec sites() -> [site()]. sites() -> eval_qlc(qlc:q([Site || #?NODE_TAB{site = Site} <- mnesia:table(?NODE_TAB)])). @@ -198,8 +256,12 @@ node(Site) -> undefined end. --spec get_options(emqx_ds:db()) -> emqx_ds_replication_layer:builtin_db_opts(). -get_options(DB) -> +%%=============================================================================== +%% DB API +%%=============================================================================== + +-spec db_config(emqx_ds:db()) -> emqx_ds_replication_layer:builtin_db_opts(). +db_config(DB) -> case mnesia:dirty_read(?META_TAB, DB) of [#?META_TAB{db_props = Opts}] -> Opts; @@ -210,27 +272,104 @@ get_options(DB) -> -spec open_db(emqx_ds:db(), emqx_ds_replication_layer:builtin_db_opts()) -> emqx_ds_replication_layer:builtin_db_opts(). open_db(DB, DefaultOpts) -> - {atomic, Opts} = mria:transaction(?SHARD, fun ?MODULE:open_db_trans/2, [DB, DefaultOpts]), - Opts. + transaction(fun ?MODULE:open_db_trans/2, [DB, DefaultOpts]). -spec update_db_config(emqx_ds:db(), emqx_ds_replication_layer:builtin_db_opts()) -> - ok | {error, _}. + emqx_ds_replication_layer:builtin_db_opts() | {error, nonexistent_db}. update_db_config(DB, DefaultOpts) -> - {atomic, Opts} = mria:transaction(?SHARD, fun ?MODULE:update_db_config_trans/2, [ - DB, DefaultOpts - ]), - Opts. + transaction(fun ?MODULE:update_db_config_trans/2, [DB, DefaultOpts]). -spec drop_db(emqx_ds:db()) -> ok. drop_db(DB) -> - _ = mria:transaction(?SHARD, fun ?MODULE:drop_db_trans/1, [DB]), - ok. + transaction(fun ?MODULE:drop_db_trans/1, [DB]). + +%%=============================================================================== +%% Site / shard allocation API +%%=============================================================================== + +%% @doc Join a site to the set of sites the DB is replicated across. +-spec join_db_site(emqx_ds:db(), site()) -> update_cluster_result(). +join_db_site(DB, Site) -> + transaction(fun ?MODULE:modify_db_sites_trans/2, [DB, [{add, Site}]]). + +%% @doc Make a site leave the set of sites the DB is replicated across. +-spec leave_db_site(emqx_ds:db(), site()) -> update_cluster_result(). +leave_db_site(DB, Site) -> + transaction(fun ?MODULE:modify_db_sites_trans/2, [DB, [{del, Site}]]). + +%% @doc Assign a set of sites to the DB for replication. +-spec assign_db_sites(emqx_ds:db(), [site()]) -> update_cluster_result(). +assign_db_sites(DB, Sites) -> + transaction(fun ?MODULE:assign_db_sites_trans/2, [DB, Sites]). + +%% @doc List the sites the DB is replicated across. +-spec db_sites(emqx_ds:db()) -> [site()]. 
+db_sites(DB) -> + Recs = mnesia:dirty_match_object(?SHARD_TAB, ?SHARD_PAT({DB, '_'})), + list_db_sites(Recs). + +%% @doc List the sequence of transitions that should be conducted in order to +%% bring the set of replicas for a DB shard in line with the target set. +-spec replica_set_transitions(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) -> + [transition()] | undefined. +replica_set_transitions(DB, Shard) -> + case mnesia:dirty_read(?SHARD_TAB, {DB, Shard}) of + [#?SHARD_TAB{target_set = TargetSet, replica_set = ReplicaSet}] -> + compute_transitions(TargetSet, ReplicaSet); + [] -> + undefined + end. + +%% @doc Update the set of replication sites for a shard. +%% To be called after a `transition()` has been conducted successfully. +-spec update_replica_set(emqx_ds:db(), emqx_ds_replication_layer:shard_id(), transition()) -> ok. +update_replica_set(DB, Shard, Trans) -> + case mria:transaction(?SHARD, fun ?MODULE:update_replica_set_trans/3, [DB, Shard, Trans]) of + {atomic, ok} -> + ok; + {aborted, Reason} -> + {error, Reason} + end. + +%% @doc Get the current set of replication sites for a shard. +-spec replica_set(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) -> + [site()] | undefined. +replica_set(DB, Shard) -> + case mnesia:dirty_read(?SHARD_TAB, {DB, Shard}) of + [#?SHARD_TAB{replica_set = ReplicaSet}] -> + ReplicaSet; + [] -> + undefined + end. + +%% @doc Get the target set of replication sites for a DB shard. +%% Target set is updated every time the set of replication sites for the DB changes. +%% See `join_db_site/2`, `leave_db_site/2`, `assign_db_sites/2`. +-spec target_set(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) -> + [site()] | undefined. +target_set(DB, Shard) -> + case mnesia:dirty_read(?SHARD_TAB, {DB, Shard}) of + [#?SHARD_TAB{target_set = TargetSet}] -> + TargetSet; + [] -> + undefined + end. + +%%================================================================================ + +subscribe(Pid, Subject) -> + gen_server:call(?SERVER, {subscribe, Pid, Subject}, infinity). + +unsubscribe(Pid) -> + gen_server:call(?SERVER, {unsubscribe, Pid}, infinity). %%================================================================================ %% behavior callbacks %%================================================================================ --record(s, {}). +-record(s, { + subs = #{} :: #{pid() => {subject(), _Monitor :: reference()}} +}). init([]) -> process_flag(trap_exit, true), @@ -238,14 +377,24 @@ init([]) -> ensure_tables(), ensure_site(), S = #s{}, + {ok, _Node} = mnesia:subscribe({table, ?SHARD_TAB, simple}), {ok, S}. +handle_call({subscribe, Pid, Subject}, _From, S) -> + {reply, ok, handle_subscribe(Pid, Subject, S)}; +handle_call({unsubscribe, Pid}, _From, S) -> + {reply, ok, handle_unsubscribe(Pid, S)}; handle_call(_Call, _From, S) -> {reply, {error, unknown_call}, S}. handle_cast(_Cast, S) -> {noreply, S}. +handle_info({mnesia_table_event, {write, #?SHARD_TAB{shard = {DB, Shard}}, _}}, S) -> + ok = notify_subscribers(DB, {shard, DB, Shard}, S), + {noreply, S}; +handle_info({'DOWN', _MRef, process, Pid, _Reason}, S) -> + {noreply, handle_unsubscribe(Pid, S)}; handle_info(_Info, S) -> {noreply, S}. @@ -268,19 +417,15 @@ open_db_trans(DB, CreateOpts) -> Opts end. --spec allocate_shards_trans(emqx_ds:db(), emqx_ds_replication_layer:builtin_db_opts()) -> [_Shard]. 
-allocate_shards_trans(DB, Opts) -> - NShards = maps:get(n_shards, Opts), - NSites = maps:get(n_sites, Opts), - ReplicationFactor = maps:get(replication_factor, Opts), - NReplicas = min(NSites, ReplicationFactor), - Shards = [integer_to_binary(I) || I <- lists:seq(0, NShards - 1)], - AllSites = mnesia:match_object(?NODE_TAB, ?NODE_PAT(), read), - case length(AllSites) of +-spec allocate_shards_trans(emqx_ds:db()) -> [emqx_ds_replication_layer:shard_id()]. +allocate_shards_trans(DB) -> + Opts = #{n_shards := NShards, n_sites := NSites} = db_config_trans(DB), + Nodes = mnesia:match_object(?NODE_TAB, ?NODE_PAT(), read), + case length(Nodes) of N when N >= NSites -> ok; _ -> - mnesia:abort({insufficient_sites_online, NSites, AllSites}) + mnesia:abort({insufficient_sites_online, NSites, Nodes}) end, case mnesia:match_object(?SHARD_TAB, ?SHARD_PAT({DB, '_'}), write) of [] -> @@ -289,18 +434,11 @@ allocate_shards_trans(DB, Opts) -> ShardsAllocated = [Shard || #?SHARD_TAB{shard = {_DB, Shard}} <- Records], mnesia:abort({shards_already_allocated, ShardsAllocated}) end, - {Allocation, _} = lists:mapfoldl( - fun(Shard, SSites) -> - {Sites, _} = emqx_utils_stream:consume(NReplicas, SSites), - {_, SRest} = emqx_utils_stream:consume(1, SSites), - {{Shard, Sites}, SRest} - end, - emqx_utils_stream:repeat(emqx_utils_stream:list(AllSites)), - Shards - ), + Shards = gen_shards(NShards), + Sites = [S || #?NODE_TAB{site = S} <- Nodes], + Allocation = compute_allocation(Shards, Sites, Opts), lists:map( - fun({Shard, Sites}) -> - ReplicaSet = [Site || #?NODE_TAB{site = Site} <- Sites], + fun({Shard, ReplicaSet}) -> Record = #?SHARD_TAB{ shard = {DB, Shard}, replica_set = ReplicaSet @@ -311,29 +449,86 @@ allocate_shards_trans(DB, Opts) -> Allocation ). --spec update_db_config_trans(emqx_ds:db(), emqx_ds_replication_layer:builtin_db_opts()) -> - ok | {error, database}. -update_db_config_trans(DB, CreateOpts) -> - case mnesia:wread({?META_TAB, DB}) of - [#?META_TAB{db_props = Opts}] -> - %% Since this is an update and not a reopen, - %% we should keep the shard number and replication factor - %% and not create a new shard server - #{ - n_shards := NShards, - replication_factor := ReplicationFactor - } = Opts, - - mnesia:write(#?META_TAB{ - db = DB, - db_props = CreateOpts#{ - n_shards := NShards, - replication_factor := ReplicationFactor - } - }), - ok; +-spec assign_db_sites_trans(emqx_ds:db(), [site()]) -> ok. +assign_db_sites_trans(DB, Sites) -> + Opts = db_config_trans(DB), + case [S || S <- Sites, mnesia:read(?NODE_TAB, S, read) == []] of + [] when length(Sites) == 0 -> + mnesia:abort({too_few_sites, Sites}); [] -> - {error, no_database} + ok; + NonexistentSites -> + mnesia:abort({nonexistent_sites, NonexistentSites}) + end, + %% TODO + %% Optimize reallocation. The goals are: + %% 1. Minimize the number of membership transitions. + %% 2. Ensure that sites are responsible for roughly the same number of shards. + Shards = mnesia:match_object(?SHARD_TAB, ?SHARD_PAT({DB, '_'}), write), + Reallocation = compute_allocation(Shards, Sites, Opts), + lists:foreach( + fun({Record, ReplicaSet}) -> + ok = mnesia:write(Record#?SHARD_TAB{target_set = ReplicaSet}) + end, + Reallocation + ). + +-spec modify_db_sites_trans(emqx_ds:db(), [transition()]) -> ok. 
+modify_db_sites_trans(DB, Modifications) -> + Shards = mnesia:match_object(?SHARD_TAB, ?SHARD_PAT({DB, '_'}), write), + Sites0 = list_db_target_sites(Shards), + Sites = lists:foldl(fun apply_transition/2, Sites0, Modifications), + case Sites of + Sites0 -> + ok; + _Changed -> + assign_db_sites_trans(DB, Sites) + end. + +update_replica_set_trans(DB, Shard, Trans) -> + case mnesia:read(?SHARD_TAB, {DB, Shard}, write) of + [Record = #?SHARD_TAB{replica_set = ReplicaSet0, target_set = TargetSet0}] -> + %% NOTE + %% It's possible to complete a transition that's no longer planned. We + %% should anticipate that we may stray _away_ from the target set. + TargetSet1 = emqx_maybe:define(TargetSet0, ReplicaSet0), + ReplicaSet = apply_transition(Trans, ReplicaSet0), + case lists:usort(TargetSet1) of + ReplicaSet -> + TargetSet = undefined; + TS -> + TargetSet = TS + end, + mnesia:write(Record#?SHARD_TAB{replica_set = ReplicaSet, target_set = TargetSet}); + [] -> + mnesia:abort({nonexistent_shard, {DB, Shard}}) + end. + +-spec update_db_config_trans(emqx_ds:db(), emqx_ds_replication_layer:builtin_db_opts()) -> + emqx_ds_replication_layer:builtin_db_opts(). +update_db_config_trans(DB, UpdateOpts) -> + Opts = db_config_trans(DB, write), + %% Since this is an update and not a reopen, + %% we should keep the shard number and replication factor + %% and not create a new shard server + ChangeableOpts = maps:without([n_shards, n_sites, replication_factor], UpdateOpts), + EffectiveOpts = maps:merge(Opts, ChangeableOpts), + ok = mnesia:write(#?META_TAB{ + db = DB, + db_props = EffectiveOpts + }), + EffectiveOpts. + +-spec db_config_trans(emqx_ds:db()) -> emqx_ds_replication_layer:builtin_db_opts(). +db_config_trans(DB) -> + db_config_trans(DB, read). + +db_config_trans(DB, LockType) -> + case mnesia:read(?META_TAB, DB, LockType) of + [#?META_TAB{db_props = Config}] -> + Config; + [] -> + mnesia:abort({nonexistent_db, DB}) end. -spec drop_db_trans(emqx_ds:db()) -> ok. @@ -391,6 +586,61 @@ ensure_site() -> persistent_term:put(?emqx_ds_builtin_site, Site), ok. +%% @doc Returns sorted list of sites shards are replicated across. +-spec list_db_sites([_Shard]) -> [site()]. +list_db_sites(Shards) -> + flatmap_sorted_set(fun get_shard_sites/1, Shards). + +-spec list_db_target_sites([_Shard]) -> [site()]. +list_db_target_sites(Shards) -> + flatmap_sorted_set(fun get_shard_target_sites/1, Shards). + +-spec get_shard_sites(_Shard) -> [site()]. +get_shard_sites(#?SHARD_TAB{replica_set = ReplicaSet}) -> + ReplicaSet. + +-spec get_shard_target_sites(_Shard) -> [site()]. +get_shard_target_sites(#?SHARD_TAB{target_set = Sites}) when is_list(Sites) -> + Sites; +get_shard_target_sites(#?SHARD_TAB{target_set = undefined} = Shard) -> + get_shard_sites(Shard). + +-spec compute_allocation([Shard], [Site], emqx_ds_replication_layer:builtin_db_opts()) -> + [{Shard, [Site, ...]}]. +compute_allocation(Shards, Sites, Opts) -> + NSites = length(Sites), + ReplicationFactor = maps:get(replication_factor, Opts), + NReplicas = min(NSites, ReplicationFactor), + ShardsSorted = lists:sort(Shards), + SitesSorted = lists:sort(Sites), + {Allocation, _} = lists:mapfoldl( + fun(Shard, SSites) -> + {ReplicaSet, _} = emqx_utils_stream:consume(NReplicas, SSites), + {_, SRest} = emqx_utils_stream:consume(1, SSites), + {{Shard, ReplicaSet}, SRest} + end, + emqx_utils_stream:repeat(emqx_utils_stream:list(SitesSorted)), + ShardsSorted + ), + Allocation. 
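+
+%% Allocation sketch: with sorted sites `[s1, s2, s3]' and
+%% `replication_factor' 2, the repeating site stream above advances by
+%% one site per shard (site ids are binaries in practice; atoms are
+%% used here for brevity):
+%% ```
+%% compute_allocation([<<"0">>, <<"1">>, <<"2">>], [s1, s2, s3],
+%%                    #{replication_factor => 2})
+%% %% => [{<<"0">>, [s1, s2]}, {<<"1">>, [s2, s3]}, {<<"2">>, [s3, s1]}]
+%% '''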
+ +compute_transitions(undefined, _ReplicaSet) -> + []; +compute_transitions(TargetSet, ReplicaSet) -> + Additions = TargetSet -- ReplicaSet, + Deletions = ReplicaSet -- TargetSet, + intersperse([{add, S} || S <- Additions], [{del, S} || S <- Deletions]). + +%% @doc Apply a transition to a list of sites, preserving sort order. +-spec apply_transition(transition(), [site()]) -> [site()]. +apply_transition({add, S}, Sites) -> + lists:usort([S | Sites]); +apply_transition({del, S}, Sites) -> + lists:delete(S, Sites). + +gen_shards(NShards) -> + [integer_to_binary(I) || I <- lists:seq(0, NShards - 1)]. + eval_qlc(Q) -> case mnesia:is_transaction() of true -> @@ -400,29 +650,63 @@ eval_qlc(Q) -> Result end. -filter_shards(DB) -> - filter_shards(DB, const(true)). +transaction(Fun, Args) -> + case mria:transaction(?SHARD, Fun, Args) of + {atomic, Result} -> + Result; + {aborted, Reason} -> + {error, Reason} + end. --spec filter_shards(emqx_ds:db(), fun((_) -> boolean())) -> - [emqx_ds_replication_layer:shard_id()]. -filter_shards(DB, Predicte) -> - filter_shards(DB, Predicte, fun(#?SHARD_TAB{shard = {_, ShardId}}) -> - ShardId - end). +%%==================================================================== -filter_shards(DB, Predicate, Mapper) -> - eval_qlc( - qlc:q([ - Mapper(Shard) - || #?SHARD_TAB{shard = {D, _}} = Shard <- mnesia:table( - ?SHARD_TAB - ), - D =:= DB, - Predicate(Shard) - ]) +handle_subscribe(Pid, Subject, S = #s{subs = Subs0}) -> + case maps:is_key(Pid, Subs0) of + false -> + MRef = erlang:monitor(process, Pid), + Subs = Subs0#{Pid => {Subject, MRef}}, + S#s{subs = Subs}; + true -> + S + end. + +handle_unsubscribe(Pid, S = #s{subs = Subs0}) -> + case maps:take(Pid, Subs0) of + {{_Subject, MRef}, Subs} -> + _ = erlang:demonitor(MRef, [flush]), + S#s{subs = Subs}; + error -> + S + end. + +notify_subscribers(EventSubject, Event, #s{subs = Subs}) -> + maps:foreach( + fun(Pid, {Subject, _MRef}) -> + Subject == EventSubject andalso + erlang:send(Pid, {changed, Event}) + end, + Subs ). -const(Result) -> - fun(_) -> - Result - end. +%%==================================================================== + +%% @doc Intersperse elements of two lists. +%% Example: intersperse([1, 2], [3, 4, 5]) -> [1, 3, 2, 4, 5]. +-spec intersperse([X], [Y]) -> [X | Y]. +intersperse(L1, []) -> + L1; +intersperse([], L2) -> + L2; +intersperse([H1 | T1], L2) -> + [H1 | intersperse(L2, T1)]. + +%% @doc Map list into a list of sets and return union, as a sorted list. +-spec flatmap_sorted_set(fun((X) -> [Y]), [X]) -> [Y]. +flatmap_sorted_set(Fun, L) -> + ordsets:to_list( + lists:foldl( + fun(X, Acc) -> ordsets:union(ordsets:from_list(Fun(X)), Acc) end, + ordsets:new(), + L + ) + ). diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl index 7540e01bb..e0e70596a 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl @@ -21,6 +21,7 @@ %% Static server configuration -export([ shard_servers/2, + shard_server/3, local_server/2 ]). @@ -30,6 +31,14 @@ server/3 ]). +%% Membership +-export([ + add_local_server/2, + drop_local_server/2, + remove_server/3, + server_info/2 +]). + -behaviour(gen_server). -export([ init/1, @@ -38,21 +47,31 @@ terminate/2 ]). +-type server() :: ra:server_id(). + +-define(MEMBERSHIP_CHANGE_TIMEOUT, 30_000). + %% start_link(DB, Shard, Opts) -> gen_server:start_link(?MODULE, {DB, Shard, Opts}, []). 
+-spec shard_servers(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) -> [server()].
 shard_servers(DB, Shard) ->
-    {ok, ReplicaSet} = emqx_ds_replication_layer_meta:replica_set(DB, Shard),
-    [
-        {server_name(DB, Shard, Site), emqx_ds_replication_layer_meta:node(Site)}
-     || Site <- ReplicaSet
-    ].
+    ReplicaSet = emqx_ds_replication_layer_meta:replica_set(DB, Shard),
+    [shard_server(DB, Shard, Site) || Site <- ReplicaSet].
 
+-spec shard_server(
+    emqx_ds:db(),
+    emqx_ds_replication_layer:shard_id(),
+    emqx_ds_replication_layer_meta:site()
+) -> server().
+shard_server(DB, Shard, Site) ->
+    {server_name(DB, Shard, Site), emqx_ds_replication_layer_meta:node(Site)}.
+
+-spec local_server(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) -> server().
 local_server(DB, Shard) ->
-    Site = emqx_ds_replication_layer_meta:this_site(),
-    {server_name(DB, Shard, Site), node()}.
+    {server_name(DB, Shard, local_site()), node()}.
 
 cluster_name(DB, Shard) ->
     iolist_to_binary(io_lib:format("~s_~s", [DB, Shard])).
@@ -79,13 +98,11 @@ get_servers_leader_preferred(DB, Shard) ->
             Servers = ra_leaderboard:lookup_members(ClusterName),
             [Leader | lists:delete(Leader, Servers)];
         undefined ->
-            %% TODO: Dynamic membership.
             get_shard_servers(DB, Shard)
     end.
 
 get_server_local_preferred(DB, Shard) ->
-    %% NOTE: Contact random replica that is not a known leader.
-    %% TODO: Replica may be down, so we may need to retry.
+    %% NOTE: Contact either the local server or a random replica.
     ClusterName = get_cluster_name(DB, Shard),
     case ra_leaderboard:lookup_members(ClusterName) of
         Servers when is_list(Servers) ->
@@ -94,15 +111,21 @@ get_server_local_preferred(DB, Shard) ->
             %% TODO
             %% Leader is unknown if there are no servers of this group on the
             %% local node. We want to pick a replica in that case as well.
-            %% TODO: Dynamic membership.
             pick_random(get_shard_servers(DB, Shard))
     end.
 
+lookup_leader(DB, Shard) ->
+    %% NOTE
+    %% Does not block, but the result may be outdated or even unknown when there
+    %% are no servers on the local node.
+    ClusterName = get_cluster_name(DB, Shard),
+    ra_leaderboard:lookup_leader(ClusterName).
+
 pick_local(Servers) ->
-    case lists:dropwhile(fun({_Name, Node}) -> Node =/= node() end, Servers) of
-        [Local | _] ->
+    case lists:keyfind(node(), 2, Servers) of
+        Local when is_tuple(Local) ->
             Local;
-        [] ->
+        false ->
             pick_random(Servers)
     end.
 
@@ -118,11 +141,153 @@ get_local_server(DB, Shard) ->
 get_shard_servers(DB, Shard) ->
     maps:get(servers, emqx_ds_replication_shard_allocator:shard_meta(DB, Shard)).
 
+local_site() ->
+    emqx_ds_replication_layer_meta:this_site().
+
+%%
+
+%% @doc Add a local server to the shard cluster.
+%% It's recommended to have the local server running before calling this function.
+%% This function is idempotent.
+-spec add_local_server(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) ->
+    ok | emqx_ds:error(_Reason).
+add_local_server(DB, Shard) ->
+    %% NOTE
+    %% Adding the local server as a "promotable" member to the cluster, which
+    %% means that it won't affect quorum until it is promoted to a voter, which
+    %% in turn happens when the server has caught up sufficiently with the log.
+    %% We also rely on this "membership" status to understand the server's
+    %% readiness.
+ ShardServers = shard_servers(DB, Shard), + LocalServer = local_server(DB, Shard), + case server_info(uid, LocalServer) of + UID when is_binary(UID) -> + ServerRecord = #{ + id => LocalServer, + membership => promotable, + uid => UID + }; + unknown -> + ServerRecord = #{ + id => LocalServer, + membership => voter + } + end, + Timeout = ?MEMBERSHIP_CHANGE_TIMEOUT, + case ra_try_servers(ShardServers, fun ra:add_member/3, [ServerRecord, Timeout]) of + {ok, _, _Leader} -> + ok; + {error, already_member} -> + ok; + Error -> + {error, recoverable, Error} + end. + +%% @doc Remove a local server from the shard cluster and clean up on-disk data. +%% It's required to have the local server running before calling this function. +%% This function is idempotent. +-spec drop_local_server(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) -> + ok | emqx_ds:error(_Reason). +drop_local_server(DB, Shard) -> + %% NOTE: Timeouts are ignored, it's a best effort attempt. + _ = prep_stop_server(DB, Shard), + LocalServer = local_server(DB, Shard), + case remove_server(DB, Shard, LocalServer) of + ok -> + ra:force_delete_server(DB, LocalServer); + {error, _, _Reason} = Error -> + Error + end. + +%% @doc Remove a (remote) server from the shard cluster. +%% The server might not be running when calling this function, e.g. the node +%% might be offline. Because of this, on-disk data will not be cleaned up. +%% This function is idempotent. +-spec remove_server(emqx_ds:db(), emqx_ds_replication_layer:shard_id(), server()) -> + ok | emqx_ds:error(_Reason). +remove_server(DB, Shard, Server) -> + ShardServers = shard_servers(DB, Shard), + Timeout = ?MEMBERSHIP_CHANGE_TIMEOUT, + case ra_try_servers(ShardServers, fun ra:remove_member/3, [Server, Timeout]) of + {ok, _, _Leader} -> + ok; + {error, not_member} -> + ok; + Error -> + {error, recoverable, Error} + end. + +-spec server_info + (readiness, server()) -> ready | {unready, _Status, _Membership} | unknown; + (leader, server()) -> server() | unknown; + (uid, server()) -> _UID :: binary() | unknown. +server_info(readiness, Server) -> + %% NOTE + %% Server is ready if it's either the leader or a follower with voter "membership" + %% status (meaning it was promoted after catching up with the log). + case current_leader(Server) of + Server -> + ready; + Leader when Leader /= unknown -> + member_info(readiness, Server, Leader); + unknown -> + unknown + end; +server_info(leader, Server) -> + current_leader(Server); +server_info(uid, Server) -> + maps:get(uid, ra_overview(Server), unknown). + +member_info(readiness, Server, Leader) -> + Cluster = maps:get(cluster, ra_overview(Leader), #{}), + member_readiness(maps:get(Server, Cluster, #{})). + +current_leader(Server) -> + %% NOTE: This call will block until the leader is known, or until the timeout. + case ra:members(Server) of + {ok, _Servers, Leader} -> + Leader; + _Error -> + unknown + end. + +member_readiness(#{status := Status, voter_status := #{membership := Membership}}) -> + case Status of + normal when Membership =:= voter -> + ready; + _Other -> + {unready, Status, Membership} + end; +member_readiness(#{}) -> + unknown. + +%% + +ra_try_servers([Server | Rest], Fun, Args) -> + case erlang:apply(Fun, [Server | Args]) of + {ok, R, Leader} -> + {ok, R, Leader}; + {error, Reason} when Reason == noproc; Reason == nodedown -> + ra_try_servers(Rest, Fun, Args); + ErrorOrTimeout -> + ErrorOrTimeout + end; +ra_try_servers([], _Fun, _Args) -> + {error, servers_unreachable}. 
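Taken together, these membership helpers compose into a simple join flow. A minimal sketch, assuming the shard supervisor API used elsewhere in this change (`join_shard_sketch` and `wait_ready_sketch` are hypothetical names, and error handling is elided):

    %% Sketch: join the local site to a shard's replica set (hypothetical helper).
    join_shard_sketch(DB, Shard) ->
        %% The local ra server should be running before it is added as a member.
        ok = emqx_ds_builtin_db_sup:ensure_shard({DB, Shard}),
        ok = emqx_ds_replication_layer_shard:add_local_server(DB, Shard),
        %% The server joins as "promotable"; poll until it is promoted to a voter.
        wait_ready_sketch(DB, Shard).

    wait_ready_sketch(DB, Shard) ->
        LocalServer = emqx_ds_replication_layer_shard:local_server(DB, Shard),
        case emqx_ds_replication_layer_shard:server_info(readiness, LocalServer) of
            ready ->
                ok;
            _Unready ->
                timer:sleep(1000),
                wait_ready_sketch(DB, Shard)
        end.

The shard allocator implements essentially this sequence in `do_add_local/3` further below, with retries on recoverable errors.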
+
+ra_overview(Server) ->
+    case ra:member_overview(Server) of
+        {ok, Overview, _Leader} ->
+            Overview;
+        _Error ->
+            #{}
+    end.
+
 %%
 
 init({DB, Shard, Opts}) ->
     _ = process_flag(trap_exit, true),
-    _Meta = start_shard(DB, Shard, Opts),
+    ok = start_server(DB, Shard, Opts),
     {ok, {DB, Shard}}.
 
 handle_call(_Call, _From, State) ->
@@ -132,66 +297,91 @@ handle_cast(_Msg, State) ->
     {noreply, State}.
 
 terminate(_Reason, {DB, Shard}) ->
+    %% NOTE: Timeouts are ignored, it's a best effort attempt.
+    catch prep_stop_server(DB, Shard),
     LocalServer = get_local_server(DB, Shard),
     ok = ra:stop_server(DB, LocalServer).
 
 %%
 
-start_shard(DB, Shard, #{replication_options := ReplicationOpts}) ->
-    Site = emqx_ds_replication_layer_meta:this_site(),
+start_server(DB, Shard, #{replication_options := ReplicationOpts}) ->
     ClusterName = cluster_name(DB, Shard),
     LocalServer = local_server(DB, Shard),
     Servers = shard_servers(DB, Shard),
     case ra:restart_server(DB, LocalServer) of
-        ok ->
-            Bootstrap = false;
         {error, name_not_registered} ->
             Bootstrap = true,
+            Machine = {module, emqx_ds_replication_layer, #{db => DB, shard => Shard}},
+            LogOpts = maps:with(
+                [
+                    snapshot_interval,
+                    resend_window
+                ],
+                ReplicationOpts
+            ),
             ok = ra:start_server(DB, #{
                 id => LocalServer,
-                uid => <>,
+                uid => server_uid(DB, Shard),
                 cluster_name => ClusterName,
                 initial_members => Servers,
-                machine => {module, emqx_ds_replication_layer, #{db => DB, shard => Shard}},
-                log_init_args => maps:with(
-                    [
-                        snapshot_interval,
-                        resend_window
-                    ],
-                    ReplicationOpts
-                )
-            })
+                machine => Machine,
+                log_init_args => LogOpts
+            });
+        ok ->
+            Bootstrap = false;
+        {error, {already_started, _}} ->
+            Bootstrap = false
     end,
-    case Servers of
-        [LocalServer | _] ->
-            %% TODO
-            %% Not super robust, but we probably don't expect nodes to be down
-            %% when we bring up a fresh consensus group. Triggering election
-            %% is not really required otherwise.
-            %% TODO
-            %% Ensure that doing that on node restart does not disrupt consensus.
-            %% Edit: looks like it doesn't, this could actually be quite useful
-            %% to "steal" leadership from nodes that have too much leader load.
-            %% TODO
-            %% It doesn't really work that way. There's `ra:transfer_leadership/2`
-            %% for that.
-            try
-                ra:trigger_election(LocalServer, _Timeout = 1_000)
-            catch
-                %% TODO
-                %% Tolerating exceptions because server might be occupied with log
-                %% replay for a while.
-                exit:{timeout, _} when not Bootstrap ->
-                    ok
-            end;
-        _ ->
+    %% NOTE
+    %% Triggering election is necessary when a new consensus group is being brought up.
+    %% TODO
+    %% It's probably a good idea to rebalance leaders across the cluster from time to
+    %% time. There's `ra:transfer_leadership/2` for that.
+    try Bootstrap andalso ra:trigger_election(LocalServer, _Timeout = 1_000) of
+        false ->
+            ok;
+        ok ->
             ok
-    end,
-    #{
-        cluster_name => ClusterName,
-        servers => Servers,
-        local_server => LocalServer
-    }.
+    catch
+        %% TODO
+        %% Tolerating exceptions because server might be occupied with log replay for
+        %% a while.
+        exit:{timeout, _} when not Bootstrap ->
+            ok
+    end.
+
+server_uid(_DB, Shard) ->
+    %% NOTE
+    %% Each new "instance" of a server should have a unique identifier. Otherwise,
+    %% if some server migrates to another node during rebalancing, and then comes
+    %% back, `ra` will be very confused by it having the same UID as before.
+    %% Keeping the shard ID as a prefix to make it easier to identify the server
+    %% in the filesystem / logs / etc.
+    Ts = integer_to_binary(erlang:system_time(microsecond)),
+    <<Shard/binary, "_", Ts/binary>>.
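As a made-up example of the resulting format: a server for shard `<<"5">>` created at microsecond timestamp 1712345678901234 gets the UID below; the shard prefix aids debugging, while the timestamp suffix guarantees a re-created server never reuses an earlier incarnation's UID:

    %% server_uid(DB, <<"5">>) at that instant:
    %% <<"5_1712345678901234">>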
+
+%%
+
+prep_stop_server(DB, Shard) ->
+    prep_stop_server(DB, Shard, 5_000).
+
+prep_stop_server(DB, Shard, Timeout) ->
+    LocalServer = get_local_server(DB, Shard),
+    Candidates = lists:delete(LocalServer, shard_servers(DB, Shard)),
+    case lookup_leader(DB, Shard) of
+        LocalServer when Candidates =/= [] ->
+            %% NOTE
+            %% Trigger leadership transfer *and* wait until the new leader is
+            %% elected and updated in the leaderboard. This should help to avoid
+            %% edge cases where entries appended right before removal are duplicated
+            %% due to client retries.
+            %% TODO: Candidate may be offline.
+            [Candidate | _] = Candidates,
+            _ = ra:transfer_leadership(LocalServer, Candidate),
+            wait_until(fun() -> lookup_leader(DB, Shard) == Candidate end, Timeout);
+        _Another ->
+            ok
+    end.
 
 %%
 
@@ -205,3 +395,24 @@ memoize(Fun, Args) ->
         Result ->
             Result
     end.
+
+wait_until(Fun, Timeout) ->
+    wait_until(Fun, Timeout, 100).
+
+wait_until(Fun, Timeout, Sleep) ->
+    Deadline = erlang:monotonic_time(millisecond) + Timeout,
+    loop_until(Fun, Deadline, Sleep).
+
+loop_until(Fun, Deadline, Sleep) ->
+    case Fun() of
+        true ->
+            ok;
+        false ->
+            case erlang:monotonic_time(millisecond) of
+                Now when Now < Deadline ->
+                    timer:sleep(Sleep),
+                    loop_until(Fun, Deadline, Sleep);
+                _ ->
+                    timeout
+            end
+    end.
diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_shard_allocator.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_shard_allocator.erl
index 6da33f09f..f02335a10 100644
--- a/apps/emqx_durable_storage/src/emqx_ds_replication_shard_allocator.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds_replication_shard_allocator.erl
@@ -16,11 +16,16 @@
 -module(emqx_ds_replication_shard_allocator).
 
--export([start_link/2]).
+-include_lib("snabbkaffe/include/trace.hrl").
+
+-export([start_link/1]).
 
 -export([n_shards/1]).
 -export([shard_meta/2]).
 
+%% Maintenance purposes:
+-export([trigger_transitions/1]).
+
 -behaviour(gen_server).
 -export([
     init/1,
@@ -30,14 +35,37 @@
     terminate/2
 ]).
 
+-export([handle_transition/4]).
+
 -define(db_meta(DB), {?MODULE, DB}).
 -define(shard_meta(DB, SHARD), {?MODULE, DB, SHARD}).
 
+-define(ALLOCATE_RETRY_TIMEOUT, 1_000).
+
+-define(TRANS_RETRY_TIMEOUT, 5_000).
+-define(CRASH_RETRY_DELAY, 20_000).
+-define(REMOVE_REPLICA_DELAY, {10_000, 5_000}).
+
+-ifdef(TEST).
+-undef(TRANS_RETRY_TIMEOUT).
+-undef(REMOVE_REPLICA_DELAY).
+-define(TRANS_RETRY_TIMEOUT, 1_000).
+-define(REMOVE_REPLICA_DELAY, {3_000, 2_000}).
+-endif.
+
 %%
 
-start_link(DB, Opts) ->
-    gen_server:start_link(?MODULE, {DB, Opts}, []).
+-record(trigger_transitions, {}).
 
+-spec start_link(emqx_ds:db()) -> {ok, pid()}.
+start_link(DB) ->
+    gen_server:start_link(?MODULE, DB, []).
+
+-spec trigger_transitions(pid()) -> ok.
+trigger_transitions(Pid) ->
+    gen_server:cast(Pid, #trigger_transitions{}).
+
+-spec n_shards(emqx_ds:db()) -> non_neg_integer().
 n_shards(DB) ->
     Meta = persistent_term:get(?db_meta(DB)),
     maps:get(n_shards, Meta).
@@ -47,48 +75,60 @@ shard_meta(DB, Shard) ->
 
 %%
 
--define(ALLOCATE_RETRY_TIMEOUT, 1_000).
+-record(transhdl, {
+    shard :: emqx_ds_replication_layer:shard_id(),
+    trans :: emqx_ds_replication_layer_meta:transition(),
+    pid :: pid()
+}).
 
-init({DB, Opts}) ->
+-type state() :: #{
+    db := emqx_ds:db(),
+    shards := [emqx_ds_replication_layer:shard_id()],
+    status := allocating | ready,
+    transitions := #{_Track => #transhdl{}}
+}.
+
+-spec init(emqx_ds:db()) -> {ok, state()}.
+init(DB) -> _ = erlang:process_flag(trap_exit, true), - _ = logger:set_process_metadata(#{db => DB, domain => [ds, db, shard_allocator]}), - State = #{db => DB, opts => Opts, status => allocating}, - case allocate_shards(State) of - {ok, NState} -> - {ok, NState}; - {error, Data} -> - _ = logger:notice( - Data#{ - msg => "Shard allocation still in progress", - retry_in => ?ALLOCATE_RETRY_TIMEOUT - } - ), - {ok, State, ?ALLOCATE_RETRY_TIMEOUT} - end. + _ = logger:set_process_metadata(#{db => DB, domain => [emqx, ds, DB, shard_allocator]}), + State = #{ + db => DB, + shards => [], + status => allocating, + transitions => #{} + }, + {ok, handle_allocate_shards(State)}. +-spec handle_call(_Call, _From, state()) -> {reply, ignored, state()}. handle_call(_Call, _From, State) -> {reply, ignored, State}. +-spec handle_cast(_Cast, state()) -> {noreply, state()}. +handle_cast(#trigger_transitions{}, State) -> + {noreply, handle_pending_transitions(State)}; handle_cast(_Cast, State) -> {noreply, State}. -handle_info(timeout, State) -> - case allocate_shards(State) of - {ok, NState} -> - {noreply, NState}; - {error, Data} -> - _ = logger:notice( - Data#{ - msg => "Shard allocation still in progress", - retry_in => ?ALLOCATE_RETRY_TIMEOUT - } - ), - {noreply, State, ?ALLOCATE_RETRY_TIMEOUT} - end; +-spec handle_info(Info, state()) -> {noreply, state()} when + Info :: + emqx_ds_replication_layer_meta:subscription_event() + | {timeout, reference(), allocate} + | {'EXIT', pid(), _Reason}. +handle_info({timeout, _TRef, allocate}, State) -> + {noreply, handle_allocate_shards(State)}; +handle_info({changed, {shard, DB, Shard}}, State = #{db := DB}) -> + {noreply, handle_shard_changed(Shard, State)}; +handle_info({changed, _}, State) -> + {noreply, State}; +handle_info({'EXIT', Pid, Reason}, State) -> + {noreply, handle_exit(Pid, Reason, State)}; handle_info(_Info, State) -> {noreply, State}. -terminate(_Reason, #{db := DB, shards := Shards}) -> +-spec terminate(_Reason, state()) -> _Ok. +terminate(_Reason, State = #{db := DB, shards := Shards}) -> + unsubscribe_db_changes(State), erase_db_meta(DB), erase_shards_meta(DB, Shards); terminate(_Reason, #{}) -> @@ -96,10 +136,258 @@ terminate(_Reason, #{}) -> %% -allocate_shards(State = #{db := DB, opts := Opts}) -> - case emqx_ds_replication_layer_meta:allocate_shards(DB, Opts) of +handle_allocate_shards(State0) -> + case allocate_shards(State0) of + {ok, State} -> + %% NOTE + %% Subscribe to shard changes and trigger any yet unhandled transitions. + ok = subscribe_db_changes(State), + ok = trigger_transitions(self()), + State; + {error, Data} -> + _ = logger:notice( + Data#{ + msg => "Shard allocation still in progress", + retry_in => ?ALLOCATE_RETRY_TIMEOUT + } + ), + _TRef = erlang:start_timer(?ALLOCATE_RETRY_TIMEOUT, self(), allocate), + State0 + end. + +subscribe_db_changes(#{db := DB}) -> + emqx_ds_replication_layer_meta:subscribe(self(), DB). + +unsubscribe_db_changes(_State) -> + emqx_ds_replication_layer_meta:unsubscribe(self()). + +%% + +handle_shard_changed(Shard, State = #{db := DB}) -> + ok = save_shard_meta(DB, Shard), + handle_shard_transitions(Shard, next_transitions(DB, Shard), State). + +handle_pending_transitions(State = #{db := DB, shards := Shards}) -> + lists:foldl( + fun(Shard, StateAcc) -> + handle_shard_transitions(Shard, next_transitions(DB, Shard), StateAcc) + end, + State, + Shards + ). + +next_transitions(DB, Shard) -> + emqx_ds_replication_layer_meta:replica_set_transitions(DB, Shard). 
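To make the shape of the transition queue concrete, assuming `replica_set_transitions/2` is backed by `compute_transitions/2` from the metadata layer above (site names are illustrative):

    %% replica_set = [s1, s2], target_set = [s2, s3]:
    %% Additions = [s3], Deletions = [s1], interspersed as
    %%   next_transitions(DB, Shard) =:= [{add, s3}, {del, s1}]
    %% i.e. the new replica is queued to join before the old one is dropped.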
+ +handle_shard_transitions(_Shard, [], State) -> + %% We reached the target allocation. + State; +handle_shard_transitions(Shard, [Trans | _Rest], State) -> + case transition_handler(Shard, Trans, State) of + {Track, Handler} -> + ensure_transition_handler(Track, Shard, Trans, Handler, State); + undefined -> + State + end. + +transition_handler(Shard, Trans, _State = #{db := DB}) -> + ThisSite = emqx_ds_replication_layer_meta:this_site(), + case Trans of + {add, ThisSite} -> + {Shard, fun trans_add_local/3}; + {del, ThisSite} -> + {Shard, fun trans_drop_local/3}; + {del, Site} -> + ReplicaSet = emqx_ds_replication_layer_meta:replica_set(DB, Shard), + case lists:member(Site, ReplicaSet) of + true -> + %% NOTE + %% Let the replica handle its own removal first, but still set + %% up a removal handler after a delay, in case the replica is + %% unresponsive. + Handler = {fun trans_delay/5, [ + ?REMOVE_REPLICA_DELAY, + fun trans_rm_unresponsive/3 + ]}, + %% NOTE + %% Putting this transition handler on separate "track" so that it + %% won't block any changes with higher priority (e.g. managing + %% local replicas). + {{unresp, Shard}, Handler}; + false -> + undefined + end; + _NotOurs -> + %% This site is not involved in the next queued transition. + undefined + end. + +handle_transition(DB, Shard, Trans, Handler) -> + logger:set_process_metadata(#{ + db => DB, + shard => Shard, + domain => [emqx, ds, DB, shard_transition] + }), + ?tp( + dsrepl_shard_transition_begin, + #{shard => Shard, db => DB, transition => Trans, pid => self()} + ), + apply_handler(Handler, DB, Shard, Trans). + +apply_handler({Fun, Args}, DB, Shard, Trans) -> + erlang:apply(Fun, [DB, Shard, Trans | Args]); +apply_handler(Fun, DB, Shard, Trans) -> + erlang:apply(Fun, [DB, Shard, Trans]). + +trans_add_local(DB, Shard, {add, Site}) -> + logger:info(#{msg => "Adding new local shard replica", site => Site}), + do_add_local(membership, DB, Shard). + +do_add_local(membership = Stage, DB, Shard) -> + ok = start_shard(DB, Shard), + case emqx_ds_replication_layer_shard:add_local_server(DB, Shard) of + ok -> + do_add_local(readiness, DB, Shard); + {error, recoverable, Reason} -> + logger:warning(#{ + msg => "Shard membership change failed", + reason => Reason, + retry_in => ?TRANS_RETRY_TIMEOUT + }), + ok = timer:sleep(?TRANS_RETRY_TIMEOUT), + do_add_local(Stage, DB, Shard) + end; +do_add_local(readiness = Stage, DB, Shard) -> + LocalServer = emqx_ds_replication_layer_shard:local_server(DB, Shard), + case emqx_ds_replication_layer_shard:server_info(readiness, LocalServer) of + ready -> + logger:info(#{msg => "Local shard replica ready"}); + Status -> + logger:warning(#{ + msg => "Still waiting for local shard replica to be ready", + status => Status, + retry_in => ?TRANS_RETRY_TIMEOUT + }), + ok = timer:sleep(?TRANS_RETRY_TIMEOUT), + do_add_local(Stage, DB, Shard) + end. + +trans_drop_local(DB, Shard, {del, Site}) -> + logger:info(#{msg => "Dropping local shard replica", site => Site}), + do_drop_local(DB, Shard). + +do_drop_local(DB, Shard) -> + case emqx_ds_replication_layer_shard:drop_local_server(DB, Shard) of + ok -> + ok = emqx_ds_builtin_db_sup:stop_shard({DB, Shard}), + ok = emqx_ds_storage_layer:drop_shard({DB, Shard}), + logger:info(#{msg => "Local shard replica dropped"}); + {error, recoverable, Reason} -> + logger:warning(#{ + msg => "Shard membership change failed", + reason => Reason, + retry_in => ?TRANS_RETRY_TIMEOUT + }), + ok = timer:sleep(?TRANS_RETRY_TIMEOUT), + do_drop_local(DB, Shard) + end. 
+
+trans_rm_unresponsive(DB, Shard, {del, Site}) ->
+    logger:info(#{msg => "Removing unresponsive shard replica", site => Site}),
+    do_rm_unresponsive(DB, Shard, Site).
+
+do_rm_unresponsive(DB, Shard, Site) ->
+    Server = emqx_ds_replication_layer_shard:shard_server(DB, Shard, Site),
+    case emqx_ds_replication_layer_shard:remove_server(DB, Shard, Server) of
+        ok ->
+            logger:info(#{msg => "Unresponsive shard replica removed"});
+        {error, recoverable, Reason} ->
+            logger:warning(#{
+                msg => "Shard membership change failed",
+                reason => Reason,
+                retry_in => ?TRANS_RETRY_TIMEOUT
+            }),
+            ok = timer:sleep(?TRANS_RETRY_TIMEOUT),
+            do_rm_unresponsive(DB, Shard, Site)
+    end.
+
+trans_delay(DB, Shard, Trans, Delay, NextHandler) ->
+    ok = delay(Delay),
+    %% NOTE: Proceed only if the transition we are going to handle is still desired.
+    case next_transitions(DB, Shard) of
+        [Trans | _] ->
+            apply_handler(NextHandler, DB, Shard, Trans);
+        _Outdated ->
+            exit({shutdown, skipped})
+    end.
+
+%%
+
+ensure_transition_handler(Track, Shard, Trans, Handler, State = #{transitions := Ts}) ->
+    case maps:get(Track, Ts, undefined) of
+        undefined ->
+            Pid = start_transition_handler(Shard, Trans, Handler, State),
+            Record = #transhdl{shard = Shard, trans = Trans, pid = Pid},
+            State#{transitions := Ts#{Track => Record}};
+        _AlreadyRunning ->
+            %% NOTE: Avoiding multiple transition handlers for the same shard for safety.
+            State
+    end.
+
+start_transition_handler(Shard, Trans, Handler, #{db := DB}) ->
+    proc_lib:spawn_link(?MODULE, handle_transition, [DB, Shard, Trans, Handler]).
+
+handle_exit(Pid, Reason, State0 = #{db := DB, transitions := Ts}) ->
+    case maps:to_list(maps:filter(fun(_, TH) -> TH#transhdl.pid == Pid end, Ts)) of
+        [{Track, #transhdl{shard = Shard, trans = Trans}}] ->
+            ?tp(
+                dsrepl_shard_transition_end,
+                #{shard => Shard, db => DB, transition => Trans, pid => Pid, reason => Reason}
+            ),
+            State = State0#{transitions := maps:remove(Track, Ts)},
+            handle_transition_exit(Shard, Trans, Reason, State);
+        [] ->
+            %% NOTE
+            %% Actually, it's sort of expected to have a portion of exit signals here,
+            %% because of `mria:with_middleman/3`. But it's impossible to tell them apart
+            %% from other signals.
+            logger:warning(#{msg => "Unexpected exit signal", pid => Pid, reason => Reason}),
+            State0
+    end.
+
+handle_transition_exit(Shard, Trans, normal, State = #{db := DB}) ->
+    %% NOTE: This will trigger the next transition if any.
+    ok = emqx_ds_replication_layer_meta:update_replica_set(DB, Shard, Trans),
+    State;
+handle_transition_exit(_Shard, _Trans, {shutdown, skipped}, State) ->
+    State;
+handle_transition_exit(Shard, Trans, Reason, State) ->
+    logger:warning(#{
+        msg => "Shard membership transition failed",
+        shard => Shard,
+        transition => Trans,
+        reason => Reason,
+        retry_in => ?CRASH_RETRY_DELAY
+    }),
+    %% NOTE
+    %% In case of `{add, Site}` transition failure, we have no choice but to retry:
+    %% no other node can perform the transition and make progress towards the desired
+    %% state.
+    case Trans of
+        {add, _ThisSite} ->
+            {Track, Handler} = transition_handler(Shard, Trans, State),
+            RetryHandler = {fun trans_delay/5, [?CRASH_RETRY_DELAY, Handler]},
+            ensure_transition_handler(Track, Shard, Trans, RetryHandler, State);
+        _Another ->
+            State
+    end.
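Summarizing the handler lifecycle for a single `{add, Site}` transition (an illustrative walkthrough of the code above, assuming nothing else is queued):

    %% 1. A {changed, {shard, DB, Shard}} event (or an explicit trigger) makes the
    %%    allocator consult next_transitions/2 and spawn a linked handler via
    %%    ensure_transition_handler/5.
    %% 2. On normal exit, handle_transition_exit/4 commits the change with
    %%    update_replica_set/3, which fires the next {changed, ...} event and
    %%    thereby advances the queue.
    %% 3. On a crash, the same transition is retried after ?CRASH_RETRY_DELAY,
    %%    wrapped in trans_delay/5.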
+ +%% + +allocate_shards(State = #{db := DB}) -> + case emqx_ds_replication_layer_meta:allocate_shards(DB) of {ok, Shards} -> - logger:notice(#{msg => "Shards allocated", shards => Shards}), + logger:info(#{msg => "Shards allocated", shards => Shards}), ok = start_shards(DB, emqx_ds_replication_layer_meta:my_shards(DB)), ok = start_egresses(DB, Shards), ok = save_db_meta(DB, Shards), @@ -110,25 +398,23 @@ allocate_shards(State = #{db := DB, opts := Opts}) -> end. start_shards(DB, Shards) -> - ok = lists:foreach( - fun(Shard) -> - ok = emqx_ds_builtin_db_sup:ensure_shard({DB, Shard}) - end, - Shards - ), - ok = logger:info(#{msg => "Shards started", shards => Shards}), + lists:foreach(fun(Shard) -> start_shard(DB, Shard) end, Shards). + +start_shard(DB, Shard) -> + ok = emqx_ds_builtin_db_sup:ensure_shard({DB, Shard}), + ok = logger:info(#{msg => "Shard started", shard => Shard}), ok. start_egresses(DB, Shards) -> - ok = lists:foreach( - fun(Shard) -> - ok = emqx_ds_builtin_db_sup:ensure_egress({DB, Shard}) - end, - Shards - ), - logger:info(#{msg => "Egresses started", shards => Shards}), + lists:foreach(fun(Shard) -> start_egress(DB, Shard) end, Shards). + +start_egress(DB, Shard) -> + ok = emqx_ds_builtin_db_sup:ensure_egress({DB, Shard}), + ok = logger:info(#{msg => "Egress started", shard => Shard}), ok. +%% + save_db_meta(DB, Shards) -> persistent_term:put(?db_meta(DB), #{ shards => Shards, @@ -152,3 +438,10 @@ erase_shards_meta(DB, Shards) -> erase_shard_meta(DB, Shard) -> persistent_term:erase(?shard_meta(DB, Shard)). + +%% + +delay({MinDelay, Variance}) -> + timer:sleep(MinDelay + rand:uniform(Variance)); +delay(Delay) -> + timer:sleep(Delay). diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_snapshot.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_snapshot.erl new file mode 100644 index 000000000..c90c71688 --- /dev/null +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_snapshot.erl @@ -0,0 +1,256 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_ds_replication_snapshot). + +-include_lib("snabbkaffe/include/trace.hrl"). + +-behaviour(ra_snapshot). +-export([ + prepare/2, + write/3, + + begin_read/2, + read_chunk/3, + + begin_accept/2, + accept_chunk/2, + complete_accept/2, + + recover/1, + validate/1, + read_meta/1 +]). + +%% Read state. +-record(rs, { + phase :: machine_state | storage_snapshot, + started_at :: _Time :: integer(), + state :: emqx_ds_replication_layer:ra_state() | undefined, + reader :: emqx_ds_storage_snapshot:reader() | undefined +}). + +%% Write state. 
+-record(ws, { + phase :: machine_state | storage_snapshot, + started_at :: _Time :: integer(), + dir :: file:filename(), + meta :: ra_snapshot:meta(), + state :: emqx_ds_replication_layer:ra_state() | undefined, + writer :: emqx_ds_storage_snapshot:writer() | undefined +}). + +-type rs() :: #rs{}. +-type ws() :: #ws{}. + +-type ra_state() :: emqx_ds_replication_layer:ra_state(). + +%% Writing a snapshot. +%% This process is exactly the same as writing a ra log snapshot: store the +%% log meta and the machine state in a single snapshot file. + +-spec prepare(_RaftIndex, ra_state()) -> _State :: ra_state(). +prepare(Index, State) -> + ra_log_snapshot:prepare(Index, State). + +-spec write(_SnapshotDir :: file:filename(), ra_snapshot:meta(), _State :: ra_state()) -> + ok | {ok, _BytesWritten :: non_neg_integer()} | {error, ra_snapshot:file_err()}. +write(Dir, Meta, MachineState) -> + ra_log_snapshot:write(Dir, Meta, MachineState). + +%% Reading a snapshot. +%% +%% This is triggered by the leader when it finds out that a follower is +%% behind so much that there are no log segments covering the gap anymore. +%% This process, on the other hand, MUST involve reading the storage snapshot, +%% (in addition to the log snapshot) to reconstruct the storage state on the +%% target server. +%% +%% Currently, a snapshot reader is owned by a special "snapshot sender" process +%% spawned by the leader `ra` server, which sends chunks to the target server +%% in a tight loop. This process terminates under the following conditions: +%% 1. The snapshot is completely read and sent. +%% 2. Remote server fails to accept a chunk, either due to network failure (most +%% likely) or a logic error (very unlikely). +%% +%% TODO +%% In the latter case the process terminates without the chance to clean up the +%% snapshot reader resource, which will cause the snapshot to linger indefinitely. +%% For better control over resources, observability, and niceties like flow +%% control and backpressure we need to move this into a dedicated process tree. + +-spec begin_read(_SnapshotDir :: file:filename(), _Context :: #{}) -> + {ok, ra_snapshot:meta(), rs()} | {error, _Reason :: term()}. +begin_read(Dir, _Context) -> + RS = #rs{ + phase = machine_state, + started_at = erlang:monotonic_time(millisecond) + }, + case ra_log_snapshot:recover(Dir) of + {ok, Meta, MachineState} -> + start_snapshot_reader(Meta, RS#rs{state = MachineState}); + Error -> + Error + end. + +start_snapshot_reader(Meta, RS) -> + ShardId = shard_id(RS), + logger:info(#{ + msg => "dsrepl_snapshot_read_started", + shard => ShardId + }), + {ok, SnapReader} = emqx_ds_storage_layer:take_snapshot(ShardId), + {ok, Meta, RS#rs{reader = SnapReader}}. + +-spec read_chunk(rs(), _Size :: non_neg_integer(), _SnapshotDir :: file:filename()) -> + {ok, binary(), {next, rs()} | last} | {error, _Reason :: term()}. +read_chunk(RS = #rs{phase = machine_state, state = MachineState}, _Size, _Dir) -> + Chunk = term_to_binary(MachineState), + {ok, Chunk, {next, RS#rs{phase = storage_snapshot}}}; +read_chunk(RS = #rs{phase = storage_snapshot, reader = SnapReader0}, Size, _Dir) -> + case emqx_ds_storage_snapshot:read_chunk(SnapReader0, Size) of + {next, Chunk, SnapReader} -> + {ok, Chunk, {next, RS#rs{reader = SnapReader}}}; + {last, Chunk, SnapReader} -> + %% TODO: idempotence? 
+            ?tp(dsrepl_snapshot_read_complete, #{reader => SnapReader}),
+            _ = complete_read(RS#rs{reader = SnapReader}),
+            {ok, Chunk, last};
+        {error, Reason} ->
+            ?tp(dsrepl_snapshot_read_error, #{reason => Reason, reader => SnapReader0}),
+            _ = emqx_ds_storage_snapshot:release_reader(SnapReader0),
+            error(Reason)
+    end.
+
+complete_read(RS = #rs{reader = SnapReader, started_at = StartedAt}) ->
+    _ = emqx_ds_storage_snapshot:release_reader(SnapReader),
+    logger:info(#{
+        msg => "dsrepl_snapshot_read_complete",
+        shard => shard_id(RS),
+        duration_ms => erlang:monotonic_time(millisecond) - StartedAt,
+        read_bytes => emqx_ds_storage_snapshot:reader_info(bytes_read, SnapReader)
+    }).
+
+%% Accepting a snapshot.
+%%
+%% This process is triggered on the target server, when the leader finds out
+%% that the target server is severely lagging behind. This is the receiving
+%% side of `begin_read/2` and `read_chunk/3`.
+%%
+%% Currently, a snapshot writer is owned by the follower `ra` server process
+%% residing in the dedicated `receive_snapshot` state. This process reverts back
+%% to the regular `follower` state under the following conditions:
+%% 1. The snapshot is completely accepted, and the machine state is recovered.
+%% 2. The process times out waiting for the next chunk.
+%% 3. The process encounters a logic error (very unlikely).
+%%
+%% TODO
+%% In the latter cases, the snapshot writer will not have a chance to clean up.
+%% For better control over resources, observability, and niceties like flow
+%% control and backpressure we need to move this into a dedicated process tree.
+
+-spec begin_accept(_SnapshotDir :: file:filename(), ra_snapshot:meta()) ->
+    {ok, ws()}.
+begin_accept(Dir, Meta) ->
+    WS = #ws{
+        phase = machine_state,
+        started_at = erlang:monotonic_time(millisecond),
+        dir = Dir,
+        meta = Meta
+    },
+    {ok, WS}.
+
+-spec accept_chunk(binary(), ws()) ->
+    {ok, ws()} | {error, _Reason :: term()}.
+accept_chunk(Chunk, WS = #ws{phase = machine_state}) ->
+    MachineState = binary_to_term(Chunk),
+    start_snapshot_writer(WS#ws{state = MachineState});
+accept_chunk(Chunk, WS = #ws{phase = storage_snapshot, writer = SnapWriter0}) ->
+    %% TODO: idempotence?
+    case emqx_ds_storage_snapshot:write_chunk(SnapWriter0, Chunk) of
+        {next, SnapWriter} ->
+            {ok, WS#ws{writer = SnapWriter}};
+        {error, Reason} ->
+            ?tp(dsrepl_snapshot_write_error, #{reason => Reason, writer => SnapWriter0}),
+            _ = emqx_ds_storage_snapshot:abort_writer(SnapWriter0),
+            error(Reason)
+    end.
+
+start_snapshot_writer(WS) ->
+    ShardId = shard_id(WS),
+    logger:info(#{
+        msg => "dsrepl_snapshot_write_started",
+        shard => ShardId
+    }),
+    _ = emqx_ds_builtin_db_sup:terminate_storage(ShardId),
+    {ok, SnapWriter} = emqx_ds_storage_layer:accept_snapshot(ShardId),
+    {ok, WS#ws{phase = storage_snapshot, writer = SnapWriter}}.
+
+-spec complete_accept(binary(), ws()) -> ok | {error, ra_snapshot:file_err()}.
+complete_accept(Chunk, WS = #ws{phase = storage_snapshot, writer = SnapWriter0}) ->
+    %% TODO: idempotence?
+    case emqx_ds_storage_snapshot:write_chunk(SnapWriter0, Chunk) of
+        {last, SnapWriter} ->
+            ?tp(dsrepl_snapshot_write_complete, #{writer => SnapWriter}),
+            _ = emqx_ds_storage_snapshot:release_writer(SnapWriter),
+            Result = complete_accept(WS#ws{writer = SnapWriter}),
+            ?tp(dsrepl_snapshot_accepted, #{shard => shard_id(WS)}),
+            Result;
+        {error, Reason} ->
+            ?tp(dsrepl_snapshot_write_error, #{reason => Reason, writer => SnapWriter0}),
+            _ = emqx_ds_storage_snapshot:abort_writer(SnapWriter0),
+            error(Reason)
+    end.
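Putting the accept path together, the follower-side call sequence looks roughly like this (a sketch; `Dir`, `Meta`, and the chunk variables are placeholders):

    %% {ok, WS0} = begin_accept(Dir, Meta),
    %% %% The first chunk carries the term-encoded machine state:
    %% {ok, WS1} = accept_chunk(MachineStateBin, WS0),
    %% %% Subsequent chunks stream the storage snapshot:
    %% {ok, WS2} = accept_chunk(StorageChunk1, WS1),
    %% ...
    %% %% The final chunk completes the transfer and restarts the storage:
    %% ok = complete_accept(LastChunk, WSN).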
+
+complete_accept(WS = #ws{started_at = StartedAt, writer = SnapWriter}) ->
+    ShardId = shard_id(WS),
+    logger:info(#{
+        msg => "dsrepl_snapshot_write_complete",
+        shard => ShardId,
+        duration_ms => erlang:monotonic_time(millisecond) - StartedAt,
+        bytes_written => emqx_ds_storage_snapshot:writer_info(bytes_written, SnapWriter)
+    }),
+    {ok, _} = emqx_ds_builtin_db_sup:restart_storage(ShardId),
+    write_machine_snapshot(WS).
+
+write_machine_snapshot(#ws{dir = Dir, meta = Meta, state = MachineState}) ->
+    write(Dir, Meta, MachineState).
+
+%% Restoring machine state from a snapshot.
+%% This is equivalent to restoring from a log snapshot.
+
+-spec recover(_SnapshotDir :: file:filename()) ->
+    {ok, ra_snapshot:meta(), ra_state()} | {error, _Reason}.
+recover(Dir) ->
+    %% TODO: Verify that storage layer is online?
+    ra_log_snapshot:recover(Dir).
+
+-spec validate(_SnapshotDir :: file:filename()) ->
+    ok | {error, _Reason}.
+validate(Dir) ->
+    ra_log_snapshot:validate(Dir).
+
+-spec read_meta(_SnapshotDir :: file:filename()) ->
+    {ok, ra_snapshot:meta()} | {error, _Reason}.
+read_meta(Dir) ->
+    ra_log_snapshot:read_meta(Dir).
+
+shard_id(#rs{state = MachineState}) ->
+    shard_id(MachineState);
+shard_id(#ws{state = MachineState}) ->
+    shard_id(MachineState);
+shard_id(MachineState) ->
+    maps:get(db_shard, MachineState).
diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl
index 594854d21..2ec6674b6 100644
--- a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl
@@ -44,6 +44,7 @@
 
 -export_type([options/0]).
 
+-include("emqx_ds_metrics.hrl").
 -include_lib("emqx_utils/include/emqx_message.hrl").
 -include_lib("snabbkaffe/include/trace.hrl").
 
@@ -115,8 +116,6 @@
         ?last_seen_key := binary()
     }.
 
--define(COUNTER, emqx_ds_storage_bitfield_lts_counter).
-
 %% Limit on the number of wildcard levels in the learned topic trie:
 -define(WILDCARD_LIMIT, 10).
 
@@ -140,6 +139,8 @@
 -define(DIM_TOPIC, 1).
 -define(DIM_TS, 2).
 
+-define(DS_LTS_COUNTERS, [?DS_LTS_SEEK_COUNTER, ?DS_LTS_NEXT_COUNTER, ?DS_LTS_COLLISION_COUNTER]).
+
 -ifdef(TEST).
 -include_lib("eunit/include/eunit.hrl").
 -endif.
@@ -347,13 +348,18 @@ update_iterator(
 ) ->
     {ok, OldIter#{?last_seen_key => DSKey}}.
 
-next(_Shard, Schema = #s{ts_offset = TSOffset}, It, BatchSize) ->
+next(Shard, Schema = #s{ts_offset = TSOffset}, It, BatchSize) ->
     %% Compute safe cutoff time.
     %% It's the point in time where the last complete epoch ends, so we need to know
     %% the current time to compute it.
+    init_counters(),
     Now = emqx_ds:timestamp_us(),
     SafeCutoffTime = (Now bsr TSOffset) bsl TSOffset,
-    next_until(Schema, It, SafeCutoffTime, BatchSize).
+    try
+        next_until(Schema, It, SafeCutoffTime, BatchSize)
+    after
+        report_counters(Shard)
+    end.
-delete_next(_Shard, Schema = #s{ts_offset = TSOffset}, It, Selector, BatchSize) -> +delete_next(Shard, Schema = #s{ts_offset = TSOffset}, It, Selector, BatchSize) -> %% Compute safe cutoff time. %% It's the point in time where the last complete epoch ends, so we need to know %% the current time to compute it. + init_counters(), Now = emqx_message:timestamp_now(), SafeCutoffTime = (Now bsr TSOffset) bsl TSOffset, - delete_next_until(Schema, It, SafeCutoffTime, Selector, BatchSize). + try + delete_next_until(Schema, It, SafeCutoffTime, Selector, BatchSize) + after + report_counters(Shard) + end. delete_next_until( _Schema, @@ -417,7 +426,6 @@ delete_next_until( DB, CF, TopicIndex, StartTime, SafeCutoffTime, Varying, Keymappers ), try - put(?COUNTER, 0), LoopContext = LoopContext0#{ db => DB, cf => CF, @@ -430,8 +438,7 @@ delete_next_until( }, delete_next_loop(LoopContext) after - rocksdb:iterator_close(ITHandle), - erase(?COUNTER) + rocksdb:iterator_close(ITHandle) end. %%================================================================================ @@ -477,7 +484,6 @@ prepare_loop_context(DB, CF, TopicIndex, StartTime, SafeCutoffTime, Varying, Key next_loop(_ITHandle, _KeyMapper, _Filter, _Cutoff, It, Acc, 0) -> {ok, It, lists:reverse(Acc)}; next_loop(ITHandle, KeyMapper, Filter, Cutoff, It0, Acc0, N0) -> - inc_counter(), #{?tag := ?IT, ?last_seen_key := Key0} = It0, case emqx_ds_bitmask_keymapper:bin_increment(Filter, Key0) of overflow -> @@ -485,6 +491,7 @@ next_loop(ITHandle, KeyMapper, Filter, Cutoff, It0, Acc0, N0) -> Key1 -> %% assert true = Key1 > Key0, + inc_counter(?DS_LTS_SEEK_COUNTER), case rocksdb:iterator_move(ITHandle, {seek, Key1}) of {ok, Key, Val} -> {N, It, Acc} = traverse_interval( @@ -510,6 +517,7 @@ traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, Key, Val, It0, Acc0, N) - Acc = [{Key, Msg} | Acc0], traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, It, Acc, N - 1); false -> + inc_counter(?DS_LTS_COLLISION_COUNTER), traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, It, Acc0, N) end; overflow -> @@ -521,7 +529,7 @@ traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, Key, Val, It0, Acc0, N) - traverse_interval(_ITHandle, _KeyMapper, _Filter, _Cutoff, It, Acc, 0) -> {0, It, Acc}; traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, It, Acc, N) -> - inc_counter(), + inc_counter(?DS_LTS_NEXT_COUNTER), case rocksdb:iterator_move(ITHandle, next) of {ok, Key, Val} -> traverse_interval(ITHandle, KeyMapper, Filter, Cutoff, Key, Val, It, Acc, N); @@ -541,7 +549,7 @@ delete_next_loop(LoopContext0) -> iterated_over := AccIter0, it_handle := ITHandle } = LoopContext0, - inc_counter(), + inc_counter(?DS_LTS_SEEK_COUNTER), #{?tag := ?DELETE_IT, ?last_seen_key := Key0} = It0, case emqx_ds_bitmask_keymapper:bin_increment(Filter, Key0) of overflow -> @@ -623,7 +631,7 @@ delete_traverse_interval1(LoopContext0) -> iterated_over := AccIter, storage_iter := It } = LoopContext0, - inc_counter(), + inc_counter(?DS_LTS_NEXT_COUNTER), case rocksdb:iterator_move(ITHandle, next) of {ok, Key, Val} -> delete_traverse_interval(LoopContext0#{ @@ -767,9 +775,20 @@ read_persisted_trie(IT, {ok, KeyB, ValB}) -> read_persisted_trie(_IT, {error, invalid_iterator}) -> []. -inc_counter() -> - N = get(?COUNTER), - put(?COUNTER, N + 1). +inc_counter(Counter) -> + N = get(Counter), + put(Counter, N + 1). + +init_counters() -> + _ = [put(I, 0) || I <- ?DS_LTS_COUNTERS], + ok. 
+ +report_counters(Shard) -> + emqx_ds_builtin_metrics:inc_lts_seek_counter(Shard, get(?DS_LTS_SEEK_COUNTER)), + emqx_ds_builtin_metrics:inc_lts_next_counter(Shard, get(?DS_LTS_NEXT_COUNTER)), + emqx_ds_builtin_metrics:inc_lts_collision_counter(Shard, get(?DS_LTS_COLLISION_COUNTER)), + _ = [erase(I) || I <- ?DS_LTS_COUNTERS], + ok. %% @doc Generate a column family ID for the MQTT messages -spec data_cf(emqx_ds_storage_layer:gen_id()) -> [char()]. diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index 69f5b8231..4981c3fc1 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -19,8 +19,12 @@ %% Replication layer API: -export([ - open_shard/2, + %% Lifecycle + start_link/2, drop_shard/1, + shard_info/2, + + %% Data store_batch/3, get_streams/3, get_delete_streams/3, @@ -29,14 +33,20 @@ update_iterator/3, next/3, delete_next/4, + + %% Generations update_config/3, add_generation/2, list_generations_with_lifetimes/1, - drop_generation/2 + drop_generation/2, + + %% Snapshotting + take_snapshot/1, + accept_snapshot/1 ]). %% gen_server --export([start_link/2, init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]). +-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]). %% internal exports: -export([db_dir/1]). @@ -229,10 +239,7 @@ -record(call_update_config, {options :: emqx_ds:create_db_opts(), since :: emqx_ds:time()}). -record(call_list_generations_with_lifetimes, {}). -record(call_drop_generation, {gen_id :: gen_id()}). - --spec open_shard(shard_id(), options()) -> ok. -open_shard(Shard, Options) -> - emqx_ds_storage_layer_sup:ensure_shard(Shard, Options). +-record(call_take_snapshot, {}). -spec drop_shard(shard_id()) -> ok. drop_shard(Shard) -> @@ -244,11 +251,20 @@ drop_shard(Shard) -> emqx_ds:message_store_opts() ) -> emqx_ds:store_batch_result(). -store_batch(Shard, Messages, Options) -> - %% We always store messages in the current generation: - GenId = generation_current(Shard), - #{module := Mod, data := GenData} = generation_get(Shard, GenId), - Mod:store_batch(Shard, GenData, Messages, Options). +store_batch(Shard, Messages = [{Time, _Msg} | _], Options) -> + %% NOTE + %% We assume that batches do not span generations. Callers should enforce this. + ?tp(emqx_ds_storage_layer_store_batch, #{ + shard => Shard, messages => Messages, options => Options + }), + #{module := Mod, data := GenData} = generation_at(Shard, Time), + T0 = erlang:monotonic_time(microsecond), + Result = Mod:store_batch(Shard, GenData, Messages, Options), + T1 = erlang:monotonic_time(microsecond), + emqx_ds_builtin_metrics:observe_store_batch_time(Shard, T1 - T0), + Result; +store_batch(_Shard, [], _Options) -> + ok. -spec get_streams(shard_id(), emqx_ds:topic_filter(), emqx_ds:time()) -> [{integer(), stream()}]. @@ -258,14 +274,14 @@ get_streams(Shard, TopicFilter, StartTime) -> lists:flatmap( fun(GenId) -> ?tp(get_streams_get_gen, #{gen_id => GenId}), - case generation_get_safe(Shard, GenId) of - {ok, #{module := Mod, data := GenData}} -> + case generation_get(Shard, GenId) of + #{module := Mod, data := GenData} -> Streams = Mod:get_streams(Shard, GenData, TopicFilter, StartTime), [ {GenId, ?stream_v2(GenId, InnerStream)} || InnerStream <- Streams ]; - {error, not_found} -> + not_found -> %% race condition: generation was dropped before getting its streams? 
[] end @@ -281,14 +297,14 @@ get_delete_streams(Shard, TopicFilter, StartTime) -> lists:flatmap( fun(GenId) -> ?tp(get_streams_get_gen, #{gen_id => GenId}), - case generation_get_safe(Shard, GenId) of - {ok, #{module := Mod, data := GenData}} -> + case generation_get(Shard, GenId) of + #{module := Mod, data := GenData} -> Streams = Mod:get_delete_streams(Shard, GenData, TopicFilter, StartTime), [ ?delete_stream(GenId, InnerStream) || InnerStream <- Streams ]; - {error, not_found} -> + not_found -> %% race condition: generation was dropped before getting its streams? [] end @@ -301,8 +317,8 @@ get_delete_streams(Shard, TopicFilter, StartTime) -> make_iterator( Shard, ?stream_v2(GenId, Stream), TopicFilter, StartTime ) -> - case generation_get_safe(Shard, GenId) of - {ok, #{module := Mod, data := GenData}} -> + case generation_get(Shard, GenId) of + #{module := Mod, data := GenData} -> case Mod:make_iterator(Shard, GenData, Stream, TopicFilter, StartTime) of {ok, Iter} -> {ok, #{ @@ -313,7 +329,7 @@ make_iterator( {error, _} = Err -> Err end; - {error, not_found} -> + not_found -> {error, unrecoverable, generation_not_found} end. @@ -322,8 +338,8 @@ make_iterator( make_delete_iterator( Shard, ?delete_stream(GenId, Stream), TopicFilter, StartTime ) -> - case generation_get_safe(Shard, GenId) of - {ok, #{module := Mod, data := GenData}} -> + case generation_get(Shard, GenId) of + #{module := Mod, data := GenData} -> case Mod:make_delete_iterator(Shard, GenData, Stream, TopicFilter, StartTime) of {ok, Iter} -> {ok, #{ @@ -334,7 +350,7 @@ make_delete_iterator( {error, _} = Err -> Err end; - {error, not_found} -> + not_found -> {error, end_of_stream} end. @@ -345,8 +361,8 @@ update_iterator( #{?tag := ?IT, ?generation := GenId, ?enc := OldIter}, DSKey ) -> - case generation_get_safe(Shard, GenId) of - {ok, #{module := Mod, data := GenData}} -> + case generation_get(Shard, GenId) of + #{module := Mod, data := GenData} -> case Mod:update_iterator(Shard, GenData, OldIter, DSKey) of {ok, Iter} -> {ok, #{ @@ -357,15 +373,15 @@ update_iterator( {error, _} = Err -> Err end; - {error, not_found} -> + not_found -> {error, unrecoverable, generation_not_found} end. -spec next(shard_id(), iterator(), pos_integer()) -> emqx_ds:next_result(iterator()). next(Shard, Iter = #{?tag := ?IT, ?generation := GenId, ?enc := GenIter0}, BatchSize) -> - case generation_get_safe(Shard, GenId) of - {ok, #{module := Mod, data := GenData}} -> + case generation_get(Shard, GenId) of + #{module := Mod, data := GenData} -> Current = generation_current(Shard), case Mod:next(Shard, GenData, GenIter0, BatchSize) of {ok, _GenIter, []} when GenId < Current -> @@ -378,7 +394,7 @@ next(Shard, Iter = #{?tag := ?IT, ?generation := GenId, ?enc := GenIter0}, Batch Error = {error, _, _} -> Error end; - {error, not_found} -> + not_found -> %% generation was possibly dropped by GC {error, unrecoverable, generation_not_found} end. @@ -391,8 +407,8 @@ delete_next( Selector, BatchSize ) -> - case generation_get_safe(Shard, GenId) of - {ok, #{module := Mod, data := GenData}} -> + case generation_get(Shard, GenId) of + #{module := Mod, data := GenData} -> Current = generation_current(Shard), case Mod:delete_next(Shard, GenData, GenIter0, Selector, BatchSize) of {ok, _GenIter, _Deleted = 0, _IteratedOver = 0} when GenId < Current -> @@ -405,7 +421,7 @@ delete_next( Error = {error, _} -> Error end; - {error, not_found} -> + not_found -> %% generation was possibly dropped by GC {ok, end_of_stream} end. 
@@ -436,6 +452,28 @@ list_generations_with_lifetimes(ShardId) ->
 drop_generation(ShardId, GenId) ->
     gen_server:call(?REF(ShardId), #call_drop_generation{gen_id = GenId}, infinity).
 
+-spec shard_info(shard_id(), status) -> running | down.
+shard_info(ShardId, status) ->
+    try get_schema_runtime(ShardId) of
+        #{} -> running
+    catch
+        error:badarg -> down
+    end.
+
+-spec take_snapshot(shard_id()) -> {ok, emqx_ds_storage_snapshot:reader()} | {error, _Reason}.
+take_snapshot(ShardId) ->
+    case gen_server:call(?REF(ShardId), #call_take_snapshot{}, infinity) of
+        {ok, Dir} ->
+            emqx_ds_storage_snapshot:new_reader(Dir);
+        Error ->
+            Error
+    end.
+
+-spec accept_snapshot(shard_id()) -> {ok, emqx_ds_storage_snapshot:writer()} | {error, _Reason}.
+accept_snapshot(ShardId) ->
+    ok = drop_shard(ShardId),
+    handle_accept_snapshot(ShardId).
+
 %%================================================================================
 %% gen_server for the shard
 %%================================================================================
@@ -462,6 +500,7 @@ init({ShardId, Options}) ->
     process_flag(trap_exit, true),
     logger:set_process_metadata(#{shard_id => ShardId, domain => [ds, storage_layer, shard]}),
     erase_schema_runtime(ShardId),
+    clear_all_checkpoints(ShardId),
     {ok, DB, CFRefs0} = rocksdb_open(ShardId, Options),
     {Schema, CFRefs} =
         case get_schema_persistent(DB) of
@@ -505,6 +544,9 @@ handle_call(#call_drop_generation{gen_id = GenId}, _From, S0) ->
     {Reply, S} = handle_drop_generation(S0, GenId),
     commit_metadata(S),
     {reply, Reply, S};
+handle_call(#call_take_snapshot{}, _From, S) ->
+    Snapshot = handle_take_snapshot(S),
+    {reply, Snapshot, S};
 handle_call(_Call, _From, S) ->
     {reply, {error, unknown_call}, S}.
 
@@ -526,6 +568,23 @@ terminate(_Reason, #s{db = DB, shard_id = ShardId}) ->
 %% Internal functions
 %%================================================================================
 
+-spec clear_all_checkpoints(shard_id()) -> ok.
+clear_all_checkpoints(ShardId) ->
+    CheckpointBaseDir = checkpoints_dir(ShardId),
+    ok = filelib:ensure_path(CheckpointBaseDir),
+    {ok, AllFiles} = file:list_dir(CheckpointBaseDir),
+    CheckpointDirs = [Dir || Dir <- AllFiles, filelib:is_dir(filename:join(CheckpointBaseDir, Dir))],
+    lists:foreach(
+        fun(Dir) ->
+            logger:debug(#{
+                msg => "ds_storage_deleting_previous_checkpoint",
+                dir => Dir
+            }),
+            ok = file:del_dir_r(filename:join(CheckpointBaseDir, Dir))
+        end,
+        CheckpointDirs
+    ).
+
 -spec open_shard(shard_id(), rocksdb:db_handle(), cf_refs(), shard_schema()) ->
     shard().
 open_shard(ShardId, DB, CFRefs, ShardSchema) ->
@@ -671,7 +730,7 @@ create_new_shard_schema(ShardId, DB, CFRefs, Prototype) ->
     {gen_id(), shard_schema(), cf_refs()}.
 new_generation(ShardId, DB, Schema0, Since) ->
     #{current_generation := PrevGenId, prototype := {Mod, ModConf}} = Schema0,
-    GenId = PrevGenId + 1,
+    GenId = next_generation_id(PrevGenId),
     {GenData, NewCFRefs} = Mod:create(ShardId, DB, GenId, ModConf),
     GenSchema = #{
         module => Mod,
@@ -687,6 +746,14 @@ new_generation(ShardId, DB, Schema0, Since) ->
     },
     {GenId, Schema, NewCFRefs}.
 
+-spec next_generation_id(gen_id()) -> gen_id().
+next_generation_id(GenId) ->
+    GenId + 1.
+
+-spec prev_generation_id(gen_id()) -> gen_id().
+prev_generation_id(GenId) when GenId > 0 ->
+    GenId - 1.
+
 %% @doc Commit current state of the server to both rocksdb and the persistent term
 -spec commit_metadata(server_state()) -> ok.
 commit_metadata(#s{shard_id = ShardId, schema = Schema, shard = Runtime, db = DB}) ->
@@ -726,7 +793,15 @@ rocksdb_open(Shard, Options) ->
 
 -spec db_dir(shard_id()) -> file:filename().
db_dir({DB, ShardId}) -> - filename:join([emqx_ds:base_dir(), atom_to_list(DB), binary_to_list(ShardId)]). + filename:join([emqx_ds:base_dir(), DB, binary_to_list(ShardId)]). + +-spec checkpoints_dir(shard_id()) -> file:filename(). +checkpoints_dir({DB, ShardId}) -> + filename:join([emqx_ds:base_dir(), DB, checkpoints, binary_to_list(ShardId)]). + +-spec checkpoint_dir(shard_id(), _Name :: file:name()) -> file:filename(). +checkpoint_dir(ShardId, Name) -> + filename:join([checkpoints_dir(ShardId), Name]). -spec update_last_until(Schema, emqx_ds:time()) -> Schema | {error, exists | overlaps_existing_generations} @@ -759,6 +834,21 @@ run_post_creation_actions(#{new_gen_runtime_data := NewGenData}) -> %% Different implementation modules NewGenData. +handle_take_snapshot(#s{db = DB, shard_id = ShardId}) -> + Name = integer_to_list(erlang:system_time(millisecond)), + Dir = checkpoint_dir(ShardId, Name), + _ = filelib:ensure_dir(Dir), + case rocksdb:checkpoint(DB, Dir) of + ok -> + {ok, Dir}; + {error, _} = Error -> + Error + end. + +handle_accept_snapshot(ShardId) -> + Dir = db_dir(ShardId), + emqx_ds_storage_snapshot:new_writer(Dir). + %%-------------------------------------------------------------------------------- %% Schema access %%-------------------------------------------------------------------------------- @@ -768,18 +858,13 @@ generation_current(Shard) -> #{current_generation := Current} = get_schema_runtime(Shard), Current. --spec generation_get(shard_id(), gen_id()) -> generation(). +-spec generation_get(shard_id(), gen_id()) -> generation() | not_found. generation_get(Shard, GenId) -> - {ok, GenData} = generation_get_safe(Shard, GenId), - GenData. - --spec generation_get_safe(shard_id(), gen_id()) -> {ok, generation()} | {error, not_found}. -generation_get_safe(Shard, GenId) -> case get_schema_runtime(Shard) of #{?GEN_KEY(GenId) := GenData} -> - {ok, GenData}; + GenData; #{} -> - {error, not_found} + not_found end. -spec generations_since(shard_id(), emqx_ds:time()) -> [gen_id()]. @@ -796,6 +881,20 @@ generations_since(Shard, Since) -> Schema ). +-spec generation_at(shard_id(), emqx_ds:time()) -> generation(). +generation_at(Shard, Time) -> + Schema = #{current_generation := Current} = get_schema_runtime(Shard), + generation_at(Time, Current, Schema). + +generation_at(Time, GenId, Schema) -> + #{?GEN_KEY(GenId) := Gen} = Schema, + case Gen of + #{since := Since} when Time < Since andalso GenId > 0 -> + generation_at(Time, prev_generation_id(GenId), Schema); + _ -> + Gen + end. + -define(PERSISTENT_TERM(SHARD), {emqx_ds_storage_layer, SHARD}). -spec get_schema_runtime(shard_id()) -> shard(). diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer_sup.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer_sup.erl deleted file mode 100644 index 136669ed2..000000000 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer_sup.erl +++ /dev/null @@ -1,88 +0,0 @@ -%%-------------------------------------------------------------------- -%% Copyright (c) 2022-2024 EMQ Technologies Co., Ltd. All Rights Reserved. -%% -%% Licensed under the Apache License, Version 2.0 (the "License"); -%% you may not use this file except in compliance with the License. -%% You may obtain a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, software -%% distributed under the License is distributed on an "AS IS" BASIS, -%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-%% See the License for the specific language governing permissions and -%% limitations under the License. -%%-------------------------------------------------------------------- --module(emqx_ds_storage_layer_sup). - --behaviour(supervisor). - -%% API: --export([start_link/0, start_shard/2, stop_shard/1, ensure_shard/2]). - -%% behaviour callbacks: --export([init/1]). - -%%================================================================================ -%% Type declarations -%%================================================================================ - --define(SUP, ?MODULE). - -%%================================================================================ -%% API funcions -%%================================================================================ - --spec start_link() -> {ok, pid()}. -start_link() -> - supervisor:start_link(?MODULE, []). - --spec start_shard(emqx_ds_storage_layer:shard_id(), emqx_ds:create_db_opts()) -> - supervisor:startchild_ret(). -start_shard(Shard, Options) -> - supervisor:start_child(?SUP, shard_child_spec(Shard, Options)). - --spec stop_shard(emqx_ds_storage_layer:shard_id()) -> ok | {error, _}. -stop_shard(Shard) -> - ok = supervisor:terminate_child(?SUP, Shard), - ok = supervisor:delete_child(?SUP, Shard). - --spec ensure_shard(emqx_ds_storage_layer:shard_id(), emqx_ds_storage_layer:options()) -> - ok | {error, _Reason}. -ensure_shard(Shard, Options) -> - case start_shard(Shard, Options) of - {ok, _Pid} -> - ok; - {error, {already_started, _Pid}} -> - ok; - {error, Reason} -> - {error, Reason} - end. - -%%================================================================================ -%% behaviour callbacks -%%================================================================================ - -init([]) -> - Children = [], - SupFlags = #{ - strategy => one_for_one, - intensity => 10, - period => 10 - }, - {ok, {SupFlags, Children}}. - -%%================================================================================ -%% Internal functions -%%================================================================================ - --spec shard_child_spec(emqx_ds_storage_layer:shard_id(), emqx_ds:create_db_opts()) -> - supervisor:child_spec(). -shard_child_spec(Shard, Options) -> - #{ - id => Shard, - start => {emqx_ds_storage_layer, start_link, [Shard, Options]}, - shutdown => 5_000, - restart => permanent, - type => worker - }. diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_snapshot.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_snapshot.erl new file mode 100644 index 000000000..86648ed58 --- /dev/null +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_snapshot.erl @@ -0,0 +1,327 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- +-module(emqx_ds_storage_snapshot). + +-include_lib("kernel/include/file.hrl"). 
+
+-export([
+    new_reader/1,
+    read_chunk/2,
+    abort_reader/1,
+    release_reader/1,
+    reader_info/2
+]).
+
+-export([
+    new_writer/1,
+    write_chunk/2,
+    abort_writer/1,
+    release_writer/1,
+    writer_info/2
+]).
+
+-export_type([
+    reader/0,
+    writer/0
+]).
+
+%%
+
+-define(FILECHUNK(RELPATH, POS, MORE), #{
+    '$' => chunk,
+    rp => RELPATH,
+    pos => POS,
+    more => MORE
+}).
+-define(PAT_FILECHUNK(RELPATH, POS, MORE), #{
+    '$' := chunk,
+    rp := RELPATH,
+    pos := POS,
+    more := MORE
+}).
+
+-define(EOS(), #{
+    '$' => eos
+}).
+-define(PAT_EOS(), #{
+    '$' := eos
+}).
+
+-define(PAT_HEADER(), #{'$' := _}).
+
+%%
+
+-record(reader, {
+    dirpath :: file:filename(),
+    files :: #{_RelPath => reader_file()},
+    queue :: [_RelPath :: file:filename()]
+}).
+
+-record(rfile, {
+    abspath :: file:filename(),
+    fd :: file:io_device() | eof,
+    pos :: non_neg_integer()
+}).
+
+-opaque reader() :: #reader{}.
+-type reader_file() :: #rfile{}.
+
+-type reason() :: {atom(), _AbsPath :: file:filename(), _Details :: term()}.
+
+%% @doc Initialize a reader for a snapshot directory.
+%% A snapshot directory contains an arbitrary number of regular files in an
+%% arbitrary subdirectory structure. Files are read in indeterminate order.
+%% It's an error to have non-regular files (e.g. symlinks) in the directory.
+-spec new_reader(_Dir :: file:filename()) -> {ok, reader()}.
+new_reader(DirPath) ->
+    %% NOTE
+    %% Opening all files at once, so that there is less error handling later
+    %% during the transfer.
+    %% TODO
+    %% Beware of how errors are handled: if one file fails to open, the whole
+    %% process will exit. This is fine for the purpose of replication (because
+    %% ra spawns a separate process for each transfer), but may not be suitable
+    %% for other use cases.
+    Files = emqx_utils_fs:traverse_dir(
+        fun(Path, Info, Acc) -> new_reader_file(Path, Info, DirPath, Acc) end,
+        #{},
+        DirPath
+    ),
+    {ok, #reader{
+        dirpath = DirPath,
+        files = Files,
+        queue = maps:keys(Files)
+    }}.
+
+new_reader_file(Path, #file_info{type = regular}, DirPath, Acc) ->
+    case file:open(Path, [read, binary, raw]) of
+        {ok, IoDev} ->
+            RelPath = emqx_utils_fs:find_relpath(Path, DirPath),
+            File = #rfile{abspath = Path, fd = IoDev, pos = 0},
+            Acc#{RelPath => File};
+        {error, Reason} ->
+            error({open_failed, Path, Reason})
+    end;
+new_reader_file(Path, #file_info{type = Type}, _, _Acc) ->
+    error({bad_file_type, Path, Type});
+new_reader_file(Path, {error, Reason}, _, _Acc) ->
+    error({inaccessible, Path, Reason}).
+
+%% @doc Read a chunk of data from the snapshot.
+%% Returns `{last, Chunk, Reader}` when the last chunk is read. After that, one
+%% should call `release_reader/1` to finalize the process (or `abort_reader/1` if
+%% keeping the snapshot is desired).
+-spec read_chunk(reader(), _Size :: non_neg_integer()) ->
+    {last | next, _Chunk :: iodata(), reader()} | {error, reason()}.
+read_chunk(R = #reader{files = Files, queue = [RelPath | Rest]}, Size) ->
+    File = maps:get(RelPath, Files),
+    case read_chunk_file(RelPath, File, Size) of
+        {last, Chunk, FileRest} ->
+            {next, Chunk, R#reader{files = Files#{RelPath := FileRest}, queue = Rest}};
+        {next, Chunk, FileRest} ->
+            {next, Chunk, R#reader{files = Files#{RelPath := FileRest}}};
+        Error ->
+            Error
+    end;
+read_chunk(R = #reader{queue = []}, _Size) ->
+    {last, make_packet(?EOS()), R}.
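+
+%% NOTE (editor's sketch, not part of the original patch): a minimal transfer
+%% loop pairing this reader with a writer on the receiving side, mirroring
+%% `transfer_snapshot/2' in emqx_ds_storage_SUITE. Error handling is omitted;
+%% the reader and writer statuses converge on the final EOS packet:
+%%
+%%   transfer(Reader0, Writer0) ->
+%%       {Status, Chunk, Reader} = read_chunk(Reader0, 4096),
+%%       {Status, Writer} = write_chunk(Writer0, iolist_to_binary(Chunk)),
+%%       case Status of
+%%           next ->
+%%               transfer(Reader, Writer);
+%%           last ->
+%%               ok = release_reader(Reader),
+%%               ok = release_writer(Writer)
+%%       end.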
+
+read_chunk_file(RelPath, RFile0 = #rfile{fd = IoDev, pos = Pos, abspath = AbsPath}, Size) ->
+    case file:read(IoDev, Size) of
+        {ok, Chunk} ->
+            ChunkSize = byte_size(Chunk),
+            HasMore = ChunkSize div Size,
+            RFile1 = RFile0#rfile{pos = Pos + ChunkSize},
+            case HasMore of
+                _Yes = 1 ->
+                    Status = next,
+                    RFile = RFile1;
+                _No = 0 ->
+                    Status = last,
+                    RFile = release_reader_file(RFile1)
+            end,
+            Packet = make_packet(?FILECHUNK(RelPath, Pos, HasMore), Chunk),
+            {Status, Packet, RFile};
+        eof ->
+            Packet = make_packet(?FILECHUNK(RelPath, Pos, 0)),
+            {last, Packet, release_reader_file(RFile0)};
+        {error, Reason} ->
+            {error, {read_failed, AbsPath, Reason}}
+    end.
+
+%% @doc Aborts the snapshot reader, but does not release the snapshot files.
+-spec abort_reader(reader()) -> ok.
+abort_reader(#reader{files = Files}) ->
+    lists:foreach(fun release_reader_file/1, maps:values(Files)).
+
+%% @doc Aborts the snapshot reader and deletes the snapshot files.
+-spec release_reader(reader()) -> ok.
+release_reader(R = #reader{dirpath = DirPath}) ->
+    ok = abort_reader(R),
+    file:del_dir_r(DirPath).
+
+release_reader_file(RFile = #rfile{fd = eof}) ->
+    RFile;
+release_reader_file(RFile = #rfile{fd = IoDev}) ->
+    _ = file:close(IoDev),
+    RFile#rfile{fd = eof}.
+
+-spec reader_info(bytes_read, reader()) -> _Bytes :: non_neg_integer().
+reader_info(bytes_read, #reader{files = Files}) ->
+    maps:fold(fun(_, RFile, Sum) -> Sum + RFile#rfile.pos end, 0, Files).
+
+%%
+
+-record(writer, {
+    dirpath :: file:filename(),
+    files :: #{_RelPath :: file:filename() => writer_file()}
+}).
+
+-record(wfile, {
+    abspath :: file:filename(),
+    fd :: file:io_device() | eof,
+    pos :: non_neg_integer()
+}).
+
+-opaque writer() :: #writer{}.
+-type writer_file() :: #wfile{}.
+
+%% @doc Initialize a writer into a snapshot directory.
+%% The directory need not exist; it will be created if it doesn't.
+%% A non-empty directory is not an error: existing files will be
+%% overwritten.
+-spec new_writer(_Dir :: file:filename()) -> {ok, writer()} | {error, reason()}.
+new_writer(DirPath) ->
+    case filelib:ensure_path(DirPath) of
+        ok ->
+            {ok, #writer{dirpath = DirPath, files = #{}}};
+        {error, Reason} ->
+            {error, {mkdir_failed, DirPath, Reason}}
+    end.
+
+%% @doc Write a chunk of data to the snapshot.
+%% Returns `{last, Writer}` when the last chunk is written. After that, one
+%% should call `release_writer/1` to finalize the process.
+-spec write_chunk(writer(), _Chunk :: binary()) ->
+    {last | next, writer()} | {error, _Reason}.
+write_chunk(W, Packet) ->
+    case parse_packet(Packet) of
+        {?PAT_FILECHUNK(RelPath, Pos, More), Chunk} ->
+            write_chunk(W, RelPath, Pos, More, Chunk);
+        {?PAT_EOS(), _Rest} ->
+            %% TODO: Verify all files are `eof` at this point?
+            {last, W};
+        Error ->
+            Error
+    end.
+
+write_chunk(W = #writer{files = Files}, RelPath, Pos, More, Chunk) ->
+    case Files of
+        #{RelPath := WFile} ->
+            write_chunk(W, WFile, RelPath, Pos, More, Chunk);
+        #{} when Pos == 0 ->
+            case new_writer_file(W, RelPath) of
+                WFile = #wfile{} ->
+                    write_chunk(W, WFile, RelPath, Pos, More, Chunk);
+                Error ->
+                    Error
+            end;
+        #{} ->
+            {error, {bad_chunk, RelPath, Pos}}
+    end.
+
+write_chunk(W = #writer{files = Files}, WFile0, RelPath, Pos, More, Chunk) ->
+    case write_chunk_file(WFile0, Pos, More, Chunk) of
+        WFile = #wfile{} ->
+            {next, W#writer{files = Files#{RelPath => WFile}}};
+        Error ->
+            Error
+    end.
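+
+%% NOTE (editor's note, not part of the original patch): `write_chunk_file/4'
+%% below makes writes idempotent: a chunk whose position lies behind the
+%% current write position is silently skipped, so retransmitted packets are
+%% harmless.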
+
+new_writer_file(#writer{dirpath = DirPath}, RelPath) ->
+    AbsPath = filename:join(DirPath, RelPath),
+    _ = filelib:ensure_dir(AbsPath),
+    case file:open(AbsPath, [write, binary, raw]) of
+        {ok, IoDev} ->
+            #wfile{
+                abspath = AbsPath,
+                fd = IoDev,
+                pos = 0
+            };
+        {error, Reason} ->
+            {error, {open_failed, AbsPath, Reason}}
+    end.
+
+write_chunk_file(WFile0 = #wfile{fd = IoDev, pos = Pos, abspath = AbsPath}, Pos, More, Chunk) ->
+    ChunkSize = byte_size(Chunk),
+    case (ChunkSize > 0) andalso file:write(IoDev, Chunk) of
+        false ->
+            WFile0;
+        ok ->
+            WFile1 = WFile0#wfile{pos = Pos + ChunkSize},
+            case More of
+                0 -> release_writer_file(WFile1);
+                _ -> WFile1
+            end;
+        {error, Reason} ->
+            {error, {write_failed, AbsPath, Reason}}
+    end;
+write_chunk_file(WFile = #wfile{pos = WPos}, Pos, _More, _Chunk) when Pos < WPos ->
+    WFile;
+write_chunk_file(#wfile{abspath = AbsPath}, Pos, _More, _Chunk) ->
+    {error, {bad_chunk, AbsPath, Pos}}.
+
+%% @doc Abort the writer and clean up unfinished snapshot files.
+-spec abort_writer(writer()) -> ok | {error, file:posix()}.
+abort_writer(W = #writer{dirpath = DirPath}) ->
+    ok = release_writer(W),
+    file:del_dir_r(DirPath).
+
+%% @doc Release the writer and close all snapshot files.
+-spec release_writer(writer()) -> ok.
+release_writer(#writer{files = Files}) ->
+    ok = lists:foreach(fun release_writer_file/1, maps:values(Files)).
+
+release_writer_file(WFile = #wfile{fd = eof}) ->
+    WFile;
+release_writer_file(WFile = #wfile{fd = IoDev}) ->
+    _ = file:close(IoDev),
+    WFile#wfile{fd = eof}.
+
+-spec writer_info(bytes_written, writer()) -> _Bytes :: non_neg_integer().
+writer_info(bytes_written, #writer{files = Files}) ->
+    maps:fold(fun(_, WFile, Sum) -> Sum + WFile#wfile.pos end, 0, Files).
+
+%%
+
+make_packet(Header) ->
+    term_to_binary(Header).
+
+make_packet(Header, Rest) ->
+    HeaderBytes = term_to_binary(Header),
+    <<HeaderBytes/binary, Rest/binary>>.
+
+parse_packet(Packet) ->
+    try binary_to_term(Packet, [safe, used]) of
+        {Header = ?PAT_HEADER(), Length} ->
+            {_, Rest} = split_binary(Packet, Length),
+            {Header, Rest};
+        {Header, _} ->
+            {error, {bad_header, Header}}
+    catch
+        error:badarg ->
+            {error, bad_packet}
+    end.
diff --git a/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl
index 64d81307c..49742fdc6 100644
--- a/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl
+++ b/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl
@@ -53,7 +53,7 @@ t_00_smoke_open_drop(_Config) ->
     lists:foreach(
         fun(Shard) ->
             ?assertEqual(
-                {ok, [Site]}, emqx_ds_replication_layer_meta:replica_set(DB, Shard)
+                [Site], emqx_ds_replication_layer_meta:replica_set(DB, Shard)
             )
         end,
         Shards
@@ -98,8 +98,8 @@ t_03_smoke_iterate(_Config) ->
     ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs)),
     [{_, Stream}] = emqx_ds:get_streams(DB, TopicFilter, StartTime),
     {ok, Iter0} = emqx_ds:make_iterator(DB, Stream, TopicFilter, StartTime),
-    {ok, Iter, Batch} = iterate(DB, Iter0, 1),
-    ?assertEqual(Msgs, [Msg || {_Key, Msg} <- Batch], {Iter0, Iter}).
+    {ok, Iter, Batch} = emqx_ds_test_helpers:consume_iter(DB, Iter0),
+    ?assertEqual(Msgs, Batch, {Iter0, Iter}).
 
 %% Verify that iterators survive restart of the application. This is
 %% an important property, since the lifetime of the iterators is tied
@@ -125,8 +125,8 @@ t_04_restart(_Config) ->
     {ok, _} = application:ensure_all_started(emqx_durable_storage),
     ok = emqx_ds:open_db(DB, opts()),
     %% The old iterator should still be operational:
-    {ok, Iter, Batch} = iterate(DB, Iter0, 1),
-    ?assertEqual(Msgs, [Msg || {_Key, Msg} <- Batch], {Iter0, Iter}).
+    {ok, Iter, Batch} = emqx_ds_test_helpers:consume_iter(DB, Iter0),
+    ?assertEqual(Msgs, Batch, {Iter0, Iter}).
 
 %% Check that we can create iterators directly from DS keys.
 t_05_update_iterator(_Config) ->
@@ -148,9 +148,8 @@
     Res1 = emqx_ds:update_iterator(DB, OldIter, Key0),
     ?assertMatch({ok, _Iter1}, Res1),
     {ok, Iter1} = Res1,
-    {ok, FinalIter, Batch} = iterate(DB, Iter1, 1),
-    AllMsgs = [Msg0 | [Msg || {_Key, Msg} <- Batch]],
-    ?assertEqual(Msgs, AllMsgs, #{from_key => Iter1, final_iter => FinalIter}),
+    {ok, Iter, Batch} = emqx_ds_test_helpers:consume_iter(DB, Iter1, #{batch_size => 1}),
+    ?assertEqual(Msgs, [Msg0 | Batch], #{from_key => Iter1, final_iter => Iter}),
     ok.
 
 t_06_update_config(_Config) ->
@@ -190,9 +189,9 @@
     ),
     Checker = fun({StartTime, Msgs0}, Acc) ->
-        Msgs = Msgs0 ++ Acc,
-        Batch = fetch_all(DB, TopicFilter, StartTime),
-        ?assertEqual(Msgs, Batch, {StartTime}),
+        Msgs = Acc ++ Msgs0,
+        Batch = emqx_ds_test_helpers:consume(DB, TopicFilter, StartTime),
+        ?assertEqual(Msgs, Batch, StartTime),
         Msgs
     end,
     lists:foldl(Checker, [], lists:zip(StartTimes, MsgsList)).
@@ -234,9 +233,9 @@ t_07_add_generation(_Config) ->
     ),
     Checker = fun({StartTime, Msgs0}, Acc) ->
-        Msgs = Msgs0 ++ Acc,
-        Batch = fetch_all(DB, TopicFilter, StartTime),
-        ?assertEqual(Msgs, Batch, {StartTime}),
+        Msgs = Acc ++ Msgs0,
+        Batch = emqx_ds_test_helpers:consume(DB, TopicFilter, StartTime),
+        ?assertEqual(Msgs, Batch, StartTime),
         Msgs
     end,
     lists:foldl(Checker, [], lists:zip(StartTimes, MsgsList)).
@@ -323,17 +322,10 @@ t_09_atomic_store_batch(_Config) ->
                     sync => true
                 })
             ),
-
-            ok
+            {ok, Flush} = ?block_until(#{?snk_kind := emqx_ds_replication_layer_egress_flush}),
+            ?assertMatch(#{batch := [_, _, _]}, Flush)
         end,
-        fun(Trace) ->
-            %% Must contain exactly one flush with all messages.
-            ?assertMatch(
-                [#{batch := [_, _, _]}],
-                ?of_kind(emqx_ds_replication_layer_egress_flush, Trace)
-            ),
-            ok
-        end
+        []
     ),
     ok.
@@ -356,14 +348,15 @@ t_10_non_atomic_store_batch(_Config) ->
                     sync => true
                 })
             ),
-
-            ok
+            timer:sleep(1000)
         end,
         fun(Trace) ->
-            %% Should contain one flush per message.
+            %% Should contain a single flush carrying all three messages.
+ Batches = ?projection(batch, ?of_kind(emqx_ds_replication_layer_egress_flush, Trace)), + ?assertMatch([_], Batches), ?assertMatch( - [#{batch := [_]}, #{batch := [_]}, #{batch := [_]}], - ?of_kind(emqx_ds_replication_layer_egress_flush, Trace) + [_, _, _], + lists:append(Batches) ), ok end @@ -398,9 +391,8 @@ t_smoke_delete_next(_Config) -> TopicFilterHash = ['#'], [{_, Stream}] = emqx_ds:get_streams(DB, TopicFilterHash, StartTime), - {ok, Iter0} = emqx_ds:make_iterator(DB, Stream, TopicFilterHash, StartTime), - {ok, _Iter, Batch} = iterate(DB, Iter0, 1), - ?assertEqual([Msg1, Msg3], [Msg || {_Key, Msg} <- Batch]), + Batch = emqx_ds_test_helpers:consume_stream(DB, Stream, TopicFilterHash, StartTime), + ?assertEqual([Msg1, Msg3], Batch), ok = emqx_ds:add_generation(DB), @@ -444,9 +436,9 @@ t_drop_generation_with_never_used_iterator(_Config) -> ], ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs1)), - ?assertMatch( - {error, unrecoverable, generation_not_found, []}, - iterate(DB, Iter0, 1) + ?assertError( + {error, unrecoverable, generation_not_found}, + emqx_ds_test_helpers:consume_iter(DB, Iter0) ), %% New iterator for the new stream will only see the later messages. @@ -454,9 +446,9 @@ t_drop_generation_with_never_used_iterator(_Config) -> ?assertNotEqual(Stream0, Stream1), {ok, Iter1} = emqx_ds:make_iterator(DB, Stream1, TopicFilter, StartTime), - {ok, Iter, Batch} = iterate(DB, Iter1, 1), + {ok, Iter, Batch} = emqx_ds_test_helpers:consume_iter(DB, Iter1, #{batch_size => 1}), ?assertNotEqual(end_of_stream, Iter), - ?assertEqual(Msgs1, [Msg || {_Key, Msg} <- Batch]), + ?assertEqual(Msgs1, Batch), ok. @@ -496,9 +488,9 @@ t_drop_generation_with_used_once_iterator(_Config) -> ], ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs1)), - ?assertMatch( - {error, unrecoverable, generation_not_found, []}, - iterate(DB, Iter1, 1) + ?assertError( + {error, unrecoverable, generation_not_found}, + emqx_ds_test_helpers:consume_iter(DB, Iter1) ). t_drop_generation_update_iterator(_Config) -> @@ -683,10 +675,83 @@ t_error_mapping_replication_layer(_Config) -> length([error || {error, _, _} <- Results2]) > 0, Results2 ), - - snabbkaffe:stop(), meck:unload(). +%% This testcase verifies the behavior of `store_batch' operation +%% when the underlying code experiences recoverable or unrecoverable +%% problems. 
+t_store_batch_fail(_Config) ->
+    ?check_trace(
+        #{timetrap => 15_000},
+        try
+            meck:new(emqx_ds_storage_layer, [passthrough, no_history]),
+            DB = ?FUNCTION_NAME,
+            ?assertMatch(ok, emqx_ds:open_db(DB, (opts())#{n_shards => 2})),
+            %% Success:
+            Batch1 = [
+                message(<<"C1">>, <<"foo/bar">>, <<"1">>, 1),
+                message(<<"C1">>, <<"foo/bar">>, <<"2">>, 1)
+            ],
+            ?assertMatch(ok, emqx_ds:store_batch(DB, Batch1, #{sync => true})),
+            %% Inject an unrecoverable error:
+            meck:expect(emqx_ds_storage_layer, store_batch, fun(_DB, _Shard, _Messages) ->
+                {error, unrecoverable, mock}
+            end),
+            Batch2 = [
+                message(<<"C1">>, <<"foo/bar">>, <<"3">>, 1),
+                message(<<"C1">>, <<"foo/bar">>, <<"4">>, 1)
+            ],
+            ?assertMatch(
+                {error, unrecoverable, mock}, emqx_ds:store_batch(DB, Batch2, #{sync => true})
+            ),
+            meck:unload(emqx_ds_storage_layer),
+            %% Inject a recoverable error:
+            meck:new(ra, [passthrough, no_history]),
+            meck:expect(ra, process_command, fun(Servers, Shard, Command) ->
+                ?tp(ra_command, #{servers => Servers, shard => Shard, command => Command}),
+                {timeout, mock}
+            end),
+            Batch3 = [
+                message(<<"C1">>, <<"foo/bar">>, <<"5">>, 2),
+                message(<<"C2">>, <<"foo/bar">>, <<"6">>, 2),
+                message(<<"C1">>, <<"foo/bar">>, <<"7">>, 3),
+                message(<<"C2">>, <<"foo/bar">>, <<"8">>, 3)
+            ],
+            %% Note: due to idempotency issues the number of retries
+            %% is currently set to 0:
+            ?assertMatch(
+                {error, recoverable, {timeout, mock}},
+                emqx_ds:store_batch(DB, Batch3, #{sync => true})
+            ),
+            meck:unload(ra),
+            ?assertMatch(ok, emqx_ds:store_batch(DB, Batch3, #{sync => true})),
+            lists:sort(emqx_ds_test_helpers:consume_per_stream(DB, ['#'], 1))
+        after
+            meck:unload()
+        end,
+        [
+            {"message ordering", fun(StoredMessages, _Trace) ->
+                [{_, Stream1}, {_, Stream2}] = StoredMessages,
+                ?assertMatch(
+                    [
+                        #message{payload = <<"1">>},
+                        #message{payload = <<"2">>},
+                        #message{payload = <<"5">>},
+                        #message{payload = <<"7">>}
+                    ],
+                    Stream1
+                ),
+                ?assertMatch(
+                    [
+                        #message{payload = <<"6">>},
+                        #message{payload = <<"8">>}
+                    ],
+                    Stream2
+                )
+            end}
+        ]
+    ).
+
 update_data_set() ->
     [
         [
@@ -702,25 +767,6 @@ update_data_set() ->
     ]
 ].
 
-fetch_all(DB, TopicFilter, StartTime) ->
-    Streams0 = emqx_ds:get_streams(DB, TopicFilter, StartTime),
-    Streams = lists:sort(
-        fun({{_, A}, _}, {{_, B}, _}) ->
-            A < B
-        end,
-        Streams0
-    ),
-    lists:foldl(
-        fun({_, Stream}, Acc) ->
-            {ok, Iter0} = emqx_ds:make_iterator(DB, Stream, TopicFilter, StartTime),
-            {ok, _, Msgs0} = iterate(DB, Iter0, StartTime),
-            Msgs = lists:map(fun({_, Msg}) -> Msg end, Msgs0),
-            Acc ++ Msgs
-        end,
-        [],
-        Streams
-    ).
-
 message(ClientId, Topic, Payload, PublishedAt) ->
     Msg = message(Topic, Payload, PublishedAt),
     Msg#message{from = ClientId}.
@@ -733,21 +779,6 @@ message(Topic, Payload, PublishedAt) ->
         id = emqx_guid:gen()
     }.
 
-iterate(DB, It, BatchSize) ->
-    iterate(DB, It, BatchSize, []).
-
-iterate(DB, It0, BatchSize, Acc) ->
-    case emqx_ds:next(DB, It0, BatchSize) of
-        {ok, It, []} ->
-            {ok, It, Acc};
-        {ok, It, Msgs} ->
-            iterate(DB, It, BatchSize, Acc ++ Msgs);
-        {ok, end_of_stream} ->
-            {ok, end_of_stream, Acc};
-        {error, Class, Reason} ->
-            {error, Class, Reason, Acc}
-    end.
-
 delete(DB, It, Selector, BatchSize) ->
     delete(DB, It, Selector, BatchSize, 0).
 
@@ -784,6 +815,7 @@ init_per_testcase(_TC, Config) ->
     Config.
 end_per_testcase(_TC, _Config) ->
+    snabbkaffe:stop(),
     ok = application:stop(emqx_durable_storage),
     mria:stop(),
     _ = mnesia:delete_schema([node()]),
diff --git a/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl
new file mode 100644
index 000000000..3b0e37c7f
--- /dev/null
+++ b/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl
@@ -0,0 +1,565 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+-module(emqx_ds_replication_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("emqx/include/emqx.hrl").
+-include_lib("common_test/include/ct.hrl").
+-include_lib("stdlib/include/assert.hrl").
+-include_lib("snabbkaffe/include/test_macros.hrl").
+
+-define(DB, testdb).
+
+opts() ->
+    opts(#{}).
+
+opts(Overrides) ->
+    maps:merge(
+        #{
+            backend => builtin,
+            storage => {emqx_ds_storage_bitfield_lts, #{}},
+            n_shards => 16,
+            n_sites => 1,
+            replication_factor => 3,
+            replication_options => #{
+                wal_max_size_bytes => 64 * 1024,
+                wal_max_batch_size => 1024,
+                snapshot_interval => 128
+            }
+        },
+        Overrides
+    ).
+
+appspec(emqx_durable_storage) ->
+    {emqx_durable_storage, #{
+        before_start => fun snabbkaffe:fix_ct_logging/0,
+        override_env => [{egress_flush_interval, 1}]
+    }}.
+
+t_replication_transfers_snapshots(init, Config) ->
+    Apps = [appspec(emqx_durable_storage)],
+    NodeSpecs = emqx_cth_cluster:mk_nodespecs(
+        [
+            {t_replication_transfers_snapshots1, #{apps => Apps}},
+            {t_replication_transfers_snapshots2, #{apps => Apps}},
+            {t_replication_transfers_snapshots3, #{apps => Apps}}
+        ],
+        #{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)}
+    ),
+    Nodes = emqx_cth_cluster:start(NodeSpecs),
+    [{nodes, Nodes}, {specs, NodeSpecs} | Config];
+t_replication_transfers_snapshots('end', Config) ->
+    ok = emqx_cth_cluster:stop(?config(nodes, Config)).
+
+t_replication_transfers_snapshots(Config) ->
+    NMsgs = 4000,
+    Nodes = [Node, NodeOffline | _] = ?config(nodes, Config),
+    _Specs = [_, SpecOffline | _] = ?config(specs, Config),
+
+    %% Initialize DB on all nodes and wait for it to be online.
+    Opts = opts(#{n_shards => 1, n_sites => 3}),
+    ?assertEqual(
+        [{ok, ok} || _ <- Nodes],
+        erpc:multicall(Nodes, emqx_ds, open_db, [?DB, Opts])
+    ),
+    ?retry(
+        500,
+        10,
+        ?assertMatch([[_], [_], [_]], [shards_online(N, ?DB) || N <- Nodes])
+    ),
+
+    %% Stop the DB on the "offline" node.
+    ok = emqx_cth_cluster:stop_node(NodeOffline),
+
+    %% Fill the storage with messages and a few additional generations.
+    Messages = fill_storage(Node, ?DB, NMsgs, #{p_addgen => 0.01}),
+
+    %% Restart the node.
+    [NodeOffline] = emqx_cth_cluster:restart(SpecOffline),
+    {ok, SRef} = snabbkaffe:subscribe(
+        ?match_event(#{
+            ?snk_kind := dsrepl_snapshot_accepted,
+            ?snk_meta := #{node := NodeOffline}
+        })
+    ),
+    ?assertEqual(
+        ok,
+        erpc:call(NodeOffline, emqx_ds, open_db, [?DB, opts()])
+    ),
+
+    %% Trigger a storage operation and wait for the replica to be restored.
+    _ = add_generation(Node, ?DB),
+    ?assertMatch(
+        {ok, _},
+        snabbkaffe:receive_events(SRef)
+    ),
+
+    %% Wait until any pending replication activities are finished (e.g. Raft log entries).
+    ok = timer:sleep(3_000),
+
+    %% Check that the DB has been restored.
+    Shard = hd(shards(NodeOffline, ?DB)),
+    MessagesOffline = lists:keysort(
+        #message.timestamp,
+        consume_shard(NodeOffline, ?DB, Shard, ['#'], 0)
+    ),
+    ?assertEqual(
+        sample(40, Messages),
+        sample(40, MessagesOffline)
+    ),
+    ?assertEqual(
+        Messages,
+        MessagesOffline
+    ).
+
+t_rebalance(init, Config) ->
+    Apps = [appspec(emqx_durable_storage)],
+    Nodes = emqx_cth_cluster:start(
+        [
+            {t_rebalance1, #{apps => Apps}},
+            {t_rebalance2, #{apps => Apps}},
+            {t_rebalance3, #{apps => Apps}},
+            {t_rebalance4, #{apps => Apps}}
+        ],
+        #{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)}
+    ),
+    [{nodes, Nodes} | Config];
+t_rebalance('end', Config) ->
+    ok = emqx_cth_cluster:stop(?config(nodes, Config)).
+
+t_rebalance(Config) ->
+    %% This testcase verifies that the storage rebalancing works correctly:
+    %% 1. Join/leave operations are applied successfully.
+    %% 2. Message data survives the rebalancing.
+    %% 3. Shard cluster membership converges to the target replica allocation.
+    %% 4. Replication factor is respected.
+
+    NMsgs = 800,
+    NClients = 5,
+    Nodes = [N1, N2, N3, N4] = ?config(nodes, Config),
+
+    %% Initialize DB on the first node.
+    Opts = opts(#{n_shards => 16, n_sites => 1, replication_factor => 3}),
+    ?assertEqual(ok, erpc:call(N1, emqx_ds, open_db, [?DB, Opts])),
+    ?assertMatch(
+        Shards when length(Shards) == 16,
+        shards_online(N1, ?DB)
+    ),
+
+    %% Open DB on the rest of the nodes.
+    ?assertEqual(
+        [{ok, ok} || _ <- [N2, N3, N4]],
+        erpc:multicall([N2, N3, N4], emqx_ds, open_db, [?DB, Opts])
+    ),
+
+    Sites = [S1, S2 | _Rest] = [ds_repl_meta(N, this_site) || N <- Nodes],
+    ct:pal("Sites: ~p~n", [Sites]),
+
+    %% Only N1 should be responsible for all shards initially.
+    ?assertEqual(
+        [[S1] || _ <- Nodes],
+        [ds_repl_meta(N, db_sites, [?DB]) || N <- Nodes]
+    ),
+
+    %% Fill the storage with messages and a few additional generations.
+    %% This will force shards to trigger snapshot transfers during rebalance.
+    ClientMessages = emqx_utils:pmap(
+        fun(CID) ->
+            N = lists:nth(1 + (CID rem length(Nodes)), Nodes),
+            fill_storage(N, ?DB, NMsgs, #{client_id => integer_to_binary(CID)})
+        end,
+        lists:seq(1, NClients),
+        infinity
+    ),
+    Messages1 = lists:sort(fun compare_message/2, lists:append(ClientMessages)),
+
+    %% Join the second site to the DB replication sites.
+    ?assertEqual(ok, ds_repl_meta(N1, join_db_site, [?DB, S2])),
+    %% Should be no-op.
+    ?assertEqual(ok, ds_repl_meta(N2, join_db_site, [?DB, S2])),
+    ct:pal("Transitions (~p -> ~p): ~p~n", [[S1], [S1, S2], transitions(N1, ?DB)]),
+
+    %% Fill in some more messages *during* the rebalance.
+    MessagesRB1 = fill_storage(N4, ?DB, NMsgs, #{client_id => <<"RB1">>}),
+
+    ?retry(1000, 10, ?assertEqual([], transitions(N1, ?DB))),
+
+    %% Now join the rest of the sites.
+    ?assertEqual(ok, ds_repl_meta(N2, assign_db_sites, [?DB, Sites])),
+    ct:pal("Transitions (~p -> ~p): ~p~n", [[S1, S2], Sites, transitions(N1, ?DB)]),
+
+    %% Fill in some more messages *during* the rebalance.
+    MessagesRB2 = fill_storage(N4, ?DB, NMsgs, #{client_id => <<"RB2">>}),
+
+    ?retry(1000, 10, ?assertEqual([], transitions(N2, ?DB))),
+
+    %% Verify that each node is now responsible for 3/4 of the shards.
+    ?assertEqual(
+        [(16 * 3) div length(Nodes) || _ <- Nodes],
+        [n_shards_online(N, ?DB) || N <- Nodes]
+    ),
+
+    %% Verify that the set of shard servers matches the target allocation.
+    Allocation = [ds_repl_meta(N, my_shards, [?DB]) || N <- Nodes],
+    ShardServers = [
+        shard_server_info(N, ?DB, Shard, Site, readiness)
+     || {N, Site, Shards} <- lists:zip3(Nodes, Sites, Allocation),
+        Shard <- Shards
+    ],
+    ?assert(
+        lists:all(fun({_Server, Status}) -> Status == ready end, ShardServers),
+        ShardServers
+    ),
+
+    %% Verify that the messages are preserved after the rebalance.
+    Messages = Messages1 ++ MessagesRB1 ++ MessagesRB2,
+    MessagesN4 = lists:sort(fun compare_message/2, consume(N4, ?DB, ['#'], 0)),
+    ?assertEqual(sample(20, Messages), sample(20, MessagesN4)),
+    ?assertEqual(Messages, MessagesN4),
+
+    %% Scale down the cluster by removing the first node.
+    ?assertEqual(ok, ds_repl_meta(N1, leave_db_site, [?DB, S1])),
+    ct:pal("Transitions (~p -> ~p): ~p~n", [Sites, tl(Sites), transitions(N1, ?DB)]),
+
+    ?retry(1000, 10, ?assertEqual([], transitions(N2, ?DB))),
+
+    %% Verify that the remaining nodes are now responsible for all shards.
+    ?assertEqual(
+        [0, 16, 16, 16],
+        [n_shards_online(N, ?DB) || N <- Nodes]
+    ),
+
+    %% Verify that the messages are once again preserved after the rebalance.
+    MessagesN3 = lists:sort(fun compare_message/2, consume(N3, ?DB, ['#'], 0)),
+    ?assertEqual(sample(20, Messages), sample(20, MessagesN3)),
+    ?assertEqual(Messages, MessagesN3).
+
+t_join_leave_errors(init, Config) ->
+    Apps = [appspec(emqx_durable_storage)],
+    Nodes = emqx_cth_cluster:start(
+        [
+            {t_join_leave_errors1, #{apps => Apps}},
+            {t_join_leave_errors2, #{apps => Apps}}
+        ],
+        #{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)}
+    ),
+    [{nodes, Nodes} | Config];
+t_join_leave_errors('end', Config) ->
+    ok = emqx_cth_cluster:stop(?config(nodes, Config)).
+
+t_join_leave_errors(Config) ->
+    %% This testcase verifies that logical errors arising during handling of
+    %% join/leave operations are reported correctly.
+
+    [N1, N2] = ?config(nodes, Config),
+
+    Opts = opts(#{n_shards => 16, n_sites => 1, replication_factor => 3}),
+    ?assertEqual(ok, erpc:call(N1, emqx_ds, open_db, [?DB, Opts])),
+    ?assertEqual(ok, erpc:call(N2, emqx_ds, open_db, [?DB, Opts])),
+
+    [S1, S2] = [ds_repl_meta(N, this_site) || N <- [N1, N2]],
+
+    ?assertEqual([S1], ds_repl_meta(N1, db_sites, [?DB])),
+
+    %% Attempts to join a nonexistent DB / site.
+    ?assertEqual(
+        {error, {nonexistent_db, boo}},
+        ds_repl_meta(N1, join_db_site, [_DB = boo, S1])
+    ),
+    ?assertEqual(
+        {error, {nonexistent_sites, [<<"NO-MANS-SITE">>]}},
+        ds_repl_meta(N1, join_db_site, [?DB, <<"NO-MANS-SITE">>])
+    ),
+    %% NOTE: Leaving a non-existent site is not an error.
+    ?assertEqual(
+        ok,
+        ds_repl_meta(N1, leave_db_site, [?DB, <<"NO-MANS-SITE">>])
+    ),
+
+    %% Should be no-op.
+    ?assertEqual(ok, ds_repl_meta(N1, join_db_site, [?DB, S1])),
+    ?assertEqual([], transitions(N1, ?DB)),
+
+    %% Impossible to leave the last site.
+ ?assertEqual( + {error, {too_few_sites, []}}, + ds_repl_meta(N1, leave_db_site, [?DB, S1]) + ), + + %% "Move" the DB to the other node. + ?assertEqual(ok, ds_repl_meta(N1, join_db_site, [?DB, S2])), + ?assertEqual(ok, ds_repl_meta(N2, leave_db_site, [?DB, S1])), + ?assertMatch([_ | _], transitions(N1, ?DB)), + ?retry(1000, 10, ?assertEqual([], transitions(N1, ?DB))), + + %% Should be no-op. + ?assertEqual(ok, ds_repl_meta(N2, leave_db_site, [?DB, S1])), + ?assertEqual([], transitions(N1, ?DB)). + +t_rebalance_chaotic_converges(init, Config) -> + Apps = [appspec(emqx_durable_storage)], + Nodes = emqx_cth_cluster:start( + [ + {t_rebalance_chaotic_converges1, #{apps => Apps}}, + {t_rebalance_chaotic_converges2, #{apps => Apps}}, + {t_rebalance_chaotic_converges3, #{apps => Apps}} + ], + #{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)} + ), + [{nodes, Nodes} | Config]; +t_rebalance_chaotic_converges('end', Config) -> + ok = emqx_cth_cluster:stop(?config(nodes, Config)). + +t_rebalance_chaotic_converges(Config) -> + %% This testcase verifies that even a very chaotic sequence of join/leave + %% operations will still be handled consistently, and that the shard + %% allocation will converge to the target state. + + NMsgs = 500, + Nodes = [N1, N2, N3] = ?config(nodes, Config), + + %% Initialize DB on first two nodes. + Opts = opts(#{n_shards => 16, n_sites => 2, replication_factor => 3}), + ?assertEqual( + [{ok, ok}, {ok, ok}], + erpc:multicall([N1, N2], emqx_ds, open_db, [?DB, Opts]) + ), + + %% Open DB on the last node. + ?assertEqual( + ok, + erpc:call(N3, emqx_ds, open_db, [?DB, Opts]) + ), + + %% Find out which sites there are. + Sites = [S1, S2, S3] = [ds_repl_meta(N, this_site) || N <- Nodes], + ct:pal("Sites: ~p~n", [Sites]), + + %% Initially, the DB is assigned to [S1, S2]. + ?retry(500, 10, ?assertEqual([16, 16], [n_shards_online(N, ?DB) || N <- [N1, N2]])), + ?assertEqual( + lists:sort([S1, S2]), + ds_repl_meta(N1, db_sites, [?DB]) + ), + + %% Fill the storage with messages and few additional generations. + Messages0 = lists:append([ + fill_storage(N1, ?DB, NMsgs, #{client_id => <<"C1">>}), + fill_storage(N2, ?DB, NMsgs, #{client_id => <<"C2">>}), + fill_storage(N3, ?DB, NMsgs, #{client_id => <<"C3">>}) + ]), + + %% Construct a chaotic transition sequence that changes assignment to [S2, S3]. + Sequence = [ + {N1, join_db_site, S3}, + {N2, leave_db_site, S2}, + {N3, leave_db_site, S1}, + {N1, join_db_site, S2}, + {N2, join_db_site, S1}, + {N3, leave_db_site, S3}, + {N1, leave_db_site, S1}, + {N2, join_db_site, S3} + ], + + %% Apply the sequence while also filling the storage with messages. + TransitionMessages = lists:map( + fun({N, Operation, Site}) -> + %% Apply the transition. + ?assertEqual(ok, ds_repl_meta(N, Operation, [?DB, Site])), + %% Give some time for at least one transition to complete. + Transitions = transitions(N, ?DB), + ct:pal("Transitions after ~p: ~p", [Operation, Transitions]), + ?retry(200, 10, ?assertNotEqual(Transitions, transitions(N, ?DB))), + %% Fill the storage with messages. + CID = integer_to_binary(erlang:system_time()), + fill_storage(N, ?DB, NMsgs, #{client_id => CID}) + end, + Sequence + ), + + %% Wait for the last transition to complete. + ?retry(500, 20, ?assertEqual([], transitions(N1, ?DB))), + + ?assertEqual( + lists:sort([S2, S3]), + ds_repl_meta(N1, db_sites, [?DB]) + ), + + %% Check that all messages are still there. 
+ Messages = lists:append(TransitionMessages) ++ Messages0, + MessagesDB = lists:sort(fun compare_message/2, consume(N1, ?DB, ['#'], 0)), + ?assertEqual(sample(20, Messages), sample(20, MessagesDB)), + ?assertEqual(Messages, MessagesDB). + +t_rebalance_offline_restarts(init, Config) -> + Apps = [appspec(emqx_durable_storage)], + Specs = emqx_cth_cluster:mk_nodespecs( + [ + {t_rebalance_offline_restarts1, #{apps => Apps}}, + {t_rebalance_offline_restarts2, #{apps => Apps}}, + {t_rebalance_offline_restarts3, #{apps => Apps}} + ], + #{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)} + ), + Nodes = emqx_cth_cluster:start(Specs), + [{nodes, Nodes}, {nodespecs, Specs} | Config]; +t_rebalance_offline_restarts('end', Config) -> + ok = emqx_cth_cluster:stop(?config(nodes, Config)). + +t_rebalance_offline_restarts(Config) -> + %% This testcase verifies that rebalancing progresses if nodes restart or + %% go offline and never come back. + + Nodes = [N1, N2, N3] = ?config(nodes, Config), + _Specs = [NS1, NS2, _] = ?config(nodespecs, Config), + + %% Initialize DB on all 3 nodes. + Opts = opts(#{n_shards => 8, n_sites => 3, replication_factor => 3}), + ?assertEqual( + [{ok, ok} || _ <- Nodes], + erpc:multicall(Nodes, emqx_ds, open_db, [?DB, Opts]) + ), + ?retry( + 1000, + 5, + ?assertEqual([8 || _ <- Nodes], [n_shards_online(N, ?DB) || N <- Nodes]) + ), + + %% Find out which sites are there. + Sites = [S1, S2, S3] = [ds_repl_meta(N, this_site) || N <- Nodes], + ct:pal("Sites: ~p~n", [Sites]), + + %% Shut down N3 and then remove it from the DB. + ok = emqx_cth_cluster:stop_node(N3), + ?assertEqual(ok, ds_repl_meta(N1, leave_db_site, [?DB, S3])), + Transitions = transitions(N1, ?DB), + ct:pal("Transitions: ~p~n", [Transitions]), + + %% Wait until at least one transition completes. + ?block_until(#{?snk_kind := dsrepl_shard_transition_end}), + + %% Restart N1 and N2. + [N1] = emqx_cth_cluster:restart(NS1), + [N2] = emqx_cth_cluster:restart(NS2), + ?assertEqual( + [{ok, ok}, {ok, ok}], + erpc:multicall([N1, N2], emqx_ds, open_db, [?DB, Opts]) + ), + + %% Target state should still be reached eventually. + ?retry(1000, 20, ?assertEqual([], transitions(N1, ?DB))), + ?assertEqual(lists:sort([S1, S2]), ds_repl_meta(N1, db_sites, [?DB])). + +%% + +shard_server_info(Node, DB, Shard, Site, Info) -> + Server = shard_server(Node, DB, Shard, Site), + {Server, ds_repl_shard(Node, server_info, [Info, Server])}. + +shard_server(Node, DB, Shard, Site) -> + ds_repl_shard(Node, shard_server, [DB, Shard, Site]). + +ds_repl_meta(Node, Fun) -> + ds_repl_meta(Node, Fun, []). + +ds_repl_meta(Node, Fun, Args) -> + erpc:call(Node, emqx_ds_replication_layer_meta, Fun, Args). + +ds_repl_shard(Node, Fun, Args) -> + erpc:call(Node, emqx_ds_replication_layer_shard, Fun, Args). + +transitions(Node, DB) -> + Shards = shards(Node, DB), + [{S, T} || S <- Shards, T <- ds_repl_meta(Node, replica_set_transitions, [DB, S])]. + +shards(Node, DB) -> + erpc:call(Node, emqx_ds_replication_layer_meta, shards, [DB]). + +shards_online(Node, DB) -> + erpc:call(Node, emqx_ds_builtin_db_sup, which_shards, [DB]). + +n_shards_online(Node, DB) -> + length(shards_online(Node, DB)). + +fill_storage(Node, DB, NMsgs, Opts) -> + fill_storage(Node, DB, NMsgs, 0, Opts). 
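+
+%% NOTE (editor's note, not part of the original patch): `Opts' understood by
+%% `fill_storage': `client_id' fixes the message origin (a generic default is
+%% used otherwise), and `p_addgen' is the per-message probability of forcing a
+%% new generation, which makes subsequent rebalances exercise snapshot
+%% transfers.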
+
+fill_storage(Node, DB, NMsgs, I, Opts) when I < NMsgs ->
+    PAddGen = maps:get(p_addgen, Opts, 0.001),
+    R1 = push_message(Node, DB, I, Opts),
+    R2 = probably(PAddGen, fun() -> add_generation(Node, DB) end),
+    R1 ++ R2 ++ fill_storage(Node, DB, NMsgs, I + 1, Opts);
+fill_storage(_Node, _DB, NMsgs, NMsgs, _Opts) ->
+    [].
+
+push_message(Node, DB, I, Opts) ->
+    Topic = emqx_topic:join([<<"topic">>, <<"foo">>, integer_to_binary(I)]),
+    {Bytes, _} = rand:bytes_s(120, rand:seed_s(default, I)),
+    ClientId = maps:get(client_id, Opts, <<"client">>),
+    Message = message(ClientId, Topic, Bytes, I * 100),
+    ok = erpc:call(Node, emqx_ds, store_batch, [DB, [Message], #{sync => true}]),
+    [Message].
+
+add_generation(Node, DB) ->
+    ok = erpc:call(Node, emqx_ds, add_generation, [DB]),
+    [].
+
+message(ClientId, Topic, Payload, PublishedAt) ->
+    #message{
+        from = ClientId,
+        topic = Topic,
+        payload = Payload,
+        timestamp = PublishedAt,
+        id = emqx_guid:gen()
+    }.
+
+compare_message(M1, M2) ->
+    {M1#message.from, M1#message.timestamp} < {M2#message.from, M2#message.timestamp}.
+
+consume(Node, DB, TopicFilter, StartTime) ->
+    erpc:call(Node, emqx_ds_test_helpers, consume, [DB, TopicFilter, StartTime]).
+
+consume_shard(Node, DB, Shard, TopicFilter, StartTime) ->
+    erpc:call(Node, emqx_ds_test_helpers, storage_consume, [{DB, Shard}, TopicFilter, StartTime]).
+
+probably(P, Fun) ->
+    case rand:uniform() of
+        X when X < P -> Fun();
+        _ -> []
+    end.
+
+sample(N, List) ->
+    L = length(List),
+    H = N div 2,
+    Filler = integer_to_list(L - N) ++ " more",
+    lists:sublist(List, H) ++ [Filler] ++ lists:sublist(List, L - H, L).
+
+%%
+
+suite() -> [{timetrap, {seconds, 60}}].
+
+all() -> emqx_common_test_helpers:all(?MODULE).
+
+init_per_testcase(TCName, Config0) ->
+    Config = emqx_common_test_helpers:init_per_testcase(?MODULE, TCName, Config0),
+    ok = snabbkaffe:start_trace(),
+    Config.
+
+end_per_testcase(TCName, Config) ->
+    ok = snabbkaffe:stop(),
+    emqx_common_test_helpers:end_per_testcase(?MODULE, TCName, Config).
diff --git a/apps/emqx_durable_storage/test/emqx_ds_storage_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_storage_SUITE.erl
new file mode 100644
index 000000000..eaddab0c6
--- /dev/null
+++ b/apps/emqx_durable_storage/test/emqx_ds_storage_SUITE.erl
@@ -0,0 +1,148 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+-module(emqx_ds_storage_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("emqx/include/emqx.hrl").
+-include_lib("common_test/include/ct.hrl").
+-include_lib("stdlib/include/assert.hrl").
+
+opts() ->
+    #{storage => {emqx_ds_storage_bitfield_lts, #{}}}.
+
+%%
+
+t_idempotent_store_batch(_Config) ->
+    Shard = {?FUNCTION_NAME, _ShardId = <<"42">>},
+    {ok, Pid} = emqx_ds_storage_layer:start_link(Shard, opts()),
+    %% Push some messages to the shard.
+    Msgs1 = [gen_message(N) || N <- lists:seq(10, 20)],
+    GenTs = 30,
+    Msgs2 = [gen_message(N) || N <- lists:seq(40, 50)],
+    ?assertEqual(ok, emqx_ds_storage_layer:store_batch(Shard, batch(Msgs1), #{})),
+    %% Add new generation and push the same batch + some more.
+    ?assertEqual(ok, emqx_ds_storage_layer:add_generation(Shard, GenTs)),
+    ?assertEqual(ok, emqx_ds_storage_layer:store_batch(Shard, batch(Msgs1), #{})),
+    ?assertEqual(ok, emqx_ds_storage_layer:store_batch(Shard, batch(Msgs2), #{})),
+    %% First batch should have been handled idempotently.
+    ?assertEqual(
+        Msgs1 ++ Msgs2,
+        lists:keysort(#message.timestamp, emqx_ds_test_helpers:storage_consume(Shard, ['#']))
+    ),
+    ok = stop_shard(Pid).
+
+t_snapshot_take_restore(_Config) ->
+    Shard = {?FUNCTION_NAME, _ShardId = <<"42">>},
+    {ok, Pid} = emqx_ds_storage_layer:start_link(Shard, opts()),
+
+    %% Push some messages to the shard.
+    Msgs1 = [gen_message(N) || N <- lists:seq(1000, 2000)],
+    ?assertEqual(ok, emqx_ds_storage_layer:store_batch(Shard, batch(Msgs1), #{})),
+
+    %% Add new generation and push some more.
+    ?assertEqual(ok, emqx_ds_storage_layer:add_generation(Shard, 3000)),
+    Msgs2 = [gen_message(N) || N <- lists:seq(4000, 5000)],
+    ?assertEqual(ok, emqx_ds_storage_layer:store_batch(Shard, batch(Msgs2), #{})),
+    ?assertEqual(ok, emqx_ds_storage_layer:add_generation(Shard, 6000)),
+
+    %% Take a snapshot of the shard.
+    {ok, SnapReader} = emqx_ds_storage_layer:take_snapshot(Shard),
+
+    %% Push even more messages to the shard AFTER taking the snapshot.
+    Msgs3 = [gen_message(N) || N <- lists:seq(7000, 8000)],
+    ?assertEqual(ok, emqx_ds_storage_layer:store_batch(Shard, batch(Msgs3), #{})),
+
+    %% Destroy the shard.
+    ok = stop_shard(Pid),
+    ok = emqx_ds_storage_layer:drop_shard(Shard),
+
+    %% Restore the shard from the snapshot.
+    {ok, SnapWriter} = emqx_ds_storage_layer:accept_snapshot(Shard),
+    ?assertEqual(ok, transfer_snapshot(SnapReader, SnapWriter)),
+
+    %% Verify that the restored shard contains the messages up until the snapshot.
+    {ok, _Pid} = emqx_ds_storage_layer:start_link(Shard, opts()),
+    ?assertEqual(
+        Msgs1 ++ Msgs2,
+        lists:keysort(#message.timestamp, emqx_ds_test_helpers:storage_consume(Shard, ['#']))
+    ).
+
+transfer_snapshot(Reader, Writer) ->
+    ChunkSize = rand:uniform(1024),
+    ReadResult = emqx_ds_storage_snapshot:read_chunk(Reader, ChunkSize),
+    ?assertMatch({RStatus, _, _} when RStatus == next; RStatus == last, ReadResult),
+    {RStatus, Chunk, NReader} = ReadResult,
+    Data = iolist_to_binary(Chunk),
+    {WStatus, NWriter} = emqx_ds_storage_snapshot:write_chunk(Writer, Data),
+    %% Verify idempotency.
+    ?assertMatch(
+        {WStatus, NWriter},
+        emqx_ds_storage_snapshot:write_chunk(NWriter, Data)
+    ),
+    %% Verify convergence.
+    ?assertEqual(
+        RStatus,
+        WStatus,
+        #{reader => NReader, writer => NWriter}
+    ),
+    case WStatus of
+        last ->
+            ?assertEqual(ok, emqx_ds_storage_snapshot:release_reader(NReader)),
+            ?assertEqual(ok, emqx_ds_storage_snapshot:release_writer(NWriter)),
+            ok;
+        next ->
+            transfer_snapshot(NReader, NWriter)
+    end.
+
+%%
+
+batch(Msgs) ->
+    [{emqx_message:timestamp(Msg), Msg} || Msg <- Msgs].
+
+gen_message(N) ->
+    Topic = emqx_topic:join([<<"foo">>, <<"bar">>, integer_to_binary(N)]),
+    message(Topic, crypto:strong_rand_bytes(16), N).
+
+message(Topic, Payload, PublishedAt) ->
+    #message{
+        from = <<"client">>,
+        topic = Topic,
+        payload = Payload,
+        timestamp = PublishedAt,
+        id = emqx_guid:gen()
+    }.
+
+stop_shard(Pid) ->
+    _ = unlink(Pid),
+    proc_lib:stop(Pid, shutdown, infinity).
+ +%% + +all() -> emqx_common_test_helpers:all(?MODULE). + +init_per_testcase(TCName, Config) -> + WorkDir = emqx_cth_suite:work_dir(TCName, Config), + Apps = emqx_cth_suite:start( + [{emqx_durable_storage, #{override_env => [{db_data_dir, WorkDir}]}}], + #{work_dir => WorkDir} + ), + [{apps, Apps} | Config]. + +end_per_testcase(_TCName, Config) -> + ok = emqx_cth_suite:stop(?config(apps, Config)), + ok. diff --git a/apps/emqx_durable_storage/test/emqx_ds_storage_bitfield_lts_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_storage_bitfield_lts_SUITE.erl index 636b57b89..78838e675 100644 --- a/apps/emqx_durable_storage/test/emqx_ds_storage_bitfield_lts_SUITE.erl +++ b/apps/emqx_durable_storage/test/emqx_ds_storage_bitfield_lts_SUITE.erl @@ -261,8 +261,7 @@ t_atomic_store_batch(_Config) -> sync => true }) ), - - ok + timer:sleep(1000) end, fun(Trace) -> %% Must contain exactly one flush with all messages. @@ -293,19 +292,18 @@ t_non_atomic_store_batch(_Config) -> sync => true }) ), - - ok + Msgs end, - fun(Trace) -> - %% Should contain one flush per message. - ?assertMatch( - [#{batch := [_]}, #{batch := [_]}, #{batch := [_]}], - ?of_kind(emqx_ds_replication_layer_egress_flush, Trace) + fun(ExpectedMsgs, Trace) -> + ProcessedMsgs = lists:append( + ?projection(batch, ?of_kind(emqx_ds_replication_layer_egress_flush, Trace)) ), - ok + ?assertEqual( + ExpectedMsgs, + ProcessedMsgs + ) end - ), - ok. + ). check(Shard, TopicFilter, StartTime, ExpectedMessages) -> ExpectedFiltered = lists:filter( diff --git a/apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl b/apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl index d26c6dd30..be4f7bcdf 100644 --- a/apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl +++ b/apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl @@ -56,3 +56,71 @@ mock_rpc_result(gen_rpc, ExpectFun) -> {badrpc, timeout} end end). + +%% Consuming streams and iterators + +consume(DB, TopicFilter) -> + consume(DB, TopicFilter, 0). + +consume(DB, TopicFilter, StartTime) -> + lists:flatmap( + fun({_Stream, Msgs}) -> + Msgs + end, + consume_per_stream(DB, TopicFilter, StartTime) + ). + +consume_per_stream(DB, TopicFilter, StartTime) -> + Streams = emqx_ds:get_streams(DB, TopicFilter, StartTime), + lists:map( + fun({_Rank, Stream}) -> {Stream, consume_stream(DB, Stream, TopicFilter, StartTime)} end, + Streams + ). + +consume_stream(DB, Stream, TopicFilter, StartTime) -> + {ok, It0} = emqx_ds:make_iterator(DB, Stream, TopicFilter, StartTime), + {ok, _It, Msgs} = consume_iter(DB, It0), + Msgs. + +consume_iter(DB, It) -> + consume_iter(DB, It, #{}). + +consume_iter(DB, It, Opts) -> + consume_iter_with(fun emqx_ds:next/3, [DB], It, Opts). + +storage_consume(ShardId, TopicFilter) -> + storage_consume(ShardId, TopicFilter, 0). + +storage_consume(ShardId, TopicFilter, StartTime) -> + Streams = emqx_ds_storage_layer:get_streams(ShardId, TopicFilter, StartTime), + lists:flatmap( + fun({_Rank, Stream}) -> + storage_consume_stream(ShardId, Stream, TopicFilter, StartTime) + end, + Streams + ). + +storage_consume_stream(ShardId, Stream, TopicFilter, StartTime) -> + {ok, It0} = emqx_ds_storage_layer:make_iterator(ShardId, Stream, TopicFilter, StartTime), + {ok, _It, Msgs} = storage_consume_iter(ShardId, It0), + Msgs. + +storage_consume_iter(ShardId, It) -> + storage_consume_iter(ShardId, It, #{}). + +storage_consume_iter(ShardId, It, Opts) -> + consume_iter_with(fun emqx_ds_storage_layer:next/3, [ShardId], It, Opts). 
+ +consume_iter_with(NextFun, Args, It0, Opts) -> + BatchSize = maps:get(batch_size, Opts, 5), + case erlang:apply(NextFun, Args ++ [It0, BatchSize]) of + {ok, It, _Msgs = []} -> + {ok, It, []}; + {ok, It1, Batch} -> + {ok, It, Msgs} = consume_iter_with(NextFun, Args, It1, Opts), + {ok, It, [Msg || {_DSKey, Msg} <- Batch] ++ Msgs}; + {ok, Eos = end_of_stream} -> + {ok, Eos, []}; + {error, Class, Reason} -> + error({error, Class, Reason}) + end. diff --git a/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src b/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src index 10a464f26..7e692bf9c 100644 --- a/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src +++ b/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src @@ -9,7 +9,8 @@ {applications, [ kernel, stdlib, - emqx_ctl + emqx_ctl, + emqx ]}, {mod, {emqx_eviction_agent_app, []}}, {env, []}, diff --git a/apps/emqx_gateway/src/emqx_gateway.app.src b/apps/emqx_gateway/src/emqx_gateway.app.src index 731a1807c..3c6634edc 100644 --- a/apps/emqx_gateway/src/emqx_gateway.app.src +++ b/apps/emqx_gateway/src/emqx_gateway.app.src @@ -1,7 +1,7 @@ %% -*- mode: erlang -*- {application, emqx_gateway, [ {description, "The Gateway management application"}, - {vsn, "0.1.31"}, + {vsn, "0.1.32"}, {registered, []}, {mod, {emqx_gateway_app, []}}, {applications, [kernel, stdlib, emqx, emqx_auth, emqx_ctl]}, diff --git a/apps/emqx_gateway/src/emqx_gateway_api_listeners.erl b/apps/emqx_gateway/src/emqx_gateway_api_listeners.erl index 22d76fe60..30e9762e4 100644 --- a/apps/emqx_gateway/src/emqx_gateway_api_listeners.erl +++ b/apps/emqx_gateway/src/emqx_gateway_api_listeners.erl @@ -247,9 +247,10 @@ page_params(Qs) -> get_cluster_listeners_info(GwName) -> Listeners = emqx_gateway_conf:listeners(GwName), ListenOns = lists:map( - fun(#{id := Id} = Conf) -> + fun(#{id := Id, type := Type0} = Conf) -> + Type = binary_to_existing_atom(Type0), ListenOn = emqx_gateway_conf:get_bind(Conf), - {Id, ListenOn} + {Type, Id, ListenOn} end, Listeners ), @@ -293,17 +294,11 @@ listeners_cluster_status(Listeners) -> do_listeners_cluster_status(Listeners) -> Node = node(), lists:foldl( - fun({Id, ListenOn}, Acc) -> - BinId = erlang:atom_to_binary(Id), - {ok, #{<<"max_connections">> := Max}} = emqx_gateway_conf:listener(BinId), - {Running, Curr} = - try esockd:get_current_connections({Id, ListenOn}) of - Int -> {true, Int} - catch - %% not started - error:not_found -> - {false, 0} - end, + fun({Type, Id, ListenOn}, Acc) -> + {Running, Curr} = current_listener_status(Type, Id, ListenOn), + {ok, #{<<"max_connections">> := Max}} = emqx_gateway_conf:listener( + erlang:atom_to_binary(Id) + ), Acc#{ Id => #{ node => Node, @@ -319,6 +314,24 @@ do_listeners_cluster_status(Listeners) -> Listeners ). +current_listener_status(Type, Id, _ListenOn) when Type =:= ws; Type =:= wss -> + Info = ranch:info(Id), + Conns = proplists:get_value(all_connections, Info, 0), + Running = + case proplists:get_value(status, Info) of + running -> true; + _ -> false + end, + {Running, Conns}; +current_listener_status(_Type, Id, ListenOn) -> + try esockd:get_current_connections({Id, ListenOn}) of + Int -> {true, Int} + catch + %% not started + error:not_found -> + {false, 0} + end. 
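+
+%% NOTE (editor's note, not part of the original patch): ws/wss gateway
+%% listeners are managed by cowboy/ranch rather than esockd, hence the
+%% dedicated `ranch:info/1' branch above for the running status and the
+%% connection count.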
+ ensure_integer_or_infinity(infinity) -> infinity; ensure_integer_or_infinity(<<"infinity">>) -> @@ -762,9 +775,9 @@ examples_listener() -> <<"tlsv1.1">>, <<"tlsv1">> ], - cacertfile => <<"/etc/emqx/certs/cacert.pem">>, - certfile => <<"/etc/emqx/certs/cert.pem">>, - keyfile => <<"/etc/emqx/certs/key.pem">>, + cacertfile => <<"${EMQX_ETC_DIR}/certs/cacert.pem">>, + certfile => <<"${EMQX_ETC_DIR}/certs/cert.pem">>, + keyfile => <<"${EMQX_ETC_DIR}/certs/key.pem">>, verify => <<"verify_none">>, fail_if_no_peer_cert => false }, @@ -808,9 +821,9 @@ examples_listener() -> dtls_options => #{ versions => [<<"dtlsv1.2">>, <<"dtlsv1">>], - cacertfile => <<"/etc/emqx/certs/cacert.pem">>, - certfile => <<"/etc/emqx/certs/cert.pem">>, - keyfile => <<"/etc/emqx/certs/key.pem">>, + cacertfile => <<"${EMQX_ETC_DIR}/certs/cacert.pem">>, + certfile => <<"${EMQX_ETC_DIR}/certs/cert.pem">>, + keyfile => <<"${EMQX_ETC_DIR}/certs/key.pem">>, verify => <<"verify_none">>, fail_if_no_peer_cert => false }, @@ -835,9 +848,9 @@ examples_listener() -> dtls_options => #{ versions => [<<"dtlsv1.2">>, <<"dtlsv1">>], - cacertfile => <<"/etc/emqx/certs/cacert.pem">>, - certfile => <<"/etc/emqx/certs/cert.pem">>, - keyfile => <<"/etc/emqx/certs/key.pem">>, + cacertfile => <<"${EMQX_ETC_DIR}/certs/cacert.pem">>, + certfile => <<"${EMQX_ETC_DIR}/certs/cert.pem">>, + keyfile => <<"${EMQX_ETC_DIR}/certs/key.pem">>, verify => <<"verify_none">>, user_lookup_fun => <<"emqx_tls_psk:lookup">>, ciphers => @@ -869,5 +882,95 @@ examples_listener() -> user_id_type => <<"username">> } } + }, + ws_listener => + #{ + summary => <<"A simple WebSocket listener example">>, + value => + #{ + name => <<"ws-def">>, + type => <<"ws">>, + bind => <<"33043">>, + acceptors => 16, + max_connections => 1024000, + max_conn_rate => 1000, + websocket => + #{ + path => <<"/ocpp">>, + fail_if_no_subprotocol => true, + supported_subprotocols => <<"ocpp1.6">>, + check_origin_enable => false, + check_origins => + <<"http://localhost:18083, http://127.0.0.1:18083">>, + compress => false, + piggyback => <<"single">> + }, + tcp_options => + #{ + active_n => 100, + backlog => 1024, + send_timeout => <<"15s">>, + send_timeout_close => true, + recbuf => <<"10KB">>, + sndbuf => <<"10KB">>, + buffer => <<"10KB">>, + high_watermark => <<"1MB">>, + nodelay => false, + reuseaddr => true, + keepalive => "none" + } + } + }, + wss_listener => + #{ + summary => <<"A simple WebSocket/TLS listener example">>, + value => + #{ + name => <<"ws-ssl-def">>, + type => <<"wss">>, + bind => <<"33053">>, + acceptors => 16, + max_connections => 1024000, + max_conn_rate => 1000, + websocket => + #{ + path => <<"/ocpp">>, + fail_if_no_subprotocol => true, + supported_subprotocols => <<"ocpp1.6">>, + check_origin_enable => false, + check_origins => + <<"http://localhost:18083, http://127.0.0.1:18083">>, + compress => false, + piggyback => <<"single">> + }, + ssl_options => + #{ + versions => [ + <<"tlsv1.3">>, + <<"tlsv1.2">>, + <<"tlsv1.1">>, + <<"tlsv1">> + ], + cacertfile => <<"${EMQX_ETC_DIR}/certs/cacert.pem">>, + certfile => <<"${EMQX_ETC_DIR}/certs/cert.pem">>, + keyfile => <<"${EMQX_ETC_DIR}/certs/key.pem">>, + verify => <<"verify_none">>, + fail_if_no_peer_cert => false + }, + tcp_options => + #{ + active_n => 100, + backlog => 1024, + send_timeout => <<"15s">>, + send_timeout_close => true, + recbuf => <<"10KB">>, + sndbuf => <<"10KB">>, + buffer => <<"10KB">>, + high_watermark => <<"1MB">>, + nodelay => false, + reuseaddr => true, + keepalive => "none" + } + } } }. 
diff --git a/apps/emqx_gateway_ocpp/include/emqx_ocpp.hrl b/apps/emqx_gateway_ocpp/include/emqx_ocpp.hrl index dc779dc76..805c5f6f4 100644 --- a/apps/emqx_gateway_ocpp/include/emqx_ocpp.hrl +++ b/apps/emqx_gateway_ocpp/include/emqx_ocpp.hrl @@ -86,10 +86,9 @@ -define(IS_ERROR(F), F = #{type := ?OCPP_MSG_TYPE_ID_CALLERROR}). -define(IS_ERROR(F, Id), F = #{type := ?OCPP_MSG_TYPE_ID_CALLERROR, id := Id}). --define(IS_BootNotification_RESP(Payload), #{ +-define(IS_BootNotification_RESP(Status, Interval), #{ type := ?OCPP_MSG_TYPE_ID_CALLRESULT, - action := ?OCPP_ACT_BootNotification, - payload := Payload + payload := #{<<"status">> := Status, <<"interval">> := Interval} }). -define(ERR_FRAME(Id, Code, Desc), #{ diff --git a/apps/emqx_gateway_ocpp/src/emqx_gateway_ocpp.app.src b/apps/emqx_gateway_ocpp/src/emqx_gateway_ocpp.app.src index c7981a033..8682c164c 100644 --- a/apps/emqx_gateway_ocpp/src/emqx_gateway_ocpp.app.src +++ b/apps/emqx_gateway_ocpp/src/emqx_gateway_ocpp.app.src @@ -1,6 +1,6 @@ {application, emqx_gateway_ocpp, [ {description, "OCPP-J 1.6 Gateway for EMQX"}, - {vsn, "0.1.3"}, + {vsn, "0.1.4"}, {registered, []}, {applications, [kernel, stdlib, jesse, emqx, emqx_gateway]}, {env, []}, diff --git a/apps/emqx_gateway_ocpp/src/emqx_ocpp_channel.erl b/apps/emqx_gateway_ocpp/src/emqx_ocpp_channel.erl index cb8ec7e91..d20b35d04 100644 --- a/apps/emqx_gateway_ocpp/src/emqx_ocpp_channel.erl +++ b/apps/emqx_gateway_ocpp/src/emqx_ocpp_channel.erl @@ -527,20 +527,19 @@ apply_frame(Frames, Channel) when is_list(Frames) -> {Outgoings, NChannel} = lists:foldl(fun do_apply_frame/2, {[], Channel}, Frames), {lists:reverse(Outgoings), NChannel}; apply_frame(Frames, Channel) -> - ?SLOG(error, #{msg => "unexpected_frame_list", frames => Frames, channel => Channel}), + ?SLOG(error, #{msg => "unexpected_frame_list", frames => Frames}), Channel. -do_apply_frame(?IS_BootNotification_RESP(Payload), {Outgoings, Channel}) -> - case maps:get(<<"status">>, Payload) of +do_apply_frame(?IS_BootNotification_RESP(Status, Interval), {Outgoings, Channel}) -> + case Status of <<"Accepted">> -> - Intv = maps:get(<<"interval">>, Payload), - ?SLOG(info, #{msg => "adjust_heartbeat_timer", new_interval_s => Intv}), - {[{event, updated} | Outgoings], reset_keepalive(Intv, Channel)}; + ?SLOG(info, #{msg => "adjust_heartbeat_timer", new_interval_s => Interval}), + {[{event, updated} | Outgoings], reset_keepalive(Interval, Channel)}; _ -> {Outgoings, Channel} end; -do_apply_frame(Frame, Acc = {_Outgoings, Channel}) -> - ?SLOG(error, #{msg => "unexpected_frame", frame => Frame, channel => Channel}), +do_apply_frame(Frame, Acc = {_Outgoings, _Channel}) -> + ?SLOG(info, #{msg => "skip_to_apply_frame", frame => Frame}), Acc. 
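+
+%% NOTE (editor's sketch, not part of the original patch): a frame matched by
+%% ?IS_BootNotification_RESP/2 above looks roughly like this:
+%%
+%%   #{type => ?OCPP_MSG_TYPE_ID_CALLRESULT,
+%%     id => <<"3335862321">>,
+%%     payload => #{<<"status">> => <<"Accepted">>, <<"interval">> => 300}}
+%%
+%% An <<"Accepted">> status resets the keepalive timer to the advertised
+%% interval (see t_adjust_keepalive_timer in emqx_ocpp_SUITE).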
%%-------------------------------------------------------------------- @@ -762,19 +761,15 @@ payload2frame(#{ action => Action, payload => Payload }; -payload2frame( - MqttPayload = - #{ - <<"MessageTypeId">> := ?OCPP_MSG_TYPE_ID_CALLRESULT, - <<"UniqueId">> := Id, - <<"Payload">> := Payload - } -) -> - Action = maps:get(<<"Action">>, MqttPayload, undefined), +payload2frame(#{ + <<"MessageTypeId">> := ?OCPP_MSG_TYPE_ID_CALLRESULT, + <<"UniqueId">> := Id, + <<"Payload">> := Payload +}) -> #{ type => ?OCPP_MSG_TYPE_ID_CALLRESULT, id => Id, - action => Action, + action => undefined, payload => Payload }; payload2frame(#{ diff --git a/apps/emqx_gateway_ocpp/test/emqx_ocpp_SUITE.erl b/apps/emqx_gateway_ocpp/test/emqx_ocpp_SUITE.erl index 6d00726cf..e63f8891d 100644 --- a/apps/emqx_gateway_ocpp/test/emqx_ocpp_SUITE.erl +++ b/apps/emqx_gateway_ocpp/test/emqx_ocpp_SUITE.erl @@ -16,6 +16,7 @@ -module(emqx_ocpp_SUITE). +-include("emqx_ocpp.hrl"). -include_lib("eunit/include/eunit.hrl"). -include_lib("common_test/include/ct.hrl"). @@ -145,3 +146,136 @@ t_enable_disable_gw_ocpp(_Config) -> AssertEnabled(false), ?assertEqual({204, #{}}, request(put, "/gateways/ocpp/enable/true", <<>>)), AssertEnabled(true). + +t_adjust_keepalive_timer(_Config) -> + {ok, ClientPid} = connect("127.0.0.1", 33033, <<"client1">>), + UniqueId = <<"3335862321">>, + BootNotification = #{ + id => UniqueId, + type => ?OCPP_MSG_TYPE_ID_CALL, + action => <<"BootNotification">>, + payload => #{ + <<"chargePointVendor">> => <<"vendor1">>, + <<"chargePointModel">> => <<"model1">> + } + }, + ok = send_msg(ClientPid, BootNotification), + %% check the default keepalive timer + timer:sleep(1000), + ?assertMatch( + #{conninfo := #{keepalive := 60}}, emqx_gateway_cm:get_chan_info(ocpp, <<"client1">>) + ), + %% publish the BootNotification.ack + AckPayload = emqx_utils_json:encode(#{ + <<"MessageTypeId">> => ?OCPP_MSG_TYPE_ID_CALLRESULT, + <<"UniqueId">> => UniqueId, + <<"Payload">> => #{ + <<"currentTime">> => "2023-06-21T14:20:39+00:00", + <<"interval">> => 300, + <<"status">> => <<"Accepted">> + } + }), + _ = emqx:publish(emqx_message:make(<<"ocpp/cs/client1">>, AckPayload)), + {ok, _Resp} = receive_msg(ClientPid), + %% assert: check the keepalive timer is adjusted + ?assertMatch( + #{conninfo := #{keepalive := 300}}, emqx_gateway_cm:get_chan_info(ocpp, <<"client1">>) + ), + %% close conns + close(ClientPid), + timer:sleep(1000), + %% assert: + ?assertEqual(undefined, emqx_gateway_cm:get_chan_info(ocpp, <<"client1">>)), + ok. 
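For reference, the downlink MQTT payload shape that payload2frame/1 accepts for a CALLRESULT after this change, using the values from the keepalive test above:

%% #{<<"MessageTypeId">> => 3,                          %% CALLRESULT
%%   <<"UniqueId">>      => <<"3335862321">>,
%%   <<"Payload">>       => #{<<"status">>   => <<"Accepted">>,
%%                            <<"interval">> => 300}}
%%
%% An <<"Action">> key is no longer consulted: the resulting frame's
%% action is always `undefined' for CALLRESULT.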
+
+t_listeners_status(_Config) ->
+    {200, [Listener]} = request(get, "/gateways/ocpp/listeners"),
+    ?assertMatch(
+        #{
+            status := #{running := true, current_connections := 0}
+        },
+        Listener
+    ),
+    %% add a connection
+    {ok, ClientPid} = connect("127.0.0.1", 33033, <<"client1">>),
+    UniqueId = <<"3335862321">>,
+    BootNotification = #{
+        id => UniqueId,
+        type => ?OCPP_MSG_TYPE_ID_CALL,
+        action => <<"BootNotification">>,
+        payload => #{
+            <<"chargePointVendor">> => <<"vendor1">>,
+            <<"chargePointModel">> => <<"model1">>
+        }
+    },
+    ok = send_msg(ClientPid, BootNotification),
+    timer:sleep(1000),
+    %% assert: the current_connections is 1
+    {200, [Listener1]} = request(get, "/gateways/ocpp/listeners"),
+    ?assertMatch(
+        #{
+            status := #{running := true, current_connections := 1}
+        },
+        Listener1
+    ),
+    %% close conns
+    close(ClientPid),
+    timer:sleep(1000),
+    %% assert: the current_connections is 0
+    {200, [Listener2]} = request(get, "/gateways/ocpp/listeners"),
+    ?assertMatch(
+        #{
+            status := #{running := true, current_connections := 0}
+        },
+        Listener2
+    ).
+
+%%--------------------------------------------------------------------
+%% ocpp simple client
+
+connect(Host, Port, ClientId) ->
+    Timeout = 5000,
+    ConnOpts = #{connect_timeout => 5000},
+    case gun:open(Host, Port, ConnOpts) of
+        {ok, ConnPid} ->
+            {ok, _} = gun:await_up(ConnPid, Timeout),
+            case upgrade(ConnPid, ClientId, Timeout) of
+                {ok, _Headers} -> {ok, ConnPid};
+                Error -> Error
+            end;
+        Error ->
+            Error
+    end.
+
+upgrade(ConnPid, ClientId, Timeout) ->
+    Path = binary_to_list(<<"/ocpp/", ClientId/binary>>),
+    WsHeaders = [{<<"cache-control">>, <<"no-cache">>}],
+    StreamRef = gun:ws_upgrade(ConnPid, Path, WsHeaders, #{protocols => [{<<"ocpp1.6">>, gun_ws_h}]}),
+    receive
+        {gun_upgrade, ConnPid, StreamRef, [<<"websocket">>], Headers} ->
+            {ok, Headers};
+        {gun_response, ConnPid, _, _, Status, Headers} ->
+            {error, {ws_upgrade_failed, Status, Headers}};
+        {gun_error, ConnPid, StreamRef, Reason} ->
+            {error, {ws_upgrade_failed, Reason}}
+    after Timeout ->
+        {error, timeout}
+    end.
+
+send_msg(ConnPid, Frame) when is_map(Frame) ->
+    Opts = emqx_ocpp_frame:serialize_opts(),
+    Msg = emqx_ocpp_frame:serialize_pkt(Frame, Opts),
+    gun:ws_send(ConnPid, {text, Msg}).
+
+receive_msg(ConnPid) ->
+    receive
+        {gun_ws, ConnPid, _Ref, {_Type, Msg}} ->
+            ParseState = emqx_ocpp_frame:initial_parse_state(#{}),
+            {ok, Frame, _Rest, _NewParseState} = emqx_ocpp_frame:parse(Msg, ParseState),
+            {ok, Frame}
+    after 5000 ->
+        {error, timeout}
+    end.
+
+close(ConnPid) ->
+    gun:shutdown(ConnPid).
diff --git a/apps/emqx_gateway_stomp/src/emqx_gateway_stomp.app.src b/apps/emqx_gateway_stomp/src/emqx_gateway_stomp.app.src index 08214aee2..c7c9b6143 100644 --- a/apps/emqx_gateway_stomp/src/emqx_gateway_stomp.app.src +++ b/apps/emqx_gateway_stomp/src/emqx_gateway_stomp.app.src @@ -1,7 +1,7 @@ %% -*- mode: erlang -*- {application, emqx_gateway_stomp, [ {description, "Stomp Gateway"}, - {vsn, "0.1.5"}, + {vsn, "0.1.6"}, {registered, []}, {applications, [kernel, stdlib, emqx, emqx_gateway]}, {env, []}, diff --git a/apps/emqx_gateway_stomp/src/emqx_stomp_channel.erl b/apps/emqx_gateway_stomp/src/emqx_stomp_channel.erl index 20d769378..71458f15e 100644 --- a/apps/emqx_gateway_stomp/src/emqx_stomp_channel.erl +++ b/apps/emqx_gateway_stomp/src/emqx_stomp_channel.erl @@ -1039,7 +1039,7 @@ handle_deliver( {<<"subscription">>, Id}, {<<"message-id">>, next_msgid()}, {<<"destination">>, emqx_message:topic(NMessage)}, - {<<"content-type">>, <<"text/plain">>} + {<<"content-type">>, content_type_from_mqtt_message(NMessage)} ], Headers1 = case Ack of @@ -1080,6 +1080,13 @@ handle_deliver( ), {ok, [{outgoing, lists:reverse(Frames0)}], Channel}. +content_type_from_mqtt_message(Message) -> + Properties = emqx_message:get_header(properties, Message, #{}), + case maps:get('Content-Type', Properties, undefined) of + undefined -> <<"text/plain">>; + ContentType -> ContentType + end. + %%-------------------------------------------------------------------- %% Handle timeout %%-------------------------------------------------------------------- diff --git a/apps/emqx_gateway_stomp/test/emqx_stomp_SUITE.erl b/apps/emqx_gateway_stomp/test/emqx_stomp_SUITE.erl index 44c498405..64d95dc42 100644 --- a/apps/emqx_gateway_stomp/test/emqx_stomp_SUITE.erl +++ b/apps/emqx_gateway_stomp/test/emqx_stomp_SUITE.erl @@ -289,6 +289,67 @@ t_subscribe_inuse(_) -> with_connection(TopicIdInuseViaHttp), with_connection(SubscriptionInuseViaHttp). 
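Reviewer note: the STOMP delivery path above now derives the content-type header from the MQTT message's Content-Type property rather than hard-coding text/plain; the test added below exercises this end to end. The lookup condenses to (sketch only, assuming the property is never explicitly set to `undefined'):

%% content_type(#{'Content-Type' => <<"application/json">>}) => <<"application/json">>
%% content_type(#{})                                         => <<"text/plain">>
content_type(Props) when is_map(Props) ->
    maps:get('Content-Type', Props, <<"text/plain">>).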
+t_receive_from_mqtt_publish(_) -> + with_connection(fun(Sock) -> + ok = send_connection_frame(Sock, <<"guest">>, <<"guest">>), + ?assertMatch({ok, #stomp_frame{command = <<"CONNECTED">>}}, recv_a_frame(Sock)), + + ok = send_subscribe_frame(Sock, 0, <<"/queue/foo">>), + ?assertMatch({ok, #stomp_frame{command = <<"RECEIPT">>}}, recv_a_frame(Sock)), + + %% send mqtt publish with content-type + Msg = emqx_message:make( + _From = from_testsuite, + _QoS = 1, + _Topic = <<"/queue/foo">>, + _Payload = <<"hello">>, + _Flags = #{}, + _Headers = #{properties => #{'Content-Type' => <<"application/json">>}} + ), + emqx:publish(Msg), + + {ok, Frame} = recv_a_frame(Sock), + ?assertEqual( + <<"application/json">>, + proplists:get_value(<<"content-type">>, Frame#stomp_frame.headers) + ), + + ?assertMatch( + #stomp_frame{ + command = <<"MESSAGE">>, + headers = _, + body = <<"hello">> + }, + Frame + ), + lists:foreach( + fun({Key, Val}) -> + Val = proplists:get_value(Key, Frame#stomp_frame.headers) + end, + [ + {<<"destination">>, <<"/queue/foo">>}, + {<<"subscription">>, <<"0">>} + ] + ), + + %% assert subscription stats + [ClientInfo1] = clients(), + ?assertMatch(#{subscriptions_cnt := 1}, ClientInfo1), + + %% Unsubscribe + ok = send_unsubscribe_frame(Sock, 0), + ?assertMatch({ok, #stomp_frame{command = <<"RECEIPT">>}}, recv_a_frame(Sock)), + + %% assert subscription stats + [ClientInfo2] = clients(), + ?assertMatch(#{subscriptions_cnt := 0}, ClientInfo2), + + ok = send_message_frame(Sock, <<"/queue/foo">>, <<"You will not receive this msg">>), + ?assertMatch({ok, #stomp_frame{command = <<"RECEIPT">>}}, recv_a_frame(Sock)), + + {error, timeout} = gen_tcp:recv(Sock, 0, 500) + end). + t_transaction(_) -> with_connection(fun(Sock) -> gen_tcp:send( diff --git a/apps/emqx_license/include/emqx_license.hrl b/apps/emqx_license/include/emqx_license.hrl index bfc1d2cfe..35aa62f5b 100644 --- a/apps/emqx_license/include/emqx_license.hrl +++ b/apps/emqx_license/include/emqx_license.hrl @@ -31,6 +31,7 @@ -define(SMALL_CUSTOMER, 0). -define(MEDIUM_CUSTOMER, 1). -define(LARGE_CUSTOMER, 2). +-define(BUSINESS_CRITICAL_CUSTOMER, 3). -define(EVALUATION_CUSTOMER, 10). -define(EXPIRED_DAY, -90). diff --git a/apps/emqx_license/src/emqx_license.app.src b/apps/emqx_license/src/emqx_license.app.src index 18545cbed..e24a152c7 100644 --- a/apps/emqx_license/src/emqx_license.app.src +++ b/apps/emqx_license/src/emqx_license.app.src @@ -1,6 +1,6 @@ {application, emqx_license, [ {description, "EMQX License"}, - {vsn, "5.0.16"}, + {vsn, "5.0.17"}, {modules, []}, {registered, [emqx_license_sup]}, {applications, [kernel, stdlib, emqx_ctl]}, diff --git a/apps/emqx_license/src/emqx_license.erl b/apps/emqx_license/src/emqx_license.erl index c95ad0e7f..dfb747a96 100644 --- a/apps/emqx_license/src/emqx_license.erl +++ b/apps/emqx_license/src/emqx_license.erl @@ -10,6 +10,7 @@ -include_lib("typerefl/include/types.hrl"). -behaviour(emqx_config_handler). +-behaviour(emqx_config_backup). -export([ pre_config_update/3, @@ -26,6 +27,8 @@ update_setting/1 ]). +-export([import_config/1]). + -define(CONF_KEY_PATH, [license]). %% Give the license app the highest priority. @@ -58,21 +61,20 @@ unload() -> -spec update_key(binary() | string()) -> {ok, emqx_config:update_result()} | {error, emqx_config:update_error()}. 
update_key(Value) when is_binary(Value); is_list(Value) -> - Result = emqx_conf:update( - ?CONF_KEY_PATH, - {key, Value}, - #{rawconf_with_defaults => true, override_to => cluster} - ), + Result = exec_config_update({key, Value}), handle_config_update_result(Result). update_setting(Setting) when is_map(Setting) -> - Result = emqx_conf:update( - ?CONF_KEY_PATH, - {setting, Setting}, - #{rawconf_with_defaults => true, override_to => cluster} - ), + Result = exec_config_update({setting, Setting}), handle_config_update_result(Result). +exec_config_update(Param) -> + emqx_conf:update( + ?CONF_KEY_PATH, + Param, + #{rawconf_with_defaults => true, override_to => cluster} + ). + %%------------------------------------------------------------------------------ %% emqx_hooks %%------------------------------------------------------------------------------ @@ -106,6 +108,17 @@ check(_ConnInfo, AckProps) -> {stop, {error, ?RC_QUOTA_EXCEEDED}} end. +import_config(#{<<"license">> := Config}) -> + OldConf = emqx:get_config(?CONF_KEY_PATH), + case exec_config_update(Config) of + {ok, #{config := NewConf}} -> + Changed = maps:get(changed, emqx_utils_maps:diff_maps(NewConf, OldConf)), + Changed1 = lists:map(fun(Key) -> [license, Key] end, maps:keys(Changed)), + {ok, #{root_key => license, changed => Changed1}}; + Error -> + {error, #{root_key => license, reason => Error}} + end. + %%------------------------------------------------------------------------------ %% emqx_config_handler callbacks %%------------------------------------------------------------------------------ @@ -141,7 +154,16 @@ do_update({key, Content}, Conf) when is_binary(Content); is_list(Content) -> {error, Reason} -> erlang:throw(Reason) end; -do_update({setting, Setting}, Conf) -> +do_update({setting, Setting0}, Conf) -> + #{<<"key">> := Key} = Conf, + %% only allow updating dynamic_max_connections when it's BUSINESS_CRITICAL + Setting = + case emqx_license_parser:is_business_critical(Key) of + true -> + Setting0; + false -> + maps:without([<<"dynamic_max_connections">>], Setting0) + end, maps:merge(Conf, Setting); do_update(NewConf, _PrevConf) -> #{<<"key">> := NewKey} = NewConf, diff --git a/apps/emqx_license/src/emqx_license_checker.erl b/apps/emqx_license/src/emqx_license_checker.erl index 8270e03d2..5d8393037 100644 --- a/apps/emqx_license/src/emqx_license_checker.erl +++ b/apps/emqx_license/src/emqx_license_checker.erl @@ -33,7 +33,9 @@ expiry_epoch/0, purge/0, limits/0, - print_warnings/1 + print_warnings/1, + get_max_connections/1, + get_dynamic_max_connections/0 ]). %% gen_server callbacks @@ -46,21 +48,23 @@ -define(LICENSE_TAB, emqx_license). +-type limits() :: #{max_connections := non_neg_integer() | ?ERR_EXPIRED}. +-type license() :: emqx_license_parser:license(). +-type fetcher() :: fun(() -> {ok, license()} | {error, term()}). + %%------------------------------------------------------------------------------ %% API %%------------------------------------------------------------------------------ --type limits() :: #{max_connections := non_neg_integer() | ?ERR_EXPIRED}. - --spec start_link(emqx_license_parser:license()) -> {ok, pid()}. +-spec start_link(fetcher()) -> {ok, pid()}. start_link(LicenseFetcher) -> start_link(LicenseFetcher, ?CHECK_INTERVAL). --spec start_link(emqx_license_parser:license(), timeout()) -> {ok, pid()}. +-spec start_link(fetcher(), timeout()) -> {ok, pid()}. start_link(LicenseFetcher, CheckInterval) -> gen_server:start_link({local, ?MODULE}, ?MODULE, [LicenseFetcher, CheckInterval], []). 
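Reviewer note on import_config/1 above: the changed-key bookkeeping relies on emqx_utils_maps:diff_maps/2. A self-contained sketch of the same computation with the diff helper inlined (hypothetical stand-in, modeling only the `changed' part):

%% Paths of keys whose values differ between the new and old config.
changed_paths(NewConf, OldConf) ->
    Changed = maps:filter(
        fun(K, V) -> maps:is_key(K, OldConf) andalso maps:get(K, OldConf) =/= V end,
        NewConf
    ),
    [[license, K] || K <- maps:keys(Changed)].

%% changed_paths(#{key => <<"new">>, setting => a}, #{key => <<"old">>, setting => a})
%% => [[license, key]]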
--spec update(emqx_license_parser:license()) -> map(). +-spec update(license()) -> map(). update(License) -> gen_server:call(?MODULE, {update, License}, infinity). @@ -210,8 +214,7 @@ check_license(License) -> DaysLeft = days_left(License), IsOverdue = is_overdue(License, DaysLeft), NeedRestriction = IsOverdue, - MaxConn = emqx_license_parser:max_connections(License), - Limits = limits(License, NeedRestriction), + #{max_connections := MaxConn} = Limits = limits(License, NeedRestriction), true = apply_limits(Limits), #{ warn_evaluation => warn_evaluation(License, NeedRestriction, MaxConn), @@ -223,8 +226,34 @@ warn_evaluation(License, false, MaxConn) -> warn_evaluation(_License, _NeedRestrict, _Limits) -> false. -limits(License, false) -> #{max_connections => emqx_license_parser:max_connections(License)}; -limits(_License, true) -> #{max_connections => ?ERR_EXPIRED}. +limits(License, false) -> + #{ + max_connections => get_max_connections(License) + }; +limits(_License, true) -> + #{ + max_connections => ?ERR_EXPIRED + }. + +%% @doc Return the max_connections limit defined in license. +%% For business-critical type, it returns the dynamic value set in config. +-spec get_max_connections(license()) -> non_neg_integer(). +get_max_connections(License) -> + Max = emqx_license_parser:max_connections(License), + Dyn = + case emqx_license_parser:customer_type(License) of + ?BUSINESS_CRITICAL_CUSTOMER -> + min(get_dynamic_max_connections(), Max); + _ -> + Max + end, + min(Max, Dyn). + +%% @doc Get the dynamic max_connections limit set in config. +%% It's only meaningful for business-critical license. +-spec get_dynamic_max_connections() -> non_neg_integer(). +get_dynamic_max_connections() -> + emqx_conf:get([license, dynamic_max_connections]). days_left(License) -> DateEnd = emqx_license_parser:expiry_date(License), diff --git a/apps/emqx_license/src/emqx_license_http_api.erl b/apps/emqx_license/src/emqx_license_http_api.erl index dcf7afc7e..4d869f840 100644 --- a/apps/emqx_license/src/emqx_license_http_api.erl +++ b/apps/emqx_license/src/emqx_license_http_api.erl @@ -147,7 +147,7 @@ error_msg(Code, Msg) -> {400, error_msg(?BAD_REQUEST, <<"Invalid request params">>)}. '/license/setting'(get, _Params) -> - {200, maps:remove(<<"key">>, emqx_config:get_raw([license]))}; + {200, get_setting()}; '/license/setting'(put, #{body := Setting}) -> case emqx_license:update_setting(Setting) of {error, Error} -> @@ -170,3 +170,14 @@ fields(key_license) -> setting() -> lists:keydelete(key, 1, emqx_license_schema:fields(key_license)). + +%% Drop dynamic_max_connections unless it's a BUSINESS_CRITICAL license. +get_setting() -> + #{<<"key">> := Key} = Raw = emqx_config:get_raw([license]), + Result = maps:remove(<<"key">>, Raw), + case emqx_license_parser:is_business_critical(Key) of + true -> + Result; + false -> + maps:remove(<<"dynamic_max_connections">>, Result) + end. diff --git a/apps/emqx_license/src/emqx_license_parser.erl b/apps/emqx_license/src/emqx_license_parser.erl index d7fcde338..67ad801bc 100644 --- a/apps/emqx_license/src/emqx_license_parser.erl +++ b/apps/emqx_license/src/emqx_license_parser.erl @@ -28,6 +28,7 @@ ?SMALL_CUSTOMER | ?MEDIUM_CUSTOMER | ?LARGE_CUSTOMER + | ?BUSINESS_CRITICAL_CUSTOMER | ?EVALUATION_CUSTOMER. -type license_type() :: ?OFFICIAL | ?TRIAL. @@ -41,6 +42,8 @@ source := binary() }. +-type raw_license() :: string() | binary() | default. 
+
 -export_type([
     license_data/0,
     customer_type/0,
@@ -56,7 +59,8 @@
     customer_type/1,
     license_type/1,
     expiry_date/1,
-    max_connections/1
+    max_connections/1,
+    is_business_critical/1
 ]).
 
 %% for testing purpose
@@ -94,7 +98,7 @@ default() -> emqx_license_schema:default_license().
 %% @doc Parse license key.
 %% If the license key is prefixed with "file://path/to/license/file",
 %% then the license key is read from the file.
--spec parse(default | string() | binary()) -> {ok, license()} | {error, map()}.
+-spec parse(raw_license()) -> {ok, license()} | {error, map()}.
 parse(Content) ->
     parse(to_bin(Content), ?MODULE:pubkey()).
@@ -146,6 +150,13 @@ expiry_date(#{module := Module, data := LicenseData}) ->
 max_connections(#{module := Module, data := LicenseData}) ->
     Module:max_connections(LicenseData).
 
+-spec is_business_critical(license() | raw_license()) -> boolean().
+is_business_critical(#{module := Module, data := LicenseData}) ->
+    Module:customer_type(LicenseData) =:= ?BUSINESS_CRITICAL_CUSTOMER;
+is_business_critical(Key) when is_binary(Key) ->
+    {ok, License} = parse(Key),
+    is_business_critical(License).
+
 %%--------------------------------------------------------------------
 %% Private functions
 %%--------------------------------------------------------------------
diff --git a/apps/emqx_license/src/emqx_license_schema.erl b/apps/emqx_license/src/emqx_license_schema.erl
index 1a1f388d9..0780f5971 100644
--- a/apps/emqx_license/src/emqx_license_schema.erl
+++ b/apps/emqx_license/src/emqx_license_schema.erl
@@ -16,7 +16,8 @@
 -export([namespace/0, roots/0, fields/1, validations/0, desc/1, tags/0]).
 
 -export([
-    default_license/0
+    default_license/0,
+    default_setting/0
 ]).
 
 namespace() -> "license".
@@ -45,16 +46,26 @@ fields(key_license) ->
             required => true,
             desc => ?DESC(key_field)
         }},
+        %% This feature is not GA yet, hence hidden.
+        %% When a license is issued to customer-type BUSINESS_CRITICAL (code 3),
+        %% this config is taken as the real max_connections limit.
+        {dynamic_max_connections, #{
+            type => non_neg_integer(),
+            default => default(dynamic_max_connections),
+            required => false,
+            importance => ?IMPORTANCE_HIDDEN,
+            desc => ?DESC(dynamic_max_connections)
+        }},
         {connection_low_watermark, #{
             type => emqx_schema:percent(),
-            default => <<"75%">>,
-            example => <<"75%">>,
+            default => default(connection_low_watermark),
+            example => default(connection_low_watermark),
             desc => ?DESC(connection_low_watermark_field)
         }},
         {connection_high_watermark, #{
             type => emqx_schema:percent(),
-            default => <<"80%">>,
-            example => <<"80%">>,
+            default => default(connection_high_watermark),
+            example => default(connection_high_watermark),
             desc => ?DESC(connection_high_watermark_field)
         }}
     ].
@@ -87,11 +98,39 @@ check_license_watermark(Conf) ->
 
 %% @doc The default license key.
 %% This default license has 25 connections limit.
-%% Issued on 2023-12-08 and valid for 5 years (1825 days)
-%% NOTE: when updating a new key, the schema doc in emqx_license_schema.hocon
-%% should be updated accordingly
+%% Issued on 2024-04-18 and valid for 5 years (1825 days)
+%%
+%% NOTE: when updating to a new key, the items below should be updated accordingly:
+%% - emqx_license_schema.hocon default connections limit
+%% - default(dynamic_max_connections) return value
 default_license() ->
     <<
-        "MjIwMTExCjAKMTAKRXZhbHVhdGlvbgpjb250YWN0QGVtcXguaW8KdHJpYWwKMjAyMzEyMDgKMTgyNQoyNQo=."
- "MEUCIE271MtH+4bb39OZKD4mvVkurwZ3LX44KUvuOxkbjQz2AiEAqL7BP44PMUS5z5SAN1M4y3v3h47J8qORAqcuetnyexw=" + "MjIwMTExCjAKMTAKRXZhbHVhdGlvbgpjb250YWN0QGVtcXguaW8KdHJpYWwKMjAyNDA0MTgKMTgyNQoyNQo=" + "." + "MEUCICMWWkfrvyMwQaQAOXEsEcs+d6+5uXc1BDxR7j25fRy4AiEAmblQ4p+FFmdsvnKgcRRkv1zj7PExmZKVk3mVcxH3fgw=" >>. + +%% @doc Exported for testing +default_setting() -> + Keys = + [ + connection_low_watermark, + connection_high_watermark, + dynamic_max_connections + ], + maps:from_list( + lists:map( + fun(K) -> + {K, default(K)} + end, + Keys + ) + ). + +default(connection_low_watermark) -> + <<"75%">>; +default(connection_high_watermark) -> + <<"80%">>; +default(dynamic_max_connections) -> + %Must match the value encoded in default license. + 25. diff --git a/apps/emqx_license/test/emqx_license_SUITE.erl b/apps/emqx_license/test/emqx_license_SUITE.erl index 1aa370359..7c041aad1 100644 --- a/apps/emqx_license/test/emqx_license_SUITE.erl +++ b/apps/emqx_license/test/emqx_license_SUITE.erl @@ -149,6 +149,36 @@ t_check_not_loaded(_Config) -> emqx_license:check(#{}, #{}) ). +t_import_config(_Config) -> + %% Import to default license + ?assertMatch( + {ok, #{root_key := license, changed := _}}, + emqx_license:import_config(#{<<"license">> => #{<<"key">> => <<"default">>}}) + ), + ?assertEqual(default, emqx:get_config([license, key])), + ?assertMatch({ok, #{max_connections := 10}}, emqx_license_checker:limits()), + + %% Import to a new license + EncodedLicense = emqx_license_test_lib:make_license(#{max_connections => "100"}), + ?assertMatch( + {ok, #{root_key := license, changed := _}}, + emqx_license:import_config( + #{ + <<"license">> => + #{ + <<"key">> => EncodedLicense, + <<"connection_low_watermark">> => <<"20%">>, + <<"connection_high_watermark">> => <<"50%">> + } + } + ) + ), + ?assertMatch({ok, #{max_connections := 100}}, emqx_license_checker:limits()), + ?assertMatch( + #{connection_low_watermark := 0.2, connection_high_watermark := 0.5}, + emqx:get_config([license]) + ). + %%------------------------------------------------------------------------------ %% Helpers %%------------------------------------------------------------------------------ diff --git a/apps/emqx_license/test/emqx_license_cli_SUITE.erl b/apps/emqx_license/test/emqx_license_cli_SUITE.erl index b362efd95..1e8cfe7de 100644 --- a/apps/emqx_license/test/emqx_license_cli_SUITE.erl +++ b/apps/emqx_license/test/emqx_license_cli_SUITE.erl @@ -65,6 +65,7 @@ t_conf_update(_Config) -> #{ connection_high_watermark => 0.5, connection_low_watermark => 0.45, + dynamic_max_connections => 25, key => LicenseKey }, emqx:get_config([license]) diff --git a/apps/emqx_license/test/emqx_license_http_api_SUITE.erl b/apps/emqx_license/test/emqx_license_http_api_SUITE.erl index c207b3a40..b64a4d5af 100644 --- a/apps/emqx_license/test/emqx_license_http_api_SUITE.erl +++ b/apps/emqx_license/test/emqx_license_http_api_SUITE.erl @@ -19,17 +19,16 @@ all() -> init_per_suite(Config) -> emqx_license_test_lib:mock_parser(), + Setting = emqx_license_schema:default_setting(), + Key = emqx_license_test_lib:make_license(#{max_connections => "100"}), + LicenseConf = maps:merge(#{key => Key}, Setting), Apps = emqx_cth_suite:start( [ emqx, emqx_conf, {emqx_license, #{ config => #{ - license => #{ - key => emqx_license_test_lib:make_license(#{max_connections => "100"}), - connection_low_watermark => <<"75%">>, - connection_high_watermark => <<"80%">> - } + license => LicenseConf } }}, {emqx_dashboard, @@ -50,7 +49,7 @@ init_per_testcase(_TestCase, Config) -> Config. 
end_per_testcase(_TestCase, _Config) -> - {ok, _} = reset_license(), + ok = reset_license(), ok. %%------------------------------------------------------------------------------ @@ -70,7 +69,11 @@ default_license() -> emqx_license_test_lib:make_license(#{max_connections => "100"}). reset_license() -> - emqx_license:update_key(default_license()). + {ok, _} = emqx_license:update_key(default_license()), + Setting = emqx_license_schema:default_setting(), + Req = maps:from_list([{atom_to_binary(K), V} || {K, V} <- maps:to_list(Setting)]), + {ok, _} = emqx_license:update_setting(Req), + ok. assert_untouched_license() -> ?assertMatch( @@ -224,6 +227,26 @@ t_license_setting(_Config) -> ), ok. +t_license_setting_bc(_Config) -> + %% Create a BC license + Key = emqx_license_test_lib:make_license(#{customer_type => "3"}), + Res = request(post, uri(["license"]), #{key => Key}), + ?assertMatch({ok, 200, _}, Res), + %% get + GetRes = request(get, uri(["license", "setting"]), []), + validate_setting(GetRes, <<"75%">>, <<"80%">>, 25), + %% update + Low = <<"50%">>, + High = <<"55%">>, + UpdateRes = request(put, uri(["license", "setting"]), #{ + <<"connection_low_watermark">> => Low, + <<"connection_high_watermark">> => High, + <<"dynamic_max_connections">> => 26 + }), + validate_setting(UpdateRes, Low, High, 26), + ?assertEqual(26, emqx_config:get([license, dynamic_max_connections])), + ok. + validate_setting(Res, ExpectLow, ExpectHigh) -> ?assertMatch({ok, 200, _}, Res), {ok, 200, Payload} = Res, @@ -234,3 +257,13 @@ validate_setting(Res, ExpectLow, ExpectHigh) -> }, emqx_utils_json:decode(Payload, [return_maps]) ). + +validate_setting(Res, ExpectLow, ExpectHigh, DynMax) -> + ?assertMatch({ok, 200, _}, Res), + {ok, 200, Payload} = Res, + #{ + <<"connection_low_watermark">> := ExpectLow, + <<"connection_high_watermark">> := ExpectHigh, + <<"dynamic_max_connections">> := DynMax + } = + emqx_utils_json:decode(Payload, [return_maps]). 
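Reviewer note: t_license_setting_bc above leans on the clamping rule in emqx_license_checker:get_max_connections/1. Worked through with the numbers used in these suites (licensed max 100, dynamic setting 25; the customer-type atoms below are illustrative, the real code compares the integer code 3):

effective_limit(CustomerType, Max, Dyn0) ->
    Dyn =
        case CustomerType of
            business_critical -> min(Dyn0, Max);
            _ -> Max
        end,
    min(Max, Dyn).

%% effective_limit(business_critical, 100, 25) => 25
%% effective_limit(medium, 100, 25)            => 100
%% The dynamic setting can only lower, never raise, the licensed limit,
%% and it is ignored for non-BUSINESS_CRITICAL licenses.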
diff --git a/apps/emqx_machine/test/emqx_machine_SUITE.erl b/apps/emqx_machine/test/emqx_machine_SUITE.erl index 3838e6aad..c28ffa313 100644 --- a/apps/emqx_machine/test/emqx_machine_SUITE.erl +++ b/apps/emqx_machine/test/emqx_machine_SUITE.erl @@ -144,7 +144,13 @@ t_open_ports_check(Config) -> ?assertEqual(ok, erpc:call(Core2, emqx_machine, open_ports_check, [])), ?assertEqual(ok, erpc:call(Replicant, emqx_machine, open_ports_check, [])), + true = erlang:monitor_node(Core2, true), ok = emqx_cth_cluster:stop_node(Core2), + receive + {nodedown, Core2} -> ok + after 10000 -> + ct:fail("nodedown message not received after 10 seconds.") + end, ?assertEqual(ok, erpc:call(Replicant, emqx_machine, open_ports_check, [])), ?retry(200, 20, begin diff --git a/apps/emqx_management/src/emqx_mgmt.erl b/apps/emqx_management/src/emqx_mgmt.erl index 35908d3bd..9177d255e 100644 --- a/apps/emqx_management/src/emqx_mgmt.erl +++ b/apps/emqx_management/src/emqx_mgmt.erl @@ -221,7 +221,9 @@ vm_stats('cpu') -> case emqx_vm:cpu_util([CpuUtilArg]) of %% return 0.0 when `emqx_cpu_sup_worker` is not started {all, Use, Idle, _} -> - [{cpu_use, Use}, {cpu_idle, Idle}]; + NUse = floor(Use * 100) / 100, + NIdle = ceil(Idle * 100) / 100, + [{cpu_use, NUse}, {cpu_idle, NIdle}]; _ -> [{cpu_use, 0}, {cpu_idle, 0}] end; @@ -711,5 +713,24 @@ call_conn(ConnMod, Pid, Req) -> exit:R when R =:= shutdown; R =:= normal -> {error, shutdown}; exit:{R, _} when R =:= shutdown; R =:= noproc -> - {error, shutdown} + {error, shutdown}; + exit:{{shutdown, _OOMInfo}, _Location} -> + {error, shutdown}; + exit:timeout -> + LogData = #{ + msg => "call_client_connection_process_timeout", + request => Req, + pid => Pid, + module => ConnMod + }, + LogData1 = + case node(Pid) =:= node() of + true -> + LogData#{stacktrace => erlang:process_info(Pid, current_stacktrace)}; + false -> + LogData + end, + + ?SLOG(warning, LogData1), + {error, timeout} end. diff --git a/apps/emqx_management/src/emqx_mgmt_api_clients.erl b/apps/emqx_management/src/emqx_mgmt_api_clients.erl index 07f407430..38320780d 100644 --- a/apps/emqx_management/src/emqx_mgmt_api_clients.erl +++ b/apps/emqx_management/src/emqx_mgmt_api_clients.erl @@ -38,6 +38,7 @@ -export([ clients/2, + list_clients_v2/2, kickout_clients/2, client/2, subscriptions/2, @@ -63,6 +64,10 @@ %% for batch operation -export([do_subscribe/3]). +-ifdef(TEST). +-export([parse_cursor/2, serialize_cursor/1]). +-endif. + -define(TAGS, [<<"Clients">>]). -define(CLIENT_QSCHEMA, [ @@ -95,6 +100,14 @@ message => <<"Client connection has been shutdown">> }). +%% tags +-define(CURSOR_VSN1, 1). +-define(CURSOR_TYPE_ETS, 1). +-define(CURSOR_TYPE_DS, 2). +%% field keys +-define(CURSOR_ETS_NODE_IDX, 1). +-define(CURSOR_ETS_CONT, 2). + namespace() -> undefined. api_spec() -> @@ -103,6 +116,7 @@ api_spec() -> paths() -> [ "/clients", + "/clients_v2", "/clients/kickout/bulk", "/clients/:clientid", "/clients/:clientid/authorization/cache", @@ -117,115 +131,38 @@ paths() -> "/sessions_count" ]. 
+schema("/clients_v2") -> + #{ + 'operationId' => list_clients_v2, + get => #{ + security => [], + description => ?DESC(list_clients), + tags => ?TAGS, + parameters => fields(list_clients_v2_inputs), + responses => #{ + 200 => + emqx_dashboard_swagger:schema_with_example(?R_REF(list_clients_v2_response), #{ + <<"data">> => [client_example()], + <<"meta">> => #{ + <<"count">> => 1, + <<"cursor">> => <<"g2wAAAADYQFhAm0AAAACYzJq">>, + <<"hasnext">> => true + } + }), + 400 => + emqx_dashboard_swagger:error_codes( + ['INVALID_PARAMETER'], <<"Invalid parameters">> + ) + } + } + }; schema("/clients") -> #{ 'operationId' => clients, get => #{ description => ?DESC(list_clients), tags => ?TAGS, - parameters => [ - hoconsc:ref(emqx_dashboard_swagger, page), - hoconsc:ref(emqx_dashboard_swagger, limit), - {node, - hoconsc:mk(binary(), #{ - in => query, - required => false, - desc => <<"Node name">>, - example => <<"emqx@127.0.0.1">> - })}, - {username, - hoconsc:mk(hoconsc:array(binary()), #{ - in => query, - required => false, - desc => << - "User name, multiple values can be specified by" - " repeating the parameter: username=u1&username=u2" - >> - })}, - {ip_address, - hoconsc:mk(binary(), #{ - in => query, - required => false, - desc => <<"Client's IP address">>, - example => <<"127.0.0.1">> - })}, - {conn_state, - hoconsc:mk(hoconsc:enum([connected, idle, disconnected]), #{ - in => query, - required => false, - desc => - <<"The current connection status of the client, ", - "the possible values are connected,idle,disconnected">> - })}, - {clean_start, - hoconsc:mk(boolean(), #{ - in => query, - required => false, - description => <<"Whether the client uses a new session">> - })}, - {proto_ver, - hoconsc:mk(binary(), #{ - in => query, - required => false, - desc => <<"Client protocol version">> - })}, - {like_clientid, - hoconsc:mk(binary(), #{ - in => query, - required => false, - desc => <<"Fuzzy search `clientid` as substring">> - })}, - {like_username, - hoconsc:mk(binary(), #{ - in => query, - required => false, - desc => <<"Fuzzy search `username` as substring">> - })}, - {gte_created_at, - hoconsc:mk(emqx_utils_calendar:epoch_millisecond(), #{ - in => query, - required => false, - desc => - <<"Search client session creation time by greater", - " than or equal method, rfc3339 or timestamp(millisecond)">> - })}, - {lte_created_at, - hoconsc:mk(emqx_utils_calendar:epoch_millisecond(), #{ - in => query, - required => false, - desc => - <<"Search client session creation time by less", - " than or equal method, rfc3339 or timestamp(millisecond)">> - })}, - {gte_connected_at, - hoconsc:mk(emqx_utils_calendar:epoch_millisecond(), #{ - in => query, - required => false, - desc => << - "Search client connection creation time by greater" - " than or equal method, rfc3339 or timestamp(epoch millisecond)" - >> - })}, - {lte_connected_at, - hoconsc:mk(emqx_utils_calendar:epoch_millisecond(), #{ - in => query, - required => false, - desc => << - "Search client connection creation time by less" - " than or equal method, rfc3339 or timestamp(millisecond)" - >> - })}, - {clientid, - hoconsc:mk(hoconsc:array(binary()), #{ - in => query, - required => false, - desc => << - "Client ID, multiple values can be specified by" - " repeating the parameter: clientid=c1&clientid=c2" - >> - })}, - ?R_REF(requested_client_fields) - ], + parameters => fields(list_clients_v1_inputs), responses => #{ 200 => emqx_dashboard_swagger:schema_with_example(?R_REF(clients), #{ @@ -453,11 +390,129 @@ schema("/sessions_count") -> } }. 
+fields(list_clients_v2_inputs) -> + [ + hoconsc:ref(emqx_dashboard_swagger, cursor) + | fields(common_list_clients_input) + ]; +fields(list_clients_v1_inputs) -> + [ + hoconsc:ref(emqx_dashboard_swagger, page), + {node, + hoconsc:mk(binary(), #{ + in => query, + required => false, + desc => <<"Node name">>, + example => <<"emqx@127.0.0.1">> + })} + | fields(common_list_clients_input) + ]; +fields(common_list_clients_input) -> + [ + hoconsc:ref(emqx_dashboard_swagger, limit), + {username, + hoconsc:mk(hoconsc:array(binary()), #{ + in => query, + required => false, + desc => << + "User name, multiple values can be specified by" + " repeating the parameter: username=u1&username=u2" + >> + })}, + {ip_address, + hoconsc:mk(binary(), #{ + in => query, + required => false, + desc => <<"Client's IP address">>, + example => <<"127.0.0.1">> + })}, + {conn_state, + hoconsc:mk(hoconsc:enum([connected, idle, disconnected]), #{ + in => query, + required => false, + desc => + <<"The current connection status of the client, ", + "the possible values are connected,idle,disconnected">> + })}, + {clean_start, + hoconsc:mk(boolean(), #{ + in => query, + required => false, + description => <<"Whether the client uses a new session">> + })}, + {proto_ver, + hoconsc:mk(binary(), #{ + in => query, + required => false, + desc => <<"Client protocol version">> + })}, + {like_clientid, + hoconsc:mk(binary(), #{ + in => query, + required => false, + desc => <<"Fuzzy search `clientid` as substring">> + })}, + {like_username, + hoconsc:mk(binary(), #{ + in => query, + required => false, + desc => <<"Fuzzy search `username` as substring">> + })}, + {gte_created_at, + hoconsc:mk(emqx_utils_calendar:epoch_millisecond(), #{ + in => query, + required => false, + desc => + <<"Search client session creation time by greater", + " than or equal method, rfc3339 or timestamp(millisecond)">> + })}, + {lte_created_at, + hoconsc:mk(emqx_utils_calendar:epoch_millisecond(), #{ + in => query, + required => false, + desc => + <<"Search client session creation time by less", + " than or equal method, rfc3339 or timestamp(millisecond)">> + })}, + {gte_connected_at, + hoconsc:mk(emqx_utils_calendar:epoch_millisecond(), #{ + in => query, + required => false, + desc => << + "Search client connection creation time by greater" + " than or equal method, rfc3339 or timestamp(epoch millisecond)" + >> + })}, + {lte_connected_at, + hoconsc:mk(emqx_utils_calendar:epoch_millisecond(), #{ + in => query, + required => false, + desc => << + "Search client connection creation time by less" + " than or equal method, rfc3339 or timestamp(millisecond)" + >> + })}, + {clientid, + hoconsc:mk(hoconsc:array(binary()), #{ + in => query, + required => false, + desc => << + "Client ID, multiple values can be specified by" + " repeating the parameter: clientid=c1&clientid=c2" + >> + })}, + ?R_REF(requested_client_fields) + ]; fields(clients) -> [ {data, hoconsc:mk(hoconsc:array(?REF(client)), #{})}, {meta, hoconsc:mk(hoconsc:ref(emqx_dashboard_swagger, meta), #{})} ]; +fields(list_clients_v2_response) -> + [ + {data, hoconsc:mk(hoconsc:array(?REF(client)), #{})}, + {meta, hoconsc:mk(hoconsc:ref(emqx_dashboard_swagger, meta_with_cursor), #{})} + ]; fields(client) -> [ {awaiting_rel_cnt, @@ -890,6 +945,218 @@ list_clients(QString) -> {200, Response} end. 
+list_clients_v2(get, #{query_string := QString0}) -> + Nodes = emqx:running_nodes(), + case maps:get(<<"cursor">>, QString0, none) of + none -> + Cursor = initial_ets_cursor(Nodes), + do_list_clients_v2(Nodes, Cursor, QString0); + CursorBin when is_binary(CursorBin) -> + case parse_cursor(CursorBin, Nodes) of + {ok, Cursor} -> + do_list_clients_v2(Nodes, Cursor, QString0); + {error, bad_cursor} -> + ?BAD_REQUEST(<<"bad cursor">>) + end + end. + +do_list_clients_v2(Nodes, Cursor, QString0) -> + Limit = maps:get(<<"limit">>, QString0, 100), + Acc = #{ + rows => [], + n => 0, + limit => Limit + }, + do_list_clients_v2(Nodes, Cursor, QString0, Acc). + +do_list_clients_v2(_Nodes, Cursor = done, _QString, Acc) -> + format_results(Acc, Cursor); +do_list_clients_v2(Nodes, Cursor = #{type := ?CURSOR_TYPE_ETS, node := Node}, QString0, Acc0) -> + {Rows, NewCursor} = do_ets_select(Nodes, QString0, Cursor), + Acc1 = maps:update_with(rows, fun(Rs) -> [{Node, Rows} | Rs] end, Acc0), + Acc = #{limit := Limit, n := N} = maps:update_with(n, fun(N) -> N + length(Rows) end, Acc1), + case N >= Limit of + true -> + format_results(Acc, NewCursor); + false -> + do_list_clients_v2(Nodes, NewCursor, QString0, Acc) + end; +do_list_clients_v2(Nodes, _Cursor = #{type := ?CURSOR_TYPE_DS, iterator := Iter0}, QString0, Acc0) -> + #{limit := Limit} = Acc0, + {Rows0, Iter} = emqx_persistent_session_ds_state:session_iterator_next(Iter0, Limit), + NewCursor = next_ds_cursor(Iter), + Rows1 = drop_live_and_expired(Rows0), + Rows = maybe_run_fuzzy_filter(Rows1, QString0), + Acc1 = maps:update_with(rows, fun(Rs) -> [{undefined, Rows} | Rs] end, Acc0), + Acc = #{n := N} = maps:update_with(n, fun(N) -> N + length(Rows) end, Acc1), + case N >= Limit of + true -> + format_results(Acc, NewCursor); + false -> + do_list_clients_v2(Nodes, NewCursor, QString0, Acc) + end. + +format_results(Acc, Cursor) -> + #{ + rows := NodeRows, + n := N + } = Acc, + Meta = + case Cursor of + done -> + #{ + hasnext => false, + count => N + }; + _ -> + #{ + hasnext => true, + count => N, + cursor => serialize_cursor(Cursor) + } + end, + Resp = #{ + meta => Meta, + data => [ + format_channel_info(Node, Row) + || {Node, Rows} <- NodeRows, + Row <- Rows + ] + }, + ?OK(Resp). + +do_ets_select(Nodes, QString0, #{node := Node, node_idx := NodeIdx, cont := Cont} = _Cursor) -> + {_, QString1} = emqx_mgmt_api:parse_qstring(QString0, ?CLIENT_QSCHEMA), + Limit = maps:get(<<"limit">>, QString0, 10), + {Rows, #{cont := NewCont, node_idx := NewNodeIdx}} = ets_select( + QString1, Limit, Node, NodeIdx, Cont + ), + {Rows, next_ets_cursor(Nodes, NewNodeIdx, NewCont)}. + +maybe_run_fuzzy_filter(Rows, QString0) -> + {_, {_, FuzzyQString}} = emqx_mgmt_api:parse_qstring(QString0, ?CLIENT_QSCHEMA), + FuzzyFilterFn = fuzzy_filter_fun(FuzzyQString), + case FuzzyFilterFn of + undefined -> + Rows; + {Fn, Args} -> + lists:filter( + fun(E) -> erlang:apply(Fn, [E | Args]) end, + Rows + ) + end. + +initial_ets_cursor([Node | _Rest] = _Nodes) -> + #{ + type => ?CURSOR_TYPE_ETS, + node => Node, + node_idx => 1, + cont => undefined + }. + +initial_ds_cursor() -> + case emqx_persistent_message:is_persistence_enabled() of + true -> + #{ + type => ?CURSOR_TYPE_DS, + iterator => init_persistent_session_iterator() + }; + false -> + done + end. 
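Reviewer note: the cursor that pages this loop is opaque to API users; the serialize/parse helpers below wrap a versioned term in external term format plus base62. A round-trip sketch, with the cursor constants inlined (version 1, ETS type 1, node-idx key 1, cont key 2) and hypothetical field values:

%% Uses emqx_base62:encode/1 and decode/1 exactly as the helpers below do.
demo_cursor_roundtrip() ->
    Term = [1, 1, #{1 => 1, 2 => undefined}],
    Token = emqx_base62:encode(term_to_binary(Term, [{compressed, 9}])),
    Term = binary_to_term(emqx_base62:decode(Token), [safe]),
    Token.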
+ +next_ets_cursor(Nodes, NodeIdx, Cont) -> + case NodeIdx > length(Nodes) of + true -> + initial_ds_cursor(); + false -> + Node = lists:nth(NodeIdx, Nodes), + #{ + type => ?CURSOR_TYPE_ETS, + node_idx => NodeIdx, + node => Node, + cont => Cont + } + end. + +next_ds_cursor('$end_of_table') -> + done; +next_ds_cursor(Iter) -> + #{ + type => ?CURSOR_TYPE_DS, + iterator => Iter + }. + +parse_cursor(CursorBin, Nodes) -> + try emqx_base62:decode(CursorBin) of + Bin -> + parse_cursor1(Bin, Nodes) + catch + _:_ -> + {error, bad_cursor} + end. + +parse_cursor1(CursorBin, Nodes) -> + try binary_to_term(CursorBin, [safe]) of + [ + ?CURSOR_VSN1, + ?CURSOR_TYPE_ETS, + #{?CURSOR_ETS_NODE_IDX := NodeIdx, ?CURSOR_ETS_CONT := Cont} + ] -> + case NodeIdx > length(Nodes) of + true -> + {error, bad_cursor}; + false -> + Node = lists:nth(NodeIdx, Nodes), + Cursor = #{ + type => ?CURSOR_TYPE_ETS, + node => Node, + node_idx => NodeIdx, + cont => Cont + }, + {ok, Cursor} + end; + [?CURSOR_VSN1, ?CURSOR_TYPE_DS, DSIter] -> + Cursor = #{type => ?CURSOR_TYPE_DS, iterator => DSIter}, + {ok, Cursor}; + _ -> + {error, bad_cursor} + catch + error:badarg -> + {error, bad_cursor} + end. + +serialize_cursor(#{type := ?CURSOR_TYPE_ETS, node_idx := NodeIdx, cont := Cont}) -> + Cursor0 = [ + ?CURSOR_VSN1, + ?CURSOR_TYPE_ETS, + #{?CURSOR_ETS_NODE_IDX => NodeIdx, ?CURSOR_ETS_CONT => Cont} + ], + Bin = term_to_binary(Cursor0, [{compressed, 9}]), + emqx_base62:encode(Bin); +serialize_cursor(#{type := ?CURSOR_TYPE_DS, iterator := Iter}) -> + Cursor0 = [?CURSOR_VSN1, ?CURSOR_TYPE_DS, Iter], + Bin = term_to_binary(Cursor0, [{compressed, 9}]), + emqx_base62:encode(Bin). + +%% An adapter function so we can reutilize all the logic in `emqx_mgmt_api' for +%% selecting/fuzzy filters, and also reutilize its BPAPI for selecting rows. +ets_select(NQString, Limit, Node, NodeIdx, Cont) -> + QueryState0 = emqx_mgmt_api:init_query_state( + ?CHAN_INFO_TAB, + NQString, + fun ?MODULE:qs2ms/2, + _Meta = #{page => unused, limit => Limit}, + _Options = #{} + ), + QueryState = QueryState0#{continuation => Cont}, + case emqx_mgmt_api:do_query(Node, QueryState) of + {Rows, #{complete := true}} -> + {Rows, #{node_idx => NodeIdx + 1, cont => undefined}}; + {Rows, #{continuation := NCont}} -> + {Rows, #{node_idx => NodeIdx, cont => NCont}} + end. + lookup(#{clientid := ClientID}) -> case emqx_mgmt:lookup_client({clientid, ClientID}, ?FORMAT_FUN) of [] -> @@ -1292,6 +1559,8 @@ list_client_msgs(MsgType, ClientID, QString) -> code => 'NOT_IMPLEMENTED', message => <<"API not implemented for persistent sessions">> }}; + {error, Reason} -> + ?INTERNAL_ERROR(Reason); {Msgs, Meta = #{}} when is_list(Msgs) -> format_msgs_resp(MsgType, Msgs, Meta, QString) end @@ -1410,13 +1679,25 @@ fuzzy_filter_fun(Fuzzy) -> run_fuzzy_filter(_, []) -> true; -run_fuzzy_filter(E = {_, #{clientinfo := ClientInfo}, _}, [{Key, like, SubStr} | Fuzzy]) -> +run_fuzzy_filter( + Row = {_, #{metadata := #{clientinfo := ClientInfo}}}, + [{Key, like, SubStr} | RestArgs] +) -> + %% Row from DS + run_fuzzy_filter1(ClientInfo, Key, SubStr) andalso + run_fuzzy_filter(Row, RestArgs); +run_fuzzy_filter(Row = {_, #{clientinfo := ClientInfo}, _}, [{Key, like, SubStr} | RestArgs]) -> + %% Row from ETS + run_fuzzy_filter1(ClientInfo, Key, SubStr) andalso + run_fuzzy_filter(Row, RestArgs). + +run_fuzzy_filter1(ClientInfo, Key, SubStr) -> Val = case maps:get(Key, ClientInfo, <<>>) of undefined -> <<>>; V -> V end, - binary:match(Val, SubStr) /= nomatch andalso run_fuzzy_filter(E, Fuzzy). 
+ binary:match(Val, SubStr) /= nomatch. %%-------------------------------------------------------------------- %% format funcs @@ -1466,6 +1747,7 @@ format_channel_info(undefined, {ClientId, PSInfo0 = #{}}, _Opts) -> format_persistent_session_info(ClientId, PSInfo0) -> Metadata = maps:get(metadata, PSInfo0, #{}), + {ProtoName, ProtoVer} = maps:get(protocol, Metadata), PSInfo1 = maps:with([created_at, expiry_interval], Metadata), CreatedAt = maps:get(created_at, PSInfo1), case Metadata of @@ -1482,7 +1764,12 @@ format_persistent_session_info(ClientId, PSInfo0) -> connected_at => CreatedAt, ip_address => IpAddress, is_persistent => true, - port => Port + port => Port, + heap_size => 0, + mqueue_len => 0, + proto_name => ProtoName, + proto_ver => ProtoVer, + subscriptions_cnt => maps:size(maps:get(subscriptions, PSInfo0, #{})) }, PSInfo = lists:foldl( fun result_format_time_fun/2, diff --git a/apps/emqx_management/src/emqx_mgmt_api_ds.erl b/apps/emqx_management/src/emqx_mgmt_api_ds.erl new file mode 100644 index 000000000..c1a03feb4 --- /dev/null +++ b/apps/emqx_management/src/emqx_mgmt_api_ds.erl @@ -0,0 +1,481 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- +-module(emqx_mgmt_api_ds). + +-behaviour(minirest_api). + +-include_lib("emqx/include/logger.hrl"). +-include_lib("typerefl/include/types.hrl"). +-include_lib("hocon/include/hoconsc.hrl"). +-include_lib("emqx_utils/include/emqx_utils_api.hrl"). + +-import(hoconsc, [mk/2, ref/1, enum/1, array/1]). + +%% API: +-export([ + list_sites/2, + get_site/2, + list_dbs/2, + get_db/2, + db_replicas/2, + db_replica/2, + + update_db_sites/3, + join/3, + leave/3 +]). + +%% behavior callbacks: +-export([ + namespace/0, + api_spec/0, + schema/1, + paths/0, + fields/1 +]). + +%% internal exports: +-export([]). + +-export_type([]). + +%%================================================================================ +%% Type declarations +%%================================================================================ + +-define(TAGS, [<<"Durable storage">>]). + +%%================================================================================ +%% behavior callbacks +%%================================================================================ + +namespace() -> + undefined. + +api_spec() -> + emqx_dashboard_swagger:spec(?MODULE, #{check_schema => true}). + +paths() -> + [ + "/ds/sites", + "/ds/sites/:site", + "/ds/storages", + "/ds/storages/:ds", + "/ds/storages/:ds/replicas", + "/ds/storages/:ds/replicas/:site" + ]. 
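Reviewer note: taken together, the endpoints declared above form a small replica-management workflow. A hypothetical session against a single-node cluster (site ID made up):

%% GET    /ds/sites                                     -> ["5C6028D6CE9459C7"]
%% GET    /ds/storages                                  -> ["emqx_persistent_message"]
%% GET    /ds/storages/emqx_persistent_message/replicas -> ["5C6028D6CE9459C7"]
%% PUT    /ds/storages/emqx_persistent_message/replicas/5C6028D6CE9459C7 -> 202 "OK"
%% DELETE /ds/storages/emqx_persistent_message/replicas/5C6028D6CE9459C7 -> 202 "OK"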
+ +schema("/ds/sites") -> + #{ + 'operationId' => list_sites, + get => + #{ + description => <<"List sites">>, + tags => ?TAGS, + responses => + #{ + 200 => mk(array(binary()), #{desc => <<"List sites">>}) + } + } + }; +schema("/ds/sites/:site") -> + #{ + 'operationId' => get_site, + get => + #{ + description => <<"Get sites">>, + parameters => [param_site_id()], + tags => ?TAGS, + responses => + #{ + 200 => mk(ref(site), #{desc => <<"Get information about the site">>}), + 404 => not_found(<<"Site">>) + } + } + }; +schema("/ds/storages") -> + #{ + 'operationId' => list_dbs, + get => + #{ + description => <<"List durable storages">>, + tags => ?TAGS, + responses => + #{ + 200 => mk(array(atom()), #{desc => <<"List durable storages">>}) + } + } + }; +schema("/ds/storages/:ds") -> + #{ + 'operationId' => get_db, + get => + #{ + description => <<"Get durable storage">>, + tags => ?TAGS, + parameters => [param_storage_id()], + responses => + #{ + 200 => mk(ref(db), #{desc => <<"Get information about a durable storage">>}), + 400 => not_found(<<"Durable storage">>) + } + } + }; +schema("/ds/storages/:ds/replicas") -> + Parameters = [param_storage_id()], + #{ + 'operationId' => db_replicas, + get => + #{ + description => <<"List replicas of the durable storage">>, + tags => ?TAGS, + parameters => Parameters, + responses => + #{ + 200 => mk(array(binary()), #{ + desc => <<"List sites that contain replicas of the durable storage">> + }), + 400 => not_found(<<"Durable storage">>) + } + }, + put => + #{ + description => <<"Update replicas of the durable storage">>, + tags => ?TAGS, + parameters => Parameters, + responses => + #{ + 202 => mk(array(binary()), #{}), + 400 => bad_request() + }, + 'requestBody' => mk(array(binary()), #{desc => <<"New list of sites">>}) + } + }; +schema("/ds/storages/:ds/replicas/:site") -> + Parameters = [param_storage_id(), param_site_id()], + #{ + 'operationId' => db_replica, + put => + #{ + description => <<"Add site as a replica for the durable storage">>, + tags => ?TAGS, + parameters => Parameters, + responses => + #{ + 202 => <<"OK">>, + 400 => bad_request(), + 404 => not_found(<<"Object">>) + } + }, + delete => + #{ + description => <<"Remove site as a replica for the durable storage">>, + tags => ?TAGS, + parameters => Parameters, + responses => + #{ + 202 => <<"OK">>, + 400 => bad_request(), + 404 => not_found(<<"Object">>) + } + } + }. 
+ +fields(site) -> + [ + {node, + mk( + atom(), + #{ + desc => <<"Name of the EMQX handling the site">>, + example => <<"'emqx@example.com'">> + } + )}, + {up, + mk( + boolean(), + #{desc => <<"Site is up and running">>} + )}, + {shards, + mk( + array(ref(sites_shard)), + #{desc => <<"Durable storages that have replicas at the site">>} + )} + ]; +fields(sites_shard) -> + [ + {storage, + mk( + atom(), + #{ + desc => <<"Durable storage ID">>, + example => 'emqx_persistent_message' + } + )}, + {id, + mk( + binary(), + #{ + desc => <<"Shard ID">>, + example => <<"1">> + } + )}, + {status, + mk( + atom(), + #{ + desc => <<"Shard status">>, + example => up + } + )} + ]; +fields(db) -> + [ + {name, + mk( + atom(), + #{ + desc => <<"Name of the durable storage">>, + example => 'emqx_persistent_message' + } + )}, + {shards, + mk( + array(ref(db_shard)), + #{desc => <<"List of storage shards">>} + )} + ]; +fields(db_shard) -> + [ + {id, + mk( + binary(), + #{ + desc => <<"Shard ID">>, + example => <<"1">> + } + )}, + {replicas, + mk( + hoconsc:array(ref(db_site)), + #{desc => <<"List of sites containing replicas of the storage">>} + )} + ]; +fields(db_site) -> + [ + {site, + mk( + binary(), + #{ + desc => <<"Site ID">>, + example => example_site() + } + )}, + {status, + mk( + enum([up, joining]), + #{desc => <<"Status of the replica">>} + )} + ]. + +%%================================================================================ +%% Internal exports +%%================================================================================ + +list_sites(get, _Params) -> + {200, emqx_ds_replication_layer_meta:sites()}. + +get_site(get, #{bindings := #{site := Site}}) -> + case lists:member(Site, emqx_ds_replication_layer_meta:sites()) of + false -> + ?NOT_FOUND(<<"Site not found: ", Site/binary>>); + true -> + Node = emqx_ds_replication_layer_meta:node(Site), + IsUp = lists:member(Node, [node() | nodes()]), + Shards = shards_of_site(Site), + ?OK(#{ + node => Node, + up => IsUp, + shards => Shards + }) + end. + +list_dbs(get, _Params) -> + ?OK(dbs()). + +get_db(get, #{bindings := #{ds := DB}}) -> + ?OK(#{ + name => DB, + shards => list_shards(DB) + }). + +db_replicas(get, #{bindings := #{ds := DB}}) -> + Replicas = lists:flatmap( + fun(Shard) -> + #{replica_set := RS} = emqx_ds_replication_layer_meta:shard_info(DB, Shard), + maps:keys(RS) + end, + emqx_ds_replication_layer_meta:shards(DB) + ), + ?OK(lists:usort(Replicas)); +db_replicas(put, #{bindings := #{ds := DB}, body := Sites}) -> + case update_db_sites(DB, Sites, rest) of + ok -> + {202, <<"OK">>}; + {error, Description} -> + ?BAD_REQUEST(400, Description) + end. + +db_replica(put, #{bindings := #{ds := DB, site := Site}}) -> + case join(DB, Site, rest) of + ok -> + {202, <<"OK">>}; + {error, Description} -> + ?BAD_REQUEST(400, Description) + end; +db_replica(delete, #{bindings := #{ds := DB, site := Site}}) -> + case leave(DB, Site, rest) of + ok -> + {202, <<"OK">>}; + {error, Description} -> + ?BAD_REQUEST(400, Description) + end. + +-spec update_db_sites(emqx_ds:db(), [emqx_ds_replication_layer_meta:site()], rest | cli) -> + ok | {error, binary()}. +update_db_sites(DB, Sites, Via) when is_list(Sites) -> + ?SLOG(warning, #{ + msg => "durable_storage_rebalance_request", ds => DB, sites => Sites, via => Via + }), + meta_result_to_binary(emqx_ds_replication_layer_meta:assign_db_sites(DB, Sites)); +update_db_sites(_, _, _) -> + {error, <<"Bad type">>}. + +-spec join(emqx_ds:db(), emqx_ds_replication_layer_meta:site(), rest | cli) -> ok | {error, _}. 
+join(DB, Site, Via) ->
+    ?SLOG(warning, #{
+        msg => "durable_storage_join_request", ds => DB, site => Site, via => Via
+    }),
+    meta_result_to_binary(emqx_ds_replication_layer_meta:join_db_site(DB, Site)).
+
+-spec leave(emqx_ds:db(), emqx_ds_replication_layer_meta:site(), rest | cli) -> ok | {error, _}.
+leave(DB, Site, Via) ->
+    ?SLOG(warning, #{
+        msg => "durable_storage_leave_request", ds => DB, site => Site, via => Via
+    }),
+    meta_result_to_binary(emqx_ds_replication_layer_meta:leave_db_site(DB, Site)).
+
+%%================================================================================
+%% Internal functions
+%%================================================================================
+
+%% site_info(Site) ->
+%%     #{}.
+
+not_found(What) ->
+    emqx_dashboard_swagger:error_codes(['NOT_FOUND'], <<What/binary, " not found">>).
+
+bad_request() ->
+    emqx_dashboard_swagger:error_codes(['BAD_REQUEST'], <<"Bad request">>).
+
+param_site_id() ->
+    Info = #{
+        required => true,
+        in => path,
+        desc => <<"Site ID">>,
+        example => example_site()
+    },
+    {site, mk(binary(), Info)}.
+
+param_storage_id() ->
+    Info = #{
+        required => true,
+        in => path,
+        desc => <<"Durable storage ID">>,
+        example => emqx_persistent_message
+    },
+    {ds, mk(enum(dbs()), Info)}.
+
+example_site() ->
+    try
+        emqx_ds_replication_layer_meta:this_site()
+    catch
+        _:_ ->
+            <<"AFA18CB1C22F0157">>
+    end.
+
+dbs() ->
+    [emqx_persistent_message].
+
+shards_of_site(Site) ->
+    lists:flatmap(
+        fun({DB, Shard}) ->
+            case emqx_ds_replication_layer_meta:shard_info(DB, Shard) of
+                #{replica_set := #{Site := Info}} ->
+                    [
+                        #{
+                            storage => DB,
+                            id => Shard,
+                            status => maps:get(status, Info)
+                        }
+                    ];
+                _ ->
+                    []
+            end
+        end,
+        [
+            {DB, Shard}
+         || DB <- dbs(),
+            Shard <- emqx_ds_replication_layer_meta:shards(DB)
+        ]
+    ).
+
+list_shards(DB) ->
+    [
+        begin
+            #{replica_set := RS} = emqx_ds_replication_layer_meta:shard_info(DB, Shard),
+            Replicas = maps:fold(
+                fun(Site, #{status := Status}, Acc) ->
+                    [
+                        #{
+                            site => Site,
+                            status => Status
+                        }
+                        | Acc
+                    ]
+                end,
+                [],
+                RS
+            ),
+            #{
+                id => Shard,
+                replicas => Replicas
+            }
+        end
+     || Shard <- emqx_ds_replication_layer_meta:shards(DB)
+    ].
+
+meta_result_to_binary(ok) ->
+    ok;
+meta_result_to_binary({error, {nonexistent_sites, UnknownSites}}) ->
+    Msg = ["Unknown sites: " | lists:join(", ", UnknownSites)],
+    {error, iolist_to_binary(Msg)};
+meta_result_to_binary({error, {nonexistent_db, DB}}) ->
+    IOList = io_lib:format("Unknown storage: ~p", [DB]),
+    {error, iolist_to_binary(IOList)};
+meta_result_to_binary({error, Err}) ->
+    IOList = io_lib:format("Error: ~p", [Err]),
+    {error, iolist_to_binary(IOList)}.
diff --git a/apps/emqx_management/src/emqx_mgmt_api_subscriptions.erl b/apps/emqx_management/src/emqx_mgmt_api_subscriptions.erl
index cb8421211..b1a8fbce2 100644
--- a/apps/emqx_management/src/emqx_mgmt_api_subscriptions.erl
+++ b/apps/emqx_management/src/emqx_mgmt_api_subscriptions.erl
@@ -86,7 +86,8 @@ fields(subscription) ->
         {qos, hoconsc:mk(emqx_schema:qos(), #{desc => <<"QoS">>, example => 0})},
         {nl, hoconsc:mk(integer(), #{desc => <<"No Local">>, example => 0})},
         {rap, hoconsc:mk(integer(), #{desc => <<"Retain as Published">>, example => 0})},
-        {rh, hoconsc:mk(integer(), #{desc => <<"Retain Handling">>, example => 0})}
+        {rh, hoconsc:mk(integer(), #{desc => <<"Retain Handling">>, example => 0})},
+        {durable, hoconsc:mk(boolean(), #{desc => <<"Durable subscription">>, example => false})}
     ].
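For reference, the error translations produced by meta_result_to_binary/1 above (site and storage names hypothetical):

%% meta_result_to_binary(ok)                                               -> ok
%% meta_result_to_binary({error, {nonexistent_sites, [<<"A">>, <<"B">>]}}) -> {error, <<"Unknown sites: A, B">>}
%% meta_result_to_binary({error, {nonexistent_db, foo}})                   -> {error, <<"Unknown storage: foo">>}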
parameters() -> @@ -141,6 +142,14 @@ parameters() -> required => false, desc => <<"Shared subscription group name">> }) + }, + { + durable, + hoconsc:mk(boolean(), #{ + in => query, + required => false, + desc => <<"Filter subscriptions by durability">> + }) } ]. @@ -167,7 +176,8 @@ format(WhichNode, {{Topic, _Subscriber}, SubOpts}) -> #{ topic => emqx_topic:maybe_format_share(Topic), clientid => maps:get(subid, SubOpts, null), - node => WhichNode + node => WhichNode, + durable => false }, maps:with([qos, nl, rap, rh], SubOpts) ). @@ -187,7 +197,22 @@ check_match_topic(#{<<"match_topic">> := MatchTopic}) -> check_match_topic(_) -> ok. -do_subscriptions_query(QString) -> +do_subscriptions_query(QString0) -> + {IsDurable, QString} = maps:take( + <<"durable">>, maps:merge(#{<<"durable">> => undefined}, QString0) + ), + case emqx_persistent_message:is_persistence_enabled() andalso IsDurable of + false -> + do_subscriptions_query_mem(QString); + true -> + do_subscriptions_query_persistent(QString); + undefined -> + merge_queries( + QString, fun do_subscriptions_query_mem/1, fun do_subscriptions_query_persistent/1 + ) + end. + +do_subscriptions_query_mem(QString) -> Args = [?SUBOPTION, QString, ?SUBS_QSCHEMA, fun ?MODULE:qs2ms/2, fun ?MODULE:format/2], case maps:get(<<"node">>, QString, undefined) of undefined -> @@ -201,8 +226,196 @@ do_subscriptions_query(QString) -> end end. +do_subscriptions_query_persistent(#{<<"page">> := Page, <<"limit">> := Limit} = QString) -> + Count = emqx_persistent_session_ds_router:stats(n_routes), + %% TODO: filtering by client ID can be implemented more efficiently: + FilterTopic = maps:get(<<"topic">>, QString, '_'), + Stream0 = emqx_persistent_session_ds_router:stream(FilterTopic), + SubPred = fun(Sub) -> + compare_optional(<<"topic">>, QString, topic, Sub) andalso + compare_optional(<<"clientid">>, QString, clientid, Sub) andalso + compare_optional(<<"qos">>, QString, qos, Sub) andalso + compare_match_topic_optional(<<"match_topic">>, QString, topic, Sub) + end, + NDropped = (Page - 1) * Limit, + {_, Stream} = consume_n_matching( + fun persistent_route_to_subscription/1, SubPred, NDropped, Stream0 + ), + {Subscriptions, Stream1} = consume_n_matching( + fun persistent_route_to_subscription/1, SubPred, Limit, Stream + ), + HasNext = Stream1 =/= [], + Meta = + case maps:is_key(<<"match_topic">>, QString) orelse maps:is_key(<<"qos">>, QString) of + true -> + %% Fuzzy searches shouldn't return count: + #{ + limit => Limit, + page => Page, + hasnext => HasNext + }; + false -> + #{ + count => Count, + limit => Limit, + page => Page, + hasnext => HasNext + } + end, + + #{ + meta => Meta, + data => Subscriptions + }. + +compare_optional(QField, Query, SField, Subscription) -> + case Query of + #{QField := Expected} -> + maps:get(SField, Subscription) =:= Expected; + _ -> + true + end. + +compare_match_topic_optional(QField, Query, SField, Subscription) -> + case Query of + #{QField := TopicFilter} -> + Topic = maps:get(SField, Subscription), + emqx_topic:match(Topic, TopicFilter); + _ -> + true + end. + +%% @doc Drop elements from the stream until encountered N elements +%% matching the predicate function. +-spec consume_n_matching( + fun((T) -> Q), + fun((Q) -> boolean()), + non_neg_integer(), + emqx_utils_stream:stream(T) +) -> {[Q], emqx_utils_stream:stream(T) | empty}. +consume_n_matching(Map, Pred, N, S) -> + consume_n_matching(Map, Pred, N, S, []). 
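Reviewer note: the dispatch in do_subscriptions_query/1 above relies on `andalso' returning its right operand when the left is true, giving a three-valued switch:

%% persistence | durable filter | andalso result | route taken
%% ------------+----------------+----------------+-----------------------
%% disabled    | any            | false          | mem query only
%% enabled     | true           | true           | durable query only
%% enabled     | false          | false          | mem query only
%% enabled     | absent         | undefined      | merge_queries of both

The consume_n_matching/5 clauses continue below.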
+
+consume_n_matching(_Map, _Pred, _N, [], Acc) ->
+    {lists:reverse(Acc), []};
+consume_n_matching(_Map, _Pred, 0, S, Acc) ->
+    {lists:reverse(Acc), S};
+consume_n_matching(Map, Pred, N, S0, Acc) ->
+    case emqx_utils_stream:next(S0) of
+        [] ->
+            consume_n_matching(Map, Pred, N, [], Acc);
+        [Elem | S] ->
+            Mapped = Map(Elem),
+            case Pred(Mapped) of
+                true -> consume_n_matching(Map, Pred, N - 1, S, [Mapped | Acc]);
+                false -> consume_n_matching(Map, Pred, N, S, Acc)
+            end
+    end.
+
+persistent_route_to_subscription(#route{topic = Topic, dest = SessionId}) ->
+    case emqx_persistent_session_ds:get_client_subscription(SessionId, Topic) of
+        #{subopts := SubOpts} ->
+            #{qos := Qos, nl := Nl, rh := Rh, rap := Rap} = SubOpts,
+            #{
+                topic => Topic,
+                clientid => SessionId,
+                node => all,
+
+                qos => Qos,
+                nl => Nl,
+                rh => Rh,
+                rap => Rap,
+                durable => true
+            };
+        undefined ->
+            #{
+                topic => Topic,
+                clientid => SessionId,
+                node => all,
+                durable => true
+            }
+    end.
+
+%% @private This function merges paginated results from two sources.
+%%
+%% Note: this implementation is far from ideal: `count' for the
+%% queries may be missing, or it may be larger than the actual number
+%% of elements. This may lead to empty pages that can confuse the user.
+%%
+%% Not much can be done to mitigate that, though: since the count may
+%% be incorrect, we cannot run simple math to determine when one
+%% stream begins and another ends: it requires actual iteration.
+%%
+%% Ideally, the dashboard should be split between durable and mem
+%% subscriptions, and this function should be removed for good.
+merge_queries(QString0, Q1, Q2) ->
+    #{<<"limit">> := Limit, <<"page">> := Page} = QString0,
+    C1 = resp_count(QString0, Q1),
+    C2 = resp_count(QString0, Q2),
+    Meta =
+        case is_number(C1) andalso is_number(C2) of
+            true ->
+                #{
+                    count => C1 + C2,
+                    limit => Limit,
+                    page => Page
+                };
+            false ->
+                #{
+                    limit => Limit,
+                    page => Page
+                }
+        end,
+    case {C1, C2} of
+        {_, 0} ->
+            %% The second query is empty. Just return the result of Q1 as usual:
+            Q1(QString0);
+        {0, _} ->
+            %% The first query is empty. Just return the result of Q2 as usual:
+            Q2(QString0);
+        _ when is_number(C1) ->
+            %% Both queries are potentially non-empty, but we at least
+            %% have the item count for the first query. We try to
+            %% stitch the pages together and thus respect the limit
+            %% (except for the page where the results switch from Q1
+            %% to Q2).
+
+            %% Page where data from the second query is estimated to
+            %% begin:
+            Q2Page = ceil(C1 / Limit),
+            case Page =< Q2Page of
+                true ->
+                    #{data := Data, meta := #{hasnext := HN}} = Q1(QString0),
+                    #{
+                        data => Data,
+                        meta => Meta#{hasnext => HN orelse C2 > 0}
+                    };
+                false ->
+                    QString = QString0#{<<"page">> => Page - Q2Page},
+                    #{data := Data, meta := #{hasnext := HN}} = Q2(QString),
+                    #{data => Data, meta => Meta#{hasnext => HN}}
+            end;
+        _ ->
+            %% We don't know how many items there are in the first
+            %% query, and the second query is not empty (this includes
+            %% the case where `C2' is `undefined'). The best we can do
+            %% is to interleave the queries. This may produce fewer
+            %% results per page than `Limit'.
+            QString = QString0#{<<"limit">> => ceil(Limit / 2)},
+            #{data := D1, meta := #{hasnext := HN1}} = Q1(QString),
+            #{data := D2, meta := #{hasnext := HN2}} = Q2(QString),
+            #{
+                meta => Meta#{hasnext => HN1 or HN2},
+                data => D1 ++ D2
+            }
+    end.
+
+resp_count(Query, QFun) ->
+    #{meta := Meta} = QFun(Query#{<<"limit">> => 1, <<"page">> => 1}),
+    maps:get(count, Meta, undefined).
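Note: to make the page-stitching arithmetic in merge_queries/3 concrete, here is a minimal runnable sketch of just the page-routing decision, with invented numbers (10 in-memory rows, 5 durable rows, limit 4). It is not part of the patch:

    %% Which source serves which page, given the first source's count C1?
    page_source(Page, Limit, C1) ->
        Q2Page = ceil(C1 / Limit),
        case Page =< Q2Page of
            true -> {q1, Page};
            false -> {q2, Page - Q2Page}
        end.
    %% page_source(3, 4, 10) =:= {q1, 3}  (rows 9-10; hasnext is forced true since C2 > 0)
    %% page_source(4, 4, 10) =:= {q2, 1}  (page number shifted for the second source)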
+
 %%--------------------------------------------------------------------
-%% QueryString to MatchSpec
+%% QueryString to MatchSpec (mem sessions)
 %%--------------------------------------------------------------------
 
 -spec qs2ms(atom(), {list(), list()}) -> emqx_mgmt_api:match_spec_and_filter().
diff --git a/apps/emqx_management/src/emqx_mgmt_api_topics.erl b/apps/emqx_management/src/emqx_mgmt_api_topics.erl
index 1cb12f8f3..ff935ce10 100644
--- a/apps/emqx_management/src/emqx_mgmt_api_topics.erl
+++ b/apps/emqx_management/src/emqx_mgmt_api_topics.erl
@@ -225,7 +225,10 @@ format_response_meta(Meta, _Query, #{hasnext := HasNext}) ->
     Meta#{hasnext => HasNext}.
 
 format(#route{topic = Topic, dest = {Group, Node}}) ->
-    #{topic => ?SHARE(Group, Topic), node => Node};
+    #{
+        topic => emqx_topic:maybe_format_share(emqx_topic:make_shared_record(Group, Topic)),
+        node => Node
+    };
 format(#route{topic = Topic, dest = Node}) when is_atom(Node) ->
     #{topic => Topic, node => Node};
 format(#route{topic = Topic, dest = SessionId}) when is_binary(SessionId) ->
diff --git a/apps/emqx_management/src/emqx_mgmt_api_trace.erl b/apps/emqx_management/src/emqx_mgmt_api_trace.erl
index 5cdbc65ff..e5ccde4f2 100644
--- a/apps/emqx_management/src/emqx_mgmt_api_trace.erl
+++ b/apps/emqx_management/src/emqx_mgmt_api_trace.erl
@@ -222,7 +222,7 @@ fields(trace) ->
             )},
         {type,
             hoconsc:mk(
-                hoconsc:enum([clientid, topic, ip_address]),
+                hoconsc:enum([clientid, topic, ip_address, ruleid]),
                 #{
                     description => ?DESC(filter_type),
                     required => true,
@@ -257,6 +257,15 @@ fields(trace) ->
                     example => <<"127.0.0.1">>
                 }
             )},
+        {ruleid,
+            hoconsc:mk(
+                binary(),
+                #{
+                    description => ?DESC(ruleid_field),
+                    required => false,
+                    example => <<"my_rule">>
+                }
+            )},
         {status,
             hoconsc:mk(
                 hoconsc:enum([running, stopped, waiting]),
@@ -305,6 +314,15 @@ fields(trace) ->
                     example => [#{<<"node">> => <<"emqx@127.0.0.1">>, <<"size">> => 1024}],
                     required => false
                 }
+            )},
+        {formatter,
+            hoconsc:mk(
+                hoconsc:union([text, json]),
+                #{
+                    description => ?DESC(trace_log_formatter),
+                    example => text,
+                    required => false
+                }
             )}
     ];
 fields(name) ->
diff --git a/apps/emqx_management/src/emqx_mgmt_cli.erl b/apps/emqx_management/src/emqx_mgmt_cli.erl
index ddbc60d5c..32a24d9bd 100644
--- a/apps/emqx_management/src/emqx_mgmt_cli.erl
+++ b/apps/emqx_management/src/emqx_mgmt_cli.erl
@@ -108,6 +108,7 @@ cluster(["join", SNode]) ->
             emqx_ctl:print("Failed to join the cluster: ~0p~n", [Error])
     end;
 cluster(["leave"]) ->
+    _ = maybe_disable_autocluster(),
     case mria:leave() of
         ok ->
             emqx_ctl:print("Leave the cluster successfully.~n"),
@@ -139,12 +140,15 @@ cluster(["status"]) ->
 cluster(["status", "--json"]) ->
     Info = sort_map_list_fields(cluster_info()),
     emqx_ctl:print("~ts~n", [emqx_logger_jsonfmt:best_effort_json(Info)]);
+cluster(["discovery", "enable"]) ->
+    enable_autocluster();
 cluster(_) ->
     emqx_ctl:usage([
         {"cluster join <Node>", "Join the cluster"},
         {"cluster leave", "Leave the cluster"},
         {"cluster force-leave <Node>", "Force the node leave from cluster"},
-        {"cluster status [--json]", "Cluster status"}
+        {"cluster status [--json]", "Cluster status"},
+        {"cluster discovery enable", "Enable and run automatic cluster discovery (if configured)"}
     ]).
 
 %% sort lists for deterministic output
@@ -163,6 +167,25 @@ sort_map_list_field(Field, Map) ->
         _ -> Map
     end.
 
+enable_autocluster() ->
+    ok = ekka:enable_autocluster(),
+    _ = ekka:autocluster(emqx),
+    emqx_ctl:print("Automatic cluster discovery enabled.~n").
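Note: on the emqx_mgmt_api_topics:format/1 hunk earlier in this section: it now builds a proper share record before rendering. Assuming emqx_topic:maybe_format_share/1 prints share records in the usual "$share/<Group>/<Topic>" textual form, the observable output stays the same as the old ?SHARE macro; a sketch with invented values:

    %% format(#route{topic = <<"t/1">>, dest = {<<"g1">>, 'emqx@127.0.0.1'}})
    %%   -> #{topic => <<"$share/g1/t/1">>, node => 'emqx@127.0.0.1'}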
+
+maybe_disable_autocluster() ->
+    case ekka:autocluster_enabled() of
+        true ->
+            ok = ekka:disable_autocluster(),
+            emqx_ctl:print(
+                "Automatic cluster discovery is disabled on this node: ~p to avoid"
+                " re-joining the same cluster again, if the node is not stopped soon."
+                " To enable it run: 'emqx ctl cluster discovery enable' or restart the node.~n",
+                [node()]
+            );
+        false ->
+            ok
+    end.
+
 %%--------------------------------------------------------------------
 %% @doc Query clients
@@ -484,21 +507,24 @@ trace(["list"]) ->
         )
     end;
 trace(["stop", Operation, Filter0]) ->
-    case trace_type(Operation, Filter0) of
-        {ok, Type, Filter} -> trace_off(Type, Filter);
+    case trace_type(Operation, Filter0, text) of
+        {ok, Type, Filter, _} -> trace_off(Type, Filter);
         error -> trace([])
     end;
 trace(["start", Operation, ClientId, LogFile]) ->
     trace(["start", Operation, ClientId, LogFile, "all"]);
 trace(["start", Operation, Filter0, LogFile, Level]) ->
-    case trace_type(Operation, Filter0) of
-        {ok, Type, Filter} ->
+    trace(["start", Operation, Filter0, LogFile, Level, text]);
+trace(["start", Operation, Filter0, LogFile, Level, Formatter0]) ->
+    case trace_type(Operation, Filter0, Formatter0) of
+        {ok, Type, Filter, Formatter} ->
             trace_on(
                 name(Filter0),
                 Type,
                 Filter,
                 list_to_existing_atom(Level),
-                LogFile
+                LogFile,
+                Formatter
             );
         error ->
             trace([])
@@ -506,17 +532,22 @@ trace(["start", Operation, Filter0, LogFile, Level]) ->
 trace(_) ->
     emqx_ctl:usage([
         {"trace list", "List all traces started on local node"},
-        {"trace start client <ClientId> <File> [<Level>]", "Traces for a client on local node"},
+        {"trace start client <ClientId> <File> [<Level>] [<Formatter>]",
+            "Traces for a client on local node (Formatter=text|json)"},
         {"trace stop client <ClientId>", "Stop tracing for a client on local node"},
-        {"trace start topic <Topic> <File> [<Level>]", "Traces for a topic on local node"},
+        {"trace start topic <Topic> <File> [<Level>] [<Formatter>]",
+            "Traces for a topic on local node (Formatter=text|json)"},
         {"trace stop topic <Topic>", "Stop tracing for a topic on local node"},
-        {"trace start ip_address <IP> <File> [<Level>]",
-            "Traces for a client ip on local node"},
-        {"trace stop ip_address <IP>", "Stop tracing for a client ip on local node"}
+        {"trace start ip_address <IP> <File> [<Level>] [<Formatter>]",
+            "Traces for a client ip on local node (Formatter=text|json)"},
+        {"trace stop ip_address <IP>", "Stop tracing for a client ip on local node"},
+        {"trace start ruleid <RuleId> <File> [<Level>] [<Formatter>]",
+            "Traces for a rule ID on local node (Formatter=text|json)"},
+        {"trace stop ruleid <RuleId>", "Stop tracing for a rule ID on local node"}
     ]).
-trace_on(Name, Type, Filter, Level, LogFile) ->
-    case emqx_trace_handler:install(Name, Type, Filter, Level, LogFile) of
+trace_on(Name, Type, Filter, Level, LogFile, Formatter) ->
+    case emqx_trace_handler:install(Name, Type, Filter, Level, LogFile, Formatter) of
         ok ->
             emqx_trace:check(),
             emqx_ctl:print("trace ~s ~s successfully~n", [Filter, Name]);
@@ -567,27 +598,33 @@ traces(["delete", Name]) ->
     trace_cluster_del(Name);
 traces(["start", Name, Operation, Filter]) ->
     traces(["start", Name, Operation, Filter, ?DEFAULT_TRACE_DURATION]);
-traces(["start", Name, Operation, Filter0, DurationS]) ->
-    case trace_type(Operation, Filter0) of
-        {ok, Type, Filter} -> trace_cluster_on(Name, Type, Filter, DurationS);
+traces(["start", Name, Operation, Filter, DurationS]) ->
+    traces(["start", Name, Operation, Filter, DurationS, text]);
+traces(["start", Name, Operation, Filter0, DurationS, Formatter0]) ->
+    case trace_type(Operation, Filter0, Formatter0) of
+        {ok, Type, Filter, Formatter} -> trace_cluster_on(Name, Type, Filter, DurationS, Formatter);
         error -> traces([])
     end;
 traces(_) ->
     emqx_ctl:usage([
         {"traces list", "List all cluster traces started"},
-        {"traces start <Name> client <ClientId> [<Duration>]", "Traces for a client in cluster"},
-        {"traces start <Name> topic <Topic> [<Duration>]", "Traces for a topic in cluster"},
-        {"traces start <Name> ip_address <IPAddr> [<Duration>]",
+        {"traces start <Name> client <ClientId> [<Duration>] [<Formatter>]",
+            "Traces for a client in cluster (Formatter=text|json)"},
+        {"traces start <Name> topic <Topic> [<Duration>] [<Formatter>]",
+            "Traces for a topic in cluster (Formatter=text|json)"},
+        {"traces start <Name> ruleid <RuleId> [<Duration>] [<Formatter>]",
+            "Traces for a rule ID in cluster (Formatter=text|json)"},
+        {"traces start <Name> ip_address <IPAddr> [<Duration>] [<Formatter>]",
             "Traces for a client IP in cluster\n"
             "Trace will start immediately on all nodes, including the core and replicant,\n"
             "and will end after <Duration> seconds. The default value for <Duration> is "
             ?DEFAULT_TRACE_DURATION
-            " seconds."},
+            " seconds. (Formatter=text|json)"},
         {"traces stop <Name>", "Stop trace in cluster"},
         {"traces delete <Name>", "Delete trace in cluster"}
     ]).
 
-trace_cluster_on(Name, Type, Filter, DurationS0) ->
+trace_cluster_on(Name, Type, Filter, DurationS0, Formatter) ->
     Now = emqx_trace:now_second(),
     DurationS = list_to_integer(DurationS0),
     Trace = #{
@@ -595,7 +632,8 @@ traces(["start", Name, Operation, Filter0, DurationS]) ->
         type => Type,
         Type => bin(Filter),
         start_at => Now,
-        end_at => Now + DurationS
+        end_at => Now + DurationS,
+        formatter => Formatter
     },
     case emqx_trace:create(Trace) of
         {ok, _} ->
@@ -619,10 +657,12 @@ trace_cluster_off(Name) ->
         {error, Error} -> emqx_ctl:print("[error] Stop cluster_trace ~s: ~p~n", [Name, Error])
     end.
 
-trace_type("client", ClientId) -> {ok, clientid, bin(ClientId)};
-trace_type("topic", Topic) -> {ok, topic, bin(Topic)};
-trace_type("ip_address", IP) -> {ok, ip_address, IP};
-trace_type(_, _) -> error.
+trace_type(Op, Match, "text") -> trace_type(Op, Match, text);
+trace_type(Op, Match, "json") -> trace_type(Op, Match, json);
+trace_type("client", ClientId, Formatter) -> {ok, clientid, bin(ClientId), Formatter};
+trace_type("topic", Topic, Formatter) -> {ok, topic, bin(Topic), Formatter};
+trace_type("ip_address", IP, Formatter) -> {ok, ip_address, IP, Formatter};
+trace_type("ruleid", RuleId, Formatter) -> {ok, ruleid, bin(RuleId), Formatter};
+trace_type(_, _, _) -> error.
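Note: trace_type/3 first normalizes the formatter argument (a string from argv) to an atom, then dispatches on the filter kind. Expected results, derived directly from the clauses above:

    %% trace_type("client", "c1", "json")          -> {ok, clientid, <<"c1">>, json}
    %% trace_type("topic", "t/#", "text")          -> {ok, topic, <<"t/#">>, text}
    %% trace_type("ip_address", "127.0.0.1", text) -> {ok, ip_address, "127.0.0.1", text}
    %% trace_type("ruleid", "my_rule", json)       -> {ok, ruleid, <<"my_rule">>, json}
    %% trace_type("nope", "x", text)               -> error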
 %%--------------------------------------------------------------------
 %% @doc Listeners Command
@@ -810,9 +850,50 @@ ds(CMD) ->
 
 do_ds(["info"]) ->
     emqx_ds_replication_layer_meta:print_status();
+do_ds(["set_replicas", DBStr | SitesStr]) ->
+    case emqx_utils:safe_to_existing_atom(DBStr) of
+        {ok, DB} ->
+            Sites = lists:map(fun list_to_binary/1, SitesStr),
+            case emqx_mgmt_api_ds:update_db_sites(DB, Sites, cli) of
+                ok ->
+                    emqx_ctl:print("ok~n");
+                {error, Description} ->
+                    emqx_ctl:print("Unable to update replicas: ~s~n", [Description])
+            end;
+        {error, _} ->
+            emqx_ctl:print("Unknown durable storage~n")
+    end;
+do_ds(["join", DBStr, Site]) ->
+    case emqx_utils:safe_to_existing_atom(DBStr) of
+        {ok, DB} ->
+            case emqx_mgmt_api_ds:join(DB, list_to_binary(Site), cli) of
+                ok ->
+                    emqx_ctl:print("ok~n");
+                {error, Description} ->
+                    emqx_ctl:print("Unable to update replicas: ~s~n", [Description])
+            end;
+        {error, _} ->
+            emqx_ctl:print("Unknown durable storage~n")
+    end;
+do_ds(["leave", DBStr, Site]) ->
+    case emqx_utils:safe_to_existing_atom(DBStr) of
+        {ok, DB} ->
+            case emqx_mgmt_api_ds:leave(DB, list_to_binary(Site), cli) of
+                ok ->
+                    emqx_ctl:print("ok~n");
+                {error, Description} ->
+                    emqx_ctl:print("Unable to update replicas: ~s~n", [Description])
+            end;
+        {error, _} ->
+            emqx_ctl:print("Unknown durable storage~n")
+    end;
 do_ds(_) ->
     emqx_ctl:usage([
-        {"ds info", "Show overview of the embedded durable storage state"}
+        {"ds info", "Show overview of the embedded durable storage state"},
+        {"ds set_replicas <storage> <site1> [<site2> ...]",
+            "Change the replica set of the durable storage"},
+        {"ds join <storage> <site>", "Add site to the replica set of the storage"},
+        {"ds leave <storage> <site>", "Remove site from the replica set of the storage"}
     ]).
 
 %%--------------------------------------------------------------------
diff --git a/apps/emqx_management/src/emqx_mgmt_data_backup.erl b/apps/emqx_management/src/emqx_mgmt_data_backup.erl
index 2aaa014a8..03eb7ac06 100644
--- a/apps/emqx_management/src/emqx_mgmt_data_backup.erl
+++ b/apps/emqx_management/src/emqx_mgmt_data_backup.erl
@@ -773,23 +773,42 @@ validate_cluster_hocon(RawConf) ->
 
 do_import_conf(RawConf, Opts) ->
     GenConfErrs = filter_errors(maps:from_list(import_generic_conf(RawConf))),
     maybe_print_conf_errors(GenConfErrs, Opts),
-    Errors =
-        lists:foldl(
-            fun(Module, ErrorsAcc) ->
-                case Module:import_config(RawConf) of
-                    {ok, #{changed := Changed}} ->
-                        maybe_print_changed(Changed, Opts),
-                        ErrorsAcc;
-                    {error, #{root_key := RootKey, reason := Reason}} ->
-                        ErrorsAcc#{[RootKey] => Reason}
-                end
-            end,
-            GenConfErrs,
-            sort_importer_modules(find_behaviours(emqx_config_backup))
-        ),
+    Modules = sort_importer_modules(find_behaviours(emqx_config_backup)),
+    Errors = lists:foldl(print_ok_results_collect_errors(RawConf, Opts), GenConfErrs, Modules),
     maybe_print_conf_errors(Errors, Opts),
     Errors.
 
+print_ok_results_collect_errors(RawConf, Opts) ->
+    fun(Module, Errors) ->
+        case Module:import_config(RawConf) of
+            {results, {OkResults, ErrResults}} ->
+                print_ok_results(OkResults, Opts),
+                collect_errors(ErrResults, Errors);
+            {ok, OkResult} ->
+                print_ok_results([OkResult], Opts),
+                Errors;
+            {error, ErrResult} ->
+                collect_errors([ErrResult], Errors)
+        end
+    end.
+
+print_ok_results(Results, Opts) ->
+    lists:foreach(
+        fun(#{changed := Changed}) ->
+            maybe_print_changed(Changed, Opts)
+        end,
+        Results
+    ).
+
+collect_errors(Results, Errors) ->
+    lists:foldr(
+        fun(#{root_key := RootKey, reason := Reason}, Acc) ->
+            Acc#{[RootKey] => Reason}
+        end,
+        Errors,
+        Results
+    ).
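Note: the importer fold above now accepts three return shapes from emqx_config_backup callbacks: the old {ok, ...} and {error, ...}, plus {results, {OkResults, ErrResults}} for partial success. A minimal sketch of a conforming import_config/1 callback (hypothetical module and root key; the map shapes mirror what print_ok_results/2 and collect_errors/2 destructure):

    %% hypothetical callback, for illustration only:
    import_config(RawConf) ->
        case maps:find(<<"my_root">>, RawConf) of
            error ->
                %% nothing to import for this app:
                {ok, #{changed => []}};
            {ok, _Section} ->
                %% partial success: report both sides in one result
                Oks = [#{changed => [[<<"my_root">>, <<"good_key">>]]}],
                Errs = [#{root_key => <<"my_root">>, reason => <<"bad value">>}],
                {results, {Oks, Errs}}
        end.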
+ sort_importer_modules(Modules) -> lists:sort( fun(M1, M2) -> order(M1, ?IMPORT_ORDER) =< order(M2, ?IMPORT_ORDER) end, diff --git a/apps/emqx_management/test/emqx_mgmt_api_clients_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_api_clients_SUITE.erl index 2f4804158..2623e6d4d 100644 --- a/apps/emqx_management/test/emqx_mgmt_api_clients_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_api_clients_SUITE.erl @@ -19,8 +19,9 @@ -include_lib("emqx/include/emqx_mqtt.hrl"). -include_lib("emqx/include/emqx_router.hrl"). --include_lib("eunit/include/eunit.hrl"). +-include_lib("stdlib/include/assert.hrl"). -include_lib("common_test/include/ct.hrl"). +-include_lib("proper/include/proper.hrl"). -include_lib("snabbkaffe/include/snabbkaffe.hrl"). -include_lib("emqx/include/asserts.hrl"). -include_lib("emqx/include/emqx_mqtt.hrl"). @@ -47,7 +48,8 @@ persistent_session_testcases() -> t_persistent_sessions2, t_persistent_sessions3, t_persistent_sessions4, - t_persistent_sessions5 + t_persistent_sessions5, + t_list_clients_v2 ]. client_msgs_testcases() -> [ @@ -56,11 +58,23 @@ client_msgs_testcases() -> ]. init_per_suite(Config) -> - emqx_mgmt_api_test_util:init_suite(), - Config. + ok = snabbkaffe:start_trace(), + Apps = emqx_cth_suite:start( + [ + emqx, + emqx_conf, + emqx_management, + emqx_mgmt_api_test_util:emqx_dashboard() + ], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), + {ok, _} = emqx_common_test_http:create_default_app(), + [{apps, Apps} | Config]. -end_per_suite(_) -> - emqx_mgmt_api_test_util:end_suite(). +end_per_suite(Config) -> + Apps = ?config(apps, Config), + emqx_cth_suite:stop(Apps), + ok. init_per_group(persistent_sessions, Config) -> AppSpecs = [ @@ -109,9 +123,12 @@ end_per_testcase(TC, _Config) when ?LINE, fun() -> [] =:= emqx_cm:lookup_channels(local, ClientId) end, 5000 - ); + ), + ok = snabbkaffe:stop(), + ok; end_per_testcase(_TC, _Config) -> - ok = snabbkaffe:stop(). + ok = snabbkaffe:stop(), + ok. t_clients(_) -> process_flag(trap_exit, true), @@ -522,6 +539,12 @@ t_persistent_sessions5(Config) -> ), lists:foreach(fun emqtt:stop/1, [C3, C4]), + lists:foreach( + fun(ClientId) -> + ok = erpc:call(N1, emqx_persistent_session_ds, destroy_session, [ClientId]) + end, + [ClientId1, ClientId2, ClientId3, ClientId4] + ), ok end, @@ -1415,6 +1438,319 @@ t_subscribe_shared_topic_nl(_Config) -> PostFun(post, PathFun(["subscribe"]), #{topic => T, qos => 1, nl => 1, rh => 1}) ). 
+t_list_clients_v2(Config) -> + [N1, N2] = ?config(nodes, Config), + APIPort = 18084, + Port1 = get_mqtt_port(N1, tcp), + Port2 = get_mqtt_port(N2, tcp), + + ?check_trace( + begin + ClientId1 = <<"ca1">>, + ClientId2 = <<"c2">>, + ClientId3 = <<"c3">>, + ClientId4 = <<"ca4">>, + ClientId5 = <<"ca5">>, + ClientId6 = <<"c6">>, + AllClientIds = [ + ClientId1, + ClientId2, + ClientId3, + ClientId4, + ClientId5, + ClientId6 + ], + C1 = connect_client(#{port => Port1, clientid => ClientId1, clean_start => true}), + C2 = connect_client(#{port => Port2, clientid => ClientId2, clean_start => true}), + C3 = connect_client(#{port => Port1, clientid => ClientId3, clean_start => true}), + C4 = connect_client(#{port => Port2, clientid => ClientId4, clean_start => true}), + %% in-memory clients + C5 = connect_client(#{ + port => Port1, clientid => ClientId5, expiry => 0, clean_start => true + }), + C6 = connect_client(#{ + port => Port2, clientid => ClientId6, expiry => 0, clean_start => true + }), + %% offline persistent clients + ok = emqtt:stop(C3), + ok = emqtt:stop(C4), + + %% one by one + QueryParams1 = #{limit => "1"}, + Res1 = list_all_v2(APIPort, QueryParams1), + ?assertMatch( + [ + #{ + <<"data">> := [_], + <<"meta">> := + #{ + <<"hasnext">> := true, + <<"count">> := 1, + <<"cursor">> := _ + } + }, + #{ + <<"data">> := [_], + <<"meta">> := + #{ + <<"hasnext">> := true, + <<"count">> := 1, + <<"cursor">> := _ + } + }, + #{ + <<"data">> := [_], + <<"meta">> := + #{ + <<"hasnext">> := true, + <<"count">> := 1, + <<"cursor">> := _ + } + }, + #{ + <<"data">> := [_], + <<"meta">> := + #{ + <<"hasnext">> := true, + <<"count">> := 1, + <<"cursor">> := _ + } + }, + #{ + <<"data">> := [_], + <<"meta">> := + #{ + <<"hasnext">> := true, + <<"count">> := 1, + <<"cursor">> := _ + } + }, + #{ + <<"data">> := [_], + <<"meta">> := + #{ + <<"hasnext">> := false, + <<"count">> := 1 + } + } + ], + Res1 + ), + assert_contains_clientids(Res1, AllClientIds), + + %% Reusing the same cursors yield the same pages + traverse_in_reverse_v2(APIPort, QueryParams1, Res1), + + %% paging + QueryParams2 = #{limit => "4"}, + Res2 = list_all_v2(APIPort, QueryParams2), + ?assertMatch( + [ + #{ + <<"data">> := [_, _, _, _], + <<"meta">> := + #{ + <<"hasnext">> := true, + <<"count">> := 4, + <<"cursor">> := _ + } + }, + #{ + <<"data">> := [_, _], + <<"meta">> := + #{ + <<"hasnext">> := false, + <<"count">> := 2 + } + } + ], + Res2 + ), + assert_contains_clientids(Res2, AllClientIds), + traverse_in_reverse_v2(APIPort, QueryParams2, Res2), + + QueryParams3 = #{limit => "2"}, + Res3 = list_all_v2(APIPort, QueryParams3), + ?assertMatch( + [ + #{ + <<"data">> := [_, _], + <<"meta">> := + #{ + <<"hasnext">> := true, + <<"count">> := 2, + <<"cursor">> := _ + } + }, + #{ + <<"data">> := [_, _], + <<"meta">> := + #{ + <<"hasnext">> := true, + <<"count">> := 2, + <<"cursor">> := _ + } + }, + #{ + <<"data">> := [_, _], + <<"meta">> := + #{ + <<"hasnext">> := false, + <<"count">> := 2 + } + } + ], + Res3 + ), + assert_contains_clientids(Res3, AllClientIds), + traverse_in_reverse_v2(APIPort, QueryParams3, Res3), + + %% fuzzy filters + QueryParams4 = #{limit => "100", like_clientid => "ca"}, + Res4 = list_all_v2(APIPort, QueryParams4), + ?assertMatch( + [ + #{ + <<"data">> := [_, _, _], + <<"meta">> := + #{ + <<"hasnext">> := false, + <<"count">> := 3 + } + } + ], + Res4 + ), + assert_contains_clientids(Res4, [ClientId1, ClientId4, ClientId5]), + traverse_in_reverse_v2(APIPort, QueryParams4, Res4), + QueryParams5 = #{limit => "1", like_clientid 
=> "ca"}, + Res5 = list_all_v2(APIPort, QueryParams5), + ?assertMatch( + [ + #{ + <<"data">> := [_], + <<"meta">> := + #{ + <<"hasnext">> := true, + <<"count">> := 1, + <<"cursor">> := _ + } + }, + #{ + <<"data">> := [_], + <<"meta">> := + #{ + <<"hasnext">> := true, + <<"count">> := 1, + <<"cursor">> := _ + } + }, + #{ + <<"data">> := [_], + <<"meta">> := + #{ + <<"hasnext">> := false, + <<"count">> := 1 + } + } + ], + Res5 + ), + assert_contains_clientids(Res5, [ClientId1, ClientId4, ClientId5]), + traverse_in_reverse_v2(APIPort, QueryParams5, Res5), + + lists:foreach( + fun(C) -> + {_, {ok, _}} = + ?wait_async_action( + emqtt:stop(C), + #{?snk_kind := emqx_cm_clean_down} + ) + end, + [C1, C2, C5, C6] + ), + + %% Verify that a malicious cursor that could generate an atom on the node is + %% rejected + EvilAtomBin0 = <<131, 100, 0, 5, "some_atom_that_doesnt_exist_on_the_remote_node">>, + EvilAtomBin = emqx_base62:encode(EvilAtomBin0), + + ?assertMatch( + {error, {{_, 400, _}, _, #{<<"message">> := <<"bad cursor">>}}}, + list_v2_request(APIPort, #{limit => "1", cursor => EvilAtomBin}) + ), + %% Verify that the atom was not created + erpc:call(N1, fun() -> + ?assertError(badarg, binary_to_term(EvilAtomBin0, [safe])) + end), + ?assert(is_atom(binary_to_term(EvilAtomBin0))), + + lists:foreach( + fun(ClientId) -> + ok = erpc:call(N1, emqx_persistent_session_ds, destroy_session, [ClientId]) + end, + AllClientIds + ), + + ok + end, + [] + ), + ok. + +t_cursor_serde_prop(_Config) -> + ?assert(proper:quickcheck(cursor_serde_prop(), [{numtests, 100}, {to_file, user}])). + +cursor_serde_prop() -> + ?FORALL( + NumNodes, + range(1, 10), + ?FORALL( + Cursor, + list_clients_cursor_gen(NumNodes), + begin + Nodes = lists:seq(1, NumNodes), + Bin = emqx_mgmt_api_clients:serialize_cursor(Cursor), + Res = emqx_mgmt_api_clients:parse_cursor(Bin, Nodes), + ?WHENFAIL( + ct:pal("original:\n ~p\nroundtrip:\n ~p", [Cursor, Res]), + {ok, Cursor} =:= Res + ) + end + ) + ). + +list_clients_cursor_gen(NumNodes) -> + oneof([ + lists_clients_ets_cursor_gen(NumNodes), + lists_clients_ds_cursor_gen() + ]). + +-define(CURSOR_TYPE_ETS, 1). +-define(CURSOR_TYPE_DS, 2). + +lists_clients_ets_cursor_gen(NumNodes) -> + ?LET( + {NodeIdx, Cont}, + {range(1, NumNodes), oneof([undefined, tuple()])}, + #{ + type => ?CURSOR_TYPE_ETS, + node => NodeIdx, + node_idx => NodeIdx, + cont => Cont + } + ). + +lists_clients_ds_cursor_gen() -> + ?LET( + Iter, + oneof(['$end_of_table', list(term())]), + #{ + type => ?CURSOR_TYPE_DS, + iterator => Iter + } + ). + time_string_to_epoch_millisecond(DateTime) -> time_string_to_epoch(DateTime, millisecond). @@ -1472,6 +1808,31 @@ list_request(Port, QueryParams) -> Path = emqx_mgmt_api_test_util:api_path(Host, ["clients"]), request(get, Path, [], QueryParams). +list_v2_request(Port, QueryParams = #{}) -> + Host = "http://127.0.0.1:" ++ integer_to_list(Port), + Path = emqx_mgmt_api_test_util:api_path(Host, ["clients_v2"]), + QS = uri_string:compose_query(maps:to_list(emqx_utils_maps:binary_key_map(QueryParams))), + request(get, Path, [], QS). + +list_all_v2(Port, QueryParams = #{}) -> + do_list_all_v2(Port, QueryParams, _Acc = []). 
+ +do_list_all_v2(Port, QueryParams, Acc) -> + case list_v2_request(Port, QueryParams) of + {ok, {{_, 200, _}, _, Resp = #{<<"meta">> := #{<<"cursor">> := Cursor}}}} -> + do_list_all_v2(Port, QueryParams#{cursor => Cursor}, [Resp | Acc]); + {ok, {{_, 200, _}, _, Resp = #{<<"meta">> := #{<<"hasnext">> := false}}}} -> + lists:reverse([Resp | Acc]); + Other -> + error( + {unexpected_response, #{ + acc_so_far => Acc, + response => Other, + query_params => QueryParams + }} + ) + end. + lookup_request(ClientId) -> lookup_request(ClientId, 18083). @@ -1535,3 +1896,44 @@ connect_client(Opts) -> ]), {ok, _} = emqtt:connect(C), C. + +assert_contains_clientids(Results, ExpectedClientIds) -> + ContainedClientIds = [ + ClientId + || #{<<"data">> := Rows} <- Results, + #{<<"clientid">> := ClientId} <- Rows + ], + ?assertEqual( + lists:sort(ExpectedClientIds), + lists:sort(ContainedClientIds), + #{results => Results} + ). + +traverse_in_reverse_v2(APIPort, QueryParams0, Results) -> + Cursors0 = + lists:map( + fun(#{<<"meta">> := Meta}) -> + maps:get(<<"cursor">>, Meta, <<"wontbeused">>) + end, + Results + ), + Cursors1 = [<<"none">> | lists:droplast(Cursors0)], + DirectOrderClientIds = [ + ClientId + || #{<<"data">> := Rows} <- Results, + #{<<"clientid">> := ClientId} <- Rows + ], + ReverseCursors = lists:reverse(Cursors1), + do_traverse_in_reverse_v2( + APIPort, QueryParams0, ReverseCursors, DirectOrderClientIds, _Acc = [] + ). + +do_traverse_in_reverse_v2(_APIPort, _QueryParams0, _Cursors = [], DirectOrderClientIds, Acc) -> + ?assertEqual(DirectOrderClientIds, Acc); +do_traverse_in_reverse_v2(APIPort, QueryParams0, [Cursor | Rest], DirectOrderClientIds, Acc) -> + QueryParams = QueryParams0#{cursor => Cursor}, + Res0 = list_v2_request(APIPort, QueryParams), + ?assertMatch({ok, {{_, 200, _}, _, #{<<"data">> := _}}}, Res0), + {ok, {{_, 200, _}, _, #{<<"data">> := Rows}}} = Res0, + ClientIds = [ClientId || #{<<"clientid">> := ClientId} <- Rows], + do_traverse_in_reverse_v2(APIPort, QueryParams0, Rest, DirectOrderClientIds, ClientIds ++ Acc). diff --git a/apps/emqx_management/test/emqx_mgmt_api_ds_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_api_ds_SUITE.erl new file mode 100644 index 000000000..fef9276ca --- /dev/null +++ b/apps/emqx_management/test/emqx_mgmt_api_ds_SUITE.erl @@ -0,0 +1,180 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- +-module(emqx_mgmt_api_ds_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). + +-import(emqx_mgmt_api_test_util, [api_path/1, request_api/2, request_api_with_body/3]). + +all() -> + emqx_common_test_helpers:all(?MODULE). 
+ +init_per_suite(Config) -> + Apps = emqx_cth_suite:start( + [ + {emqx, "session_persistence.enable = true"}, + emqx_management, + {emqx_dashboard, "dashboard.listeners.http { enable = true, bind = 18083 }"} + ], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), + {ok, _} = emqx_common_test_http:create_default_app(), + [{suite_apps, Apps} | Config]. + +end_per_suite(Config) -> + ok = emqx_cth_suite:stop(?config(suite_apps, Config)). + +init_per_testcase(_, Config) -> + Config. + +end_per_testcase(_, Config) -> + Config. + +t_get_sites(_) -> + Path = api_path(["ds", "sites"]), + {ok, Response} = request_api(get, Path), + ?assertEqual( + [emqx_ds_replication_layer_meta:this_site()], + emqx_utils_json:decode(Response, [return_maps]) + ). + +t_get_storages(_) -> + Path = api_path(["ds", "storages"]), + {ok, Response} = request_api(get, Path), + ?assertEqual( + [<<"emqx_persistent_message">>], + emqx_utils_json:decode(Response, [return_maps]) + ). + +t_get_site(_) -> + %% Unknown sites must result in error 404: + Path404 = api_path(["ds", "sites", "unknown_site"]), + ?assertMatch( + {error, {_, 404, _}}, + request_api(get, Path404) + ), + %% Valid path: + Path = api_path(["ds", "sites", emqx_ds_replication_layer_meta:this_site()]), + {ok, Response} = request_api(get, Path), + ThisNode = atom_to_binary(node()), + ?assertMatch( + #{ + <<"node">> := ThisNode, + <<"up">> := true, + <<"shards">> := + [ + #{ + <<"storage">> := <<"emqx_persistent_message">>, + <<"id">> := _, + <<"status">> := <<"up">> + } + | _ + ] + }, + emqx_utils_json:decode(Response, [return_maps]) + ). + +t_get_db(_) -> + %% Unknown DBs must result in error 400 (since the DS parameter is an enum): + Path400 = api_path(["ds", "storages", "unknown_ds"]), + ?assertMatch( + {error, {_, 400, _}}, + request_api(get, Path400) + ), + %% Valid path: + Path = api_path(["ds", "storages", "emqx_persistent_message"]), + {ok, Response} = request_api(get, Path), + ThisSite = emqx_ds_replication_layer_meta:this_site(), + ?assertMatch( + #{ + <<"name">> := <<"emqx_persistent_message">>, + <<"shards">> := + [ + #{ + <<"id">> := _, + <<"replicas">> := + [ + #{ + <<"site">> := ThisSite, + <<"status">> := <<"up">> + } + | _ + ] + } + | _ + ] + }, + emqx_utils_json:decode(Response) + ). + +t_get_replicas(_) -> + %% Unknown DBs must result in error 400 (since the DS parameter is an enum): + Path400 = api_path(["ds", "storages", "unknown_ds", "replicas"]), + ?assertMatch( + {error, {_, 400, _}}, + request_api(get, Path400) + ), + %% Valid path: + Path = api_path(["ds", "storages", "emqx_persistent_message", "replicas"]), + {ok, Response} = request_api(get, Path), + ThisSite = emqx_ds_replication_layer_meta:this_site(), + ?assertEqual( + [ThisSite], + emqx_utils_json:decode(Response) + ). + +t_put_replicas(_) -> + Path = api_path(["ds", "storages", "emqx_persistent_message", "replicas"]), + %% Error cases: + ?assertMatch( + {ok, 400, #{<<"message">> := <<"Unknown sites: invalid_site">>}}, + parse_error(request_api_with_body(put, Path, [<<"invalid_site">>])) + ), + %% Success case: + ?assertMatch( + {ok, 202, <<"OK">>}, + request_api_with_body(put, Path, [emqx_ds_replication_layer_meta:this_site()]) + ). 
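Note: the <<"Unknown sites: invalid_site">> message asserted above comes straight from the meta_result_to_binary/1 rendering at the top of this patch. Driving the same endpoint outside the test helpers might look like the following sketch (assumes a local dashboard listener on 18083, a started inets/httpc, and a valid Token placeholder):

    set_replicas(Sites, Token) ->
        Url = "http://127.0.0.1:18083/api/v5/ds/storages/emqx_persistent_message/replicas",
        Headers = [{"authorization", "Bearer " ++ Token}],
        Body = emqx_utils_json:encode(Sites),
        %% expect {ok, {{_, 202, _}, _, "\"OK\""}} on success
        httpc:request(put, {Url, Headers, "application/json", Body}, [], []).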
+ +t_join(_) -> + Path400 = api_path(["ds", "storages", "emqx_persistent_message", "replicas", "unknown_site"]), + ?assertMatch( + {error, {_, 400, _}}, + parse_error(request_api(put, Path400)) + ), + ThisSite = emqx_ds_replication_layer_meta:this_site(), + Path = api_path(["ds", "storages", "emqx_persistent_message", "replicas", ThisSite]), + ?assertMatch( + {ok, "OK"}, + request_api(put, Path) + ). + +t_leave(_) -> + ThisSite = emqx_ds_replication_layer_meta:this_site(), + Path = api_path(["ds", "storages", "emqx_persistent_message", "replicas", ThisSite]), + ?assertMatch( + {error, {_, 400, _}}, + request_api(delete, Path) + ). + +parse_error({ok, Code, JSON}) -> + {ok, Code, emqx_utils_json:decode(JSON)}; +parse_error(Err) -> + Err. diff --git a/apps/emqx_management/test/emqx_mgmt_api_subscription_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_api_subscription_SUITE.erl index 356ae97e4..435a837e3 100644 --- a/apps/emqx_management/test/emqx_mgmt_api_subscription_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_api_subscription_SUITE.erl @@ -36,17 +36,72 @@ -define(TOPIC_SORT, #{?TOPIC1 => 1, ?TOPIC2 => 2}). all() -> - emqx_common_test_helpers:all(?MODULE). + [ + {group, mem}, + {group, persistent} + ]. + +groups() -> + CommonTCs = emqx_common_test_helpers:all(?MODULE), + [ + {mem, CommonTCs}, + %% Shared subscriptions are currently not supported: + {persistent, CommonTCs -- [t_list_with_shared_sub, t_subscription_api]} + ]. init_per_suite(Config) -> - emqx_mgmt_api_test_util:init_suite(), + Apps = emqx_cth_suite:start( + [ + {emqx, + "session_persistence {\n" + " enable = true\n" + " renew_streams_interval = 10ms\n" + "}"}, + emqx_management, + emqx_mgmt_api_test_util:emqx_dashboard() + ], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), + [{apps, Apps} | Config]. + +end_per_suite(Config) -> + ok = emqx_cth_suite:stop(?config(apps, Config)). + +init_per_group(persistent, Config) -> + ClientConfig = #{ + username => ?USERNAME, + clientid => ?CLIENTID, + proto_ver => v5, + clean_start => true, + properties => #{'Session-Expiry-Interval' => 300} + }, + [{client_config, ClientConfig}, {durable, true} | Config]; +init_per_group(mem, Config) -> + ClientConfig = #{ + username => ?USERNAME, clientid => ?CLIENTID, proto_ver => v5, clean_start => true + }, + [{client_config, ClientConfig}, {durable, false} | Config]. + +end_per_group(_, Config) -> Config. -end_per_suite(_) -> - emqx_mgmt_api_test_util:end_suite(). +init_per_testcase(_TC, Config) -> + case ?config(client_config, Config) of + ClientConfig when is_map(ClientConfig) -> + {ok, Client} = emqtt:start_link(ClientConfig), + {ok, _} = emqtt:connect(Client), + [{client, Client} | Config]; + _ -> + Config + end. + +end_per_testcase(_TC, Config) -> + Client = proplists:get_value(client, Config), + emqtt:disconnect(Client). 
t_subscription_api(Config) -> Client = proplists:get_value(client, Config), + Durable = atom_to_list(?config(durable, Config)), {ok, _, _} = emqtt:subscribe( Client, [ {?TOPIC1, [{rh, ?TOPIC1RH}, {rap, ?TOPIC1RAP}, {nl, ?TOPIC1NL}, {qos, ?TOPIC1QOS}]} @@ -54,12 +109,13 @@ t_subscription_api(Config) -> ), {ok, _, _} = emqtt:subscribe(Client, ?TOPIC2), Path = emqx_mgmt_api_test_util:api_path(["subscriptions"]), + timer:sleep(100), {ok, Response} = emqx_mgmt_api_test_util:request_api(get, Path), Data = emqx_utils_json:decode(Response, [return_maps]), Meta = maps:get(<<"meta">>, Data), ?assertEqual(1, maps:get(<<"page">>, Meta)), ?assertEqual(emqx_mgmt:default_row_limit(), maps:get(<<"limit">>, Meta)), - ?assertEqual(2, maps:get(<<"count">>, Meta)), + ?assertEqual(2, maps:get(<<"count">>, Meta), Data), Subscriptions = maps:get(<<"data">>, Data), ?assertEqual(length(Subscriptions), 2), Sort = @@ -90,7 +146,8 @@ t_subscription_api(Config) -> {"node", atom_to_list(node())}, {"qos", "0"}, {"share_group", "test_group"}, - {"match_topic", "t/#"} + {"match_topic", "t/#"}, + {"durable", Durable} ], Headers = emqx_mgmt_api_test_util:auth_header_(), @@ -103,6 +160,7 @@ t_subscription_api(Config) -> t_subscription_fuzzy_search(Config) -> Client = proplists:get_value(client, Config), + Durable = atom_to_list(?config(durable, Config)), Topics = [ <<"t/foo">>, <<"t/foo/bar">>, @@ -116,7 +174,8 @@ t_subscription_fuzzy_search(Config) -> MatchQs = [ {"clientid", ?CLIENTID}, {"node", atom_to_list(node())}, - {"match_topic", "t/#"} + {"match_topic", "t/#"}, + {"durable", Durable} ], MatchData1 = #{<<"meta">> := MatchMeta1} = request_json(get, MatchQs, Headers), @@ -130,12 +189,13 @@ t_subscription_fuzzy_search(Config) -> LimitMatchQuery = [ {"clientid", ?CLIENTID}, {"match_topic", "+/+/+"}, - {"limit", "3"} + {"limit", "3"}, + {"durable", Durable} ], MatchData2 = #{<<"meta">> := MatchMeta2} = request_json(get, LimitMatchQuery, Headers), ?assertEqual(#{<<"page">> => 1, <<"limit">> => 3, <<"hasnext">> => true}, MatchMeta2), - ?assertEqual(3, length(maps:get(<<"data">>, MatchData2))), + ?assertEqual(3, length(maps:get(<<"data">>, MatchData2)), MatchData2), MatchData2P2 = #{<<"meta">> := MatchMeta2P2} = @@ -176,8 +236,8 @@ t_list_with_shared_sub(_Config) -> ok. -t_list_with_invalid_match_topic(_Config) -> - Client = proplists:get_value(client, _Config), +t_list_with_invalid_match_topic(Config) -> + Client = proplists:get_value(client, Config), RealTopic = <<"t/+">>, Topic = <<"$share/g1/", RealTopic/binary>>, @@ -212,12 +272,3 @@ request_json(Method, Query, Headers) when is_list(Query) -> path() -> emqx_mgmt_api_test_util:api_path(["subscriptions"]). - -init_per_testcase(_TC, Config) -> - {ok, Client} = emqtt:start_link(#{username => ?USERNAME, clientid => ?CLIENTID, proto_ver => v5}), - {ok, _} = emqtt:connect(Client), - [{client, Client} | Config]. - -end_per_testcase(_TC, Config) -> - Client = proplists:get_value(client, Config), - emqtt:disconnect(Client). diff --git a/apps/emqx_management/test/emqx_mgmt_api_topics_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_api_topics_SUITE.erl index 55113c9e2..a8f912802 100644 --- a/apps/emqx_management/test/emqx_mgmt_api_topics_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_api_topics_SUITE.erl @@ -187,6 +187,30 @@ t_shared_topics(_Configs) -> ok = emqtt:stop(Client). 
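Note: the {"durable", Durable} query parameter exercised in these tests maps onto the three-way dispatch added in do_subscriptions_query/1 earlier in the patch:

    %% GET /api/v5/subscriptions?durable=false -> in-memory subscriptions only
    %% GET /api/v5/subscriptions?durable=true  -> durable (session DS) subscriptions only
    %% GET /api/v5/subscriptions               -> both, stitched together by merge_queries/3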
+t_queue_topics(_Configs) -> + Node = atom_to_binary(node(), utf8), + RealTopic = <<"t/+">>, + Topic = <<"$queue/", RealTopic/binary>>, + + Client = client(?FUNCTION_NAME), + {ok, _, _} = emqtt:subscribe(Client, Topic), + {ok, _, _} = emqtt:subscribe(Client, RealTopic), + + %% exact match with shared topic + MatchData = request_json(get, ["topics"], [ + {"topic", Topic}, + {"node", atom_to_list(node())} + ]), + ?assertMatch( + #{ + <<"data">> := [#{<<"topic">> := Topic, <<"node">> := Node}], + <<"meta">> := #{<<"page">> := 1, <<"limit">> := 100, <<"count">> := 1} + }, + MatchData + ), + + ok = emqtt:stop(Client). + t_shared_topics_invalid(_Config) -> %% no real topic InvalidShareTopicFilter = <<"$share/group">>, diff --git a/apps/emqx_management/test/emqx_mgmt_api_trace_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_api_trace_SUITE.erl index cb93bc9d6..c5f5c475d 100644 --- a/apps/emqx_management/test/emqx_mgmt_api_trace_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_api_trace_SUITE.erl @@ -23,6 +23,7 @@ -include_lib("kernel/include/file.hrl"). -include_lib("stdlib/include/zip.hrl"). -include_lib("snabbkaffe/include/snabbkaffe.hrl"). +-include_lib("emqx/include/logger.hrl"). %%-------------------------------------------------------------------- %% Setups @@ -122,6 +123,233 @@ t_http_test(_Config) -> unload(), ok. +t_http_test_rule_trace(_Config) -> + emqx_trace:clear(), + load(), + %% create + Name = atom_to_binary(?FUNCTION_NAME), + Trace = [ + {<<"name">>, Name}, + {<<"type">>, <<"ruleid">>}, + {<<"ruleid">>, Name} + ], + + {ok, Create} = request_api(post, api_path("trace"), Trace), + ?assertMatch(#{<<"name">> := Name}, json(Create)), + + {ok, List} = request_api(get, api_path("trace")), + [Data] = json(List), + ?assertEqual(Name, maps:get(<<"name">>, Data)), + + %% update + {ok, Update} = request_api(put, api_path(iolist_to_binary(["trace/", Name, "/stop"])), #{}), + ?assertEqual( + #{ + <<"enable">> => false, + <<"name">> => Name + }, + json(Update) + ), + {ok, List1} = request_api(get, api_path("trace")), + [Data1] = json(List1), + Node = atom_to_binary(node()), + ?assertMatch( + #{ + <<"status">> := <<"stopped">>, + <<"name">> := Name, + <<"log_size">> := #{Node := _}, + <<"start_at">> := _, + <<"end_at">> := _, + <<"type">> := <<"ruleid">>, + <<"ruleid">> := Name + }, + Data1 + ), + + %% delete + {ok, Delete} = request_api(delete, api_path(["trace/", Name])), + ?assertEqual(<<>>, Delete), + + emqx_trace:clear(), + unload(), + ok. 
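Note: the next test exercises the JSON trace formatter. Each line of a JSON-formatted trace log is one standalone JSON object, which is what makes the line-by-line decode in the test work; a minimal consumer sketch under that assumption (not part of the patch):

    %% read and decode a JSON-formatted trace log, one object per line:
    read_json_trace(File) ->
        {ok, Bin} = file:read_file(File),
        [
            emqx_utils_json:decode(Line, [return_maps])
         || Line <- binary:split(Bin, <<"\n">>, [global, trim_all])
        ].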
+
+t_http_test_json_formatter(_Config) ->
+    emqx_trace:clear(),
+    load(),
+
+    Name = <<"testname">>,
+    Topic = <<"/x/y/z">>,
+    Trace = [
+        {<<"name">>, Name},
+        {<<"type">>, <<"topic">>},
+        {<<"topic">>, Topic},
+        {<<"formatter">>, <<"json">>}
+    ],
+
+    {ok, Create} = request_api(post, api_path("trace"), Trace),
+    ?assertMatch(#{<<"name">> := Name}, json(Create)),
+
+    {ok, List} = request_api(get, api_path("trace")),
+    [Data] = json(List),
+    ?assertEqual(<<"json">>, maps:get(<<"formatter">>, Data)),
+
+    {ok, List1} = request_api(get, api_path("trace")),
+    [Data1] = json(List1),
+    ?assertMatch(
+        #{
+            <<"formatter">> := <<"json">>
+        },
+        Data1
+    ),
+
+    %% Check that the log is empty
+    ok = emqx_trace_handler_SUITE:filesync(Name, topic),
+    {ok, _Detail} = request_api(get, api_path("trace/" ++ binary_to_list(Name) ++ "/log_detail")),
+    %% Trace is empty, which results in a not-found error
+    {error, _} = request_api(get, api_path("trace/" ++ binary_to_list(Name) ++ "/download")),
+
+    %% Start a client and send a message to get info into the log
+    ClientId = <<"my_client_id">>,
+    {ok, Client} = emqtt:start_link([{clean_start, true}, {clientid, ClientId}]),
+    {ok, _} = emqtt:connect(Client),
+    %% Normal message
+    emqtt:publish(Client, Topic, #{}, <<"log_this_message">>, [{qos, 2}]),
+    %% Escape line breaks
+    emqtt:publish(Client, Topic, #{}, <<"\nlog\nthis\nmessage">>, [{qos, 2}]),
+    %% Escape escape character
+    emqtt:publish(Client, Topic, #{}, <<"\\\nlog\n_\\n_this\nmessage\\">>, [{qos, 2}]),
+    %% Escape end of string
+    emqtt:publish(Client, Topic, #{}, <<"\"log_this_message\"">>, [{qos, 2}]),
+
+    %% Manually create some trace messages to test the JSON formatter
+
+    %% String key and value
+    ?TRACE("CUSTOM", "my_log_msg", #{topic => Topic, "str" => "str"}),
+    %% Log Erlang term
+    ?TRACE("CUSTOM", "my_log_msg", #{topic => Topic, term => {notjson}}),
+    %% Log Erlang term key
+    ?TRACE("CUSTOM", "my_log_msg", #{topic => Topic, {'notjson'} => term}),
+    %% Log Integer
+    ?TRACE("CUSTOM", "my_log_msg", #{topic => Topic, integer => 42}),
+    %% Log Float
+    ?TRACE("CUSTOM", "my_log_msg", #{topic => Topic, float => 1.2}),
+    %% Log Integer Key
+    ?TRACE("CUSTOM", "my_log_msg", #{topic => Topic, 42 => integer}),
+    %% Log Float Key
+    ?TRACE("CUSTOM", "my_log_msg", #{topic => Topic, 1.2 => float}),
+    %% Log Map Key
+    ?TRACE("CUSTOM", "my_log_msg", #{topic => Topic, #{} => value}),
+    %% Empty submap
+    ?TRACE("CUSTOM", "my_log_msg", #{topic => Topic, sub => #{}}),
+    %% Non-empty submap
+    ?TRACE("CUSTOM", "my_log_msg", #{topic => Topic, sub => #{key => value}}),
+    %% Boolean values
+    ?TRACE("CUSTOM", "my_log_msg", #{topic => Topic, true => true, false => false}),
+    %% Key value list
+    ?TRACE("CUSTOM", "my_log_msg", #{
+        topic => Topic,
+        list => [
+            {<<"key">>, <<"value">>},
+            {<<"key2">>, <<"value2">>}
+        ]
+    }),
+    %% We do special formatting for client_ids and rule_ids
+    ?TRACE("CUSTOM", "my_log_msg", #{
+        topic => Topic,
+        client_ids => maps:from_keys([<<"a">>, <<"b">>, <<"c">>], true)
+    }),
+    ?TRACE("CUSTOM", "my_log_msg", #{
+        topic => Topic,
+        rule_ids => maps:from_keys([<<"a">>, <<"b">>, <<"c">>], true)
+    }),
+    %% action_id should be rendered as action_info
+    ?TRACE("CUSTOM", "my_log_msg", #{
+        topic => Topic,
+        action_id =>
+            <<"action:http:emqx_bridge_http_test_lib:connector:http:emqx_bridge_http_test_lib">>
+    }),
+    ok = emqx_trace_handler_SUITE:filesync(Name, topic),
+    {ok, _Detail2} = request_api(get, api_path("trace/" ++ binary_to_list(Name) ++ "/log_detail")),
+    {ok, Bin} = request_api(get,
api_path("trace/" ++ binary_to_list(Name) ++ "/download")), + {ok, [ + _Comment, + #zip_file{ + name = _ZipName, + info = #file_info{size = Size, type = regular, access = read_write} + } + ]} = zip:table(Bin), + ?assert(Size > 0), + {ok, [{_, LogContent}]} = zip:unzip(Bin, [memory]), + LogEntriesTrailing = string:split(LogContent, "\n", all), + LogEntries = lists:droplast(LogEntriesTrailing), + DecodedLogEntries = [ + begin + ct:pal("LOG ENTRY\n~s\n", [JSONEntry]), + emqx_utils_json:decode(JSONEntry) + end + || JSONEntry <- LogEntries + ], + ?assertMatch( + [ + #{<<"meta">> := #{<<"payload">> := <<"log_this_message">>}}, + #{<<"meta">> := #{<<"payload">> := <<"\nlog\nthis\nmessage">>}}, + #{ + <<"meta">> := #{<<"payload">> := <<"\\\nlog\n_\\n_this\nmessage\\">>} + }, + #{<<"meta">> := #{<<"payload">> := <<"\"log_this_message\"">>}}, + #{<<"meta">> := #{<<"str">> := <<"str">>}}, + #{<<"meta">> := #{<<"term">> := <<"{notjson}">>}}, + #{<<"meta">> := <<_/binary>>}, + #{<<"meta">> := #{<<"integer">> := 42}}, + #{<<"meta">> := #{<<"float">> := 1.2}}, + #{<<"meta">> := <<_/binary>>}, + #{<<"meta">> := <<_/binary>>}, + #{<<"meta">> := <<_/binary>>}, + #{<<"meta">> := #{<<"sub">> := #{}}}, + #{<<"meta">> := #{<<"sub">> := #{<<"key">> := <<"value">>}}}, + #{<<"meta">> := #{<<"true">> := <<"true">>, <<"false">> := <<"false">>}}, + #{ + <<"meta">> := #{ + <<"list">> := #{ + <<"key">> := <<"value">>, + <<"key2">> := <<"value2">> + } + } + }, + #{ + <<"meta">> := #{ + <<"client_ids">> := [<<"a">>, <<"b">>, <<"c">>] + } + }, + #{ + <<"meta">> := #{ + <<"rule_ids">> := [<<"a">>, <<"b">>, <<"c">>] + } + }, + #{ + <<"meta">> := #{ + <<"action_info">> := #{ + <<"type">> := <<"http">>, + <<"name">> := <<"emqx_bridge_http_test_lib">> + } + } + } + | _ + ], + DecodedLogEntries + ), + {ok, Delete} = request_api(delete, api_path("trace/" ++ binary_to_list(Name))), + ?assertEqual(<<>>, Delete), + + {ok, List2} = request_api(get, api_path("trace")), + ?assertEqual([], json(List2)), + + ok = emqtt:disconnect(Client), + unload(), + emqx_trace:clear(), + ok. + t_create_failed(_Config) -> load(), Trace = [{<<"type">>, <<"topic">>}, {<<"topic">>, <<"/x/y/z">>}], @@ -252,13 +480,16 @@ t_log_file(_Config) -> ok. create_trace(Name, ClientId, Start) -> + create_trace(Name, clientid, ClientId, Start). + +create_trace(Name, Type, TypeValue, Start) -> ?check_trace( #{timetrap => 900}, begin {ok, _} = emqx_trace:create([ {<<"name">>, Name}, - {<<"type">>, clientid}, - {<<"clientid">>, ClientId}, + {<<"type">>, Type}, + {atom_to_binary(Type), TypeValue}, {<<"start_at">>, Start} ]), ?block_until(#{?snk_kind := update_trace_done}) @@ -268,6 +499,16 @@ create_trace(Name, ClientId, Start) -> end ). +create_rule_trace(RuleId) -> + Now = erlang:system_time(second), + emqx_mgmt_api_trace_SUITE:create_trace(atom_to_binary(?FUNCTION_NAME), ruleid, RuleId, Now - 2). + +t_create_rule_trace(_Config) -> + load(), + create_rule_trace(atom_to_binary(?FUNCTION_NAME)), + unload(), + ok. + t_stream_log(_Config) -> emqx_trace:clear(), load(), diff --git a/apps/emqx_management/test/emqx_mgmt_cli_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_cli_SUITE.erl index c81881c95..b1d646b40 100644 --- a/apps/emqx_management/test/emqx_mgmt_cli_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_cli_SUITE.erl @@ -19,6 +19,7 @@ -compile(nowarn_export_all). -include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). all() -> emqx_common_test_helpers:all(?MODULE). 
@@ -31,6 +32,47 @@ init_per_suite(Config) ->
 end_per_suite(_) ->
     emqx_mgmt_api_test_util:end_suite([emqx_management, emqx_conf]).
 
+init_per_testcase(t_autocluster_leave = TC, Config) ->
+    [Core1, Core2, Repl1, Repl2] =
+        Nodes = [
+            t_autocluster_leave_core1,
+            t_autocluster_leave_core2,
+            t_autocluster_leave_replicant1,
+            t_autocluster_leave_replicant2
+        ],
+
+    NodeNames = [emqx_cth_cluster:node_name(N) || N <- Nodes],
+    AppSpec = [
+        emqx,
+        {emqx_conf, #{
+            config => #{
+                cluster => #{
+                    discovery_strategy => static,
+                    static => #{seeds => NodeNames}
+                }
+            }
+        }},
+        emqx_management
+    ],
+    Cluster = emqx_cth_cluster:start(
+        [
+            {Core1, #{role => core, apps => AppSpec}},
+            {Core2, #{role => core, apps => AppSpec}},
+            {Repl1, #{role => replicant, apps => AppSpec}},
+            {Repl2, #{role => replicant, apps => AppSpec}}
+        ],
+        #{work_dir => emqx_cth_suite:work_dir(TC, Config)}
+    ),
+    [{cluster, Cluster} | Config];
+init_per_testcase(_TC, Config) ->
+    Config.
+
+end_per_testcase(_TC, Config) ->
+    case ?config(cluster, Config) of
+        undefined -> ok;
+        Cluster -> emqx_cth_cluster:stop(Cluster)
+    end.
+
 t_status(_Config) ->
     emqx_ctl:run_command([]),
     emqx_ctl:run_command(["status"]),
@@ -263,3 +305,44 @@ t_admin(_Config) ->
     %% admins passwd <Username> <Password>  # Reset dashboard user password
     %% admins del <Username>                # Delete dashboard user
     ok.
+
+t_autocluster_leave(Config) ->
+    [Core1, Core2, Repl1, Repl2] = Cluster = ?config(cluster, Config),
+    %% Mria membership updates are async, so it makes sense to wait a little
+    timer:sleep(300),
+    ClusterView = [lists:sort(rpc:call(N, emqx, running_nodes, [])) || N <- Cluster],
+    [View1, View2, View3, View4] = ClusterView,
+    ?assertEqual(lists:sort(Cluster), View1),
+    ?assertEqual(View1, View2),
+    ?assertEqual(View1, View3),
+    ?assertEqual(View1, View4),
+
+    rpc:call(Core2, emqx_mgmt_cli, cluster, [["leave"]]),
+    timer:sleep(1000),
+    %% Replicant nodes can discover Core2, which is now split from the cluster,
+    %% but they are expected to ignore Core2,
+    %% since mria_lb must filter out core nodes that disabled discovery.
+    ?assertMatch([Core2], rpc:call(Core2, emqx, running_nodes, [])),
+    ?assertEqual(undefined, rpc:call(Core1, erlang, whereis, [ekka_autocluster])),
+    ?assertEqual(lists:sort([Core1, Repl1, Repl2]), rpc:call(Core1, emqx, running_nodes, [])),
+    ?assertEqual(lists:sort([Core1, Repl1, Repl2]), rpc:call(Repl1, emqx, running_nodes, [])),
+    ?assertEqual(lists:sort([Core1, Repl1, Repl2]), rpc:call(Repl2, emqx, running_nodes, [])),
+
+    rpc:call(Repl1, emqx_mgmt_cli, cluster, [["leave"]]),
+    timer:sleep(1000),
+    ?assertEqual(lists:sort([Core1, Repl2]), rpc:call(Core1, emqx, running_nodes, [])),
+
+    rpc:call(Core2, emqx_mgmt_cli, cluster, [["discovery", "enable"]]),
+    rpc:call(Repl1, emqx_mgmt_cli, cluster, [["discovery", "enable"]]),
+    %% nodes will join and restart asynchronously; it may take a while to re-cluster
+    ?assertEqual(
+        ok,
+        emqx_common_test_helpers:wait_for(
+            ?FUNCTION_NAME,
+            ?LINE,
+            fun() ->
+                [lists:sort(rpc:call(N, emqx, running_nodes, [])) || N <- Cluster] =:= ClusterView
+            end,
+            10_000
+        )
+    ).
diff --git a/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl
index 36a838743..fee392479 100644
--- a/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl
+++ b/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl
@@ -18,6 +18,7 @@
 -compile(export_all).
 -compile(nowarn_export_all).
 
+-include_lib("emqx_utils/include/emqx_message.hrl").
 -include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl"). -include_lib("snabbkaffe/include/snabbkaffe.hrl"). @@ -86,6 +87,33 @@ t_empty_export_import(_Config) -> ?assertEqual(Exp, emqx_mgmt_data_backup:import(FileName)), ?assertEqual(ExpRawConf, emqx:get_raw_config([])). +t_cluster_hocon_import_mqtt_subscribers_retainer_messages(Config) -> + case emqx_release:edition() of + ce -> + ok; + ee -> + FNameEmqx44 = "emqx-export-4.4.24-retainer-mqttsub.tar.gz", + BackupFile = filename:join(?config(data_dir, Config), FNameEmqx44), + Exp = {ok, #{db_errors => #{}, config_errors => #{}}}, + ?assertEqual(Exp, emqx_mgmt_data_backup:import(BackupFile)), + RawConfAfterImport = emqx:get_raw_config([]), + %% verify that MQTT sources are imported + ?assertMatch( + #{<<"sources">> := #{<<"mqtt">> := Sources}} when map_size(Sources) > 0, + RawConfAfterImport + ), + %% verify that retainer messages are imported + ?assertMatch( + {ok, [#message{payload = <<"test-payload">>}]}, + emqx_retainer:read_message(<<"test-retained-message/1">>) + ), + %% Export and import again + {ok, #{filename := FileName}} = emqx_mgmt_data_backup:export(), + ?assertEqual(Exp, emqx_mgmt_data_backup:import(FileName)), + ?assertEqual(RawConfAfterImport, emqx:get_raw_config([])) + end, + ok. + t_cluster_hocon_export_import(Config) -> RawConfBeforeImport = emqx:get_raw_config([]), BootstrapFile = filename:join(?config(data_dir, Config), ?BOOTSTRAP_BACKUP), diff --git a/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE_data/emqx-export-4.4.24-retainer-mqttsub.tar.gz b/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE_data/emqx-export-4.4.24-retainer-mqttsub.tar.gz new file mode 100644 index 000000000..67133f19d Binary files /dev/null and b/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE_data/emqx-export-4.4.24-retainer-mqttsub.tar.gz differ diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance.app.src b/apps/emqx_node_rebalance/src/emqx_node_rebalance.app.src index beb5f2abb..e8967c556 100644 --- a/apps/emqx_node_rebalance/src/emqx_node_rebalance.app.src +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance.app.src @@ -1,6 +1,6 @@ {application, emqx_node_rebalance, [ {description, "EMQX Node Rebalance"}, - {vsn, "5.0.7"}, + {vsn, "5.0.8"}, {registered, [ emqx_node_rebalance_sup, emqx_node_rebalance, @@ -10,7 +10,10 @@ ]}, {applications, [ kernel, - stdlib + stdlib, + emqx, + emqx_ctl, + emqx_eviction_agent ]}, {mod, {emqx_node_rebalance_app, []}}, {env, []}, diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl index a054cfe1f..35461ee5b 100644 --- a/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl @@ -423,7 +423,7 @@ param_node() -> fields(rebalance_start) -> [ - {"wait_health_check", + {wait_health_check, mk( emqx_schema:timeout_duration_s(), #{ @@ -431,7 +431,7 @@ fields(rebalance_start) -> required => false } )}, - {"conn_evict_rate", + {conn_evict_rate, mk( pos_integer(), #{ @@ -439,7 +439,7 @@ fields(rebalance_start) -> required => false } )}, - {"sess_evict_rate", + {sess_evict_rate, mk( pos_integer(), #{ @@ -447,7 +447,7 @@ fields(rebalance_start) -> required => false } )}, - {"abs_conn_threshold", + {abs_conn_threshold, mk( pos_integer(), #{ @@ -455,7 +455,7 @@ fields(rebalance_start) -> required => false } )}, - {"rel_conn_threshold", + {rel_conn_threshold, mk( number(), #{ @@ -464,7 +464,7 @@ fields(rebalance_start) -> validator => [fun(Value) -> Value > 1.0 end] } )}, - 
{"abs_sess_threshold", + {abs_sess_threshold, mk( pos_integer(), #{ @@ -472,7 +472,7 @@ fields(rebalance_start) -> required => false } )}, - {"rel_sess_threshold", + {rel_sess_threshold, mk( number(), #{ @@ -481,7 +481,7 @@ fields(rebalance_start) -> validator => [fun(Value) -> Value > 1.0 end] } )}, - {"wait_takeover", + {wait_takeover, mk( emqx_schema:timeout_duration_s(), #{ @@ -489,7 +489,7 @@ fields(rebalance_start) -> required => false } )}, - {"nodes", + {nodes, mk( list(binary()), #{ @@ -501,7 +501,7 @@ fields(rebalance_start) -> ]; fields(rebalance_evacuation_start) -> [ - {"wait_health_check", + {wait_health_check, mk( emqx_schema:timeout_duration_s(), #{ @@ -509,7 +509,7 @@ fields(rebalance_evacuation_start) -> required => false } )}, - {"conn_evict_rate", + {conn_evict_rate, mk( pos_integer(), #{ @@ -517,7 +517,7 @@ fields(rebalance_evacuation_start) -> required => false } )}, - {"sess_evict_rate", + {sess_evict_rate, mk( pos_integer(), #{ @@ -525,7 +525,7 @@ fields(rebalance_evacuation_start) -> required => false } )}, - {"redirect_to", + {redirect_to, mk( binary(), #{ @@ -533,7 +533,7 @@ fields(rebalance_evacuation_start) -> required => false } )}, - {"wait_takeover", + {wait_takeover, mk( emqx_schema:timeout_duration_s(), #{ @@ -541,7 +541,7 @@ fields(rebalance_evacuation_start) -> required => false } )}, - {"migrate_to", + {migrate_to, mk( nonempty_list(binary()), #{ @@ -552,7 +552,7 @@ fields(rebalance_evacuation_start) -> ]; fields(purge_start) -> [ - {"purge_rate", + {purge_rate, mk( pos_integer(), #{ @@ -563,7 +563,7 @@ fields(purge_start) -> ]; fields(local_status_disabled) -> [ - {"status", + {status, mk( disabled, #{ @@ -574,7 +574,7 @@ fields(local_status_disabled) -> ]; fields(local_status_enabled) -> [ - {"status", + {status, mk( enabled, #{ @@ -582,7 +582,7 @@ fields(local_status_enabled) -> required => true } )}, - {"process", + {process, mk( hoconsc:enum([rebalance, evacuation]), #{ @@ -590,7 +590,7 @@ fields(local_status_enabled) -> required => true } )}, - {"state", + {state, mk( atom(), #{ @@ -598,7 +598,7 @@ fields(local_status_enabled) -> required => true } )}, - {"coordinator_node", + {coordinator_node, mk( binary(), #{ @@ -606,7 +606,7 @@ fields(local_status_enabled) -> required => false } )}, - {"connection_eviction_rate", + {connection_eviction_rate, mk( pos_integer(), #{ @@ -614,7 +614,7 @@ fields(local_status_enabled) -> required => false } )}, - {"session_eviction_rate", + {session_eviction_rate, mk( pos_integer(), #{ @@ -622,7 +622,7 @@ fields(local_status_enabled) -> required => false } )}, - {"connection_goal", + {connection_goal, mk( non_neg_integer(), #{ @@ -630,7 +630,7 @@ fields(local_status_enabled) -> required => false } )}, - {"session_goal", + {session_goal, mk( non_neg_integer(), #{ @@ -638,7 +638,7 @@ fields(local_status_enabled) -> required => false } )}, - {"disconnected_session_goal", + {disconnected_session_goal, mk( non_neg_integer(), #{ @@ -646,7 +646,7 @@ fields(local_status_enabled) -> required => false } )}, - {"session_recipients", + {session_recipients, mk( list(binary()), #{ @@ -654,7 +654,7 @@ fields(local_status_enabled) -> required => false } )}, - {"recipients", + {recipients, mk( list(binary()), #{ @@ -662,7 +662,7 @@ fields(local_status_enabled) -> required => false } )}, - {"stats", + {stats, mk( ref(status_stats), #{ @@ -673,7 +673,7 @@ fields(local_status_enabled) -> ]; fields(status_stats) -> [ - {"initial_connected", + {initial_connected, mk( non_neg_integer(), #{ @@ -681,7 +681,7 @@ fields(status_stats) -> 
required => true } )}, - {"current_connected", + {current_connected, mk( non_neg_integer(), #{ @@ -689,7 +689,7 @@ fields(status_stats) -> required => true } )}, - {"initial_sessions", + {initial_sessions, mk( non_neg_integer(), #{ @@ -697,7 +697,7 @@ fields(status_stats) -> required => true } )}, - {"current_sessions", + {current_sessions, mk( non_neg_integer(), #{ @@ -705,7 +705,7 @@ fields(status_stats) -> required => true } )}, - {"current_disconnected_sessions", + {current_disconnected_sessions, mk( non_neg_integer(), #{ @@ -716,11 +716,11 @@ fields(status_stats) -> ]; fields(global_coordinator_status) -> without( - ["status", "process", "session_goal", "session_recipients", "stats"], + [status, process, session_goal, session_recipients, stats], fields(local_status_enabled) ) ++ [ - {"donors", + {donors, mk( list(binary()), #{ @@ -728,7 +728,7 @@ fields(global_coordinator_status) -> required => false } )}, - {"donor_conn_avg", + {donor_conn_avg, mk( non_neg_integer(), #{ @@ -736,7 +736,7 @@ fields(global_coordinator_status) -> required => false } )}, - {"donor_sess_avg", + {donor_sess_avg, mk( non_neg_integer(), #{ @@ -744,7 +744,7 @@ fields(global_coordinator_status) -> required => false } )}, - {"node", + {node, mk( binary(), #{ @@ -754,9 +754,9 @@ fields(global_coordinator_status) -> )} ]; fields(global_evacuation_status) -> - without(["status", "process"], fields(local_status_enabled)) ++ + without([status, process], fields(local_status_enabled)) ++ [ - {"node", + {node, mk( binary(), #{ @@ -768,19 +768,19 @@ fields(global_evacuation_status) -> fields(global_purge_status) -> without( [ - "status", - "process", - "connection_eviction_rate", - "session_eviction_rate", - "connection_goal", - "disconnected_session_goal", - "session_recipients", - "recipients" + status, + process, + connection_eviction_rate, + session_eviction_rate, + connection_goal, + disconnected_session_goal, + session_recipients, + recipients ], fields(local_status_enabled) ) ++ [ - {"purge_rate", + {purge_rate, mk( pos_integer(), #{ @@ -788,7 +788,7 @@ fields(global_purge_status) -> required => false } )}, - {"node", + {node, mk( binary(), #{ @@ -799,7 +799,7 @@ fields(global_purge_status) -> ]; fields(global_status) -> [ - {"evacuations", + {evacuations, mk( hoconsc:array(ref(global_evacuation_status)), #{ @@ -807,7 +807,7 @@ fields(global_status) -> required => false } )}, - {"purges", + {purges, mk( hoconsc:array(ref(global_purge_status)), #{ @@ -815,7 +815,7 @@ fields(global_status) -> required => false } )}, - {"rebalances", + {rebalances, mk( hoconsc:array(ref(global_coordinator_status)), #{ diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl index 4f0fbe3c4..04a74bf28 100644 --- a/apps/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl @@ -48,7 +48,7 @@ init_per_testcase(Case, Config) -> ClusterNodes = start_cluster( Config, NodeNames, - [emqx, emqx_eviction_agent, emqx_node_rebalance] + [emqx, emqx_node_rebalance] ), ok = snabbkaffe:start_trace(), [{cluster_nodes, ClusterNodes} | Config]. diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl index ac5f809bf..bd15b6475 100644 --- a/apps/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl @@ -38,7 +38,7 @@ groups() -> ]. 
init_per_suite(Config) -> - Apps = emqx_cth_suite:start([emqx, emqx_eviction_agent, emqx_node_rebalance], #{ + Apps = emqx_cth_suite:start([emqx, emqx_node_rebalance], #{ work_dir => ?config(priv_dir, Config) }), [{apps, Apps} | Config]. @@ -60,7 +60,7 @@ init_per_testcase(Case, Config) -> ClusterNodes = emqx_cth_cluster:start( [ {case_specific_node_name(?MODULE, Case), #{ - apps => [emqx, emqx_eviction_agent, emqx_node_rebalance] + apps => [emqx, emqx_node_rebalance] }} ], #{work_dir => emqx_cth_suite:work_dir(Case, Config)} diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl index a652dea0a..06e119532 100644 --- a/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl @@ -29,7 +29,7 @@ all() -> emqx_common_test_helpers:all(?MODULE). init_per_suite(Config) -> - Apps = emqx_cth_suite:start([emqx, emqx_eviction_agent, emqx_node_rebalance], #{ + Apps = emqx_cth_suite:start([emqx, emqx_node_rebalance], #{ work_dir => ?config(priv_dir, Config) }), [{apps, Apps} | Config]. @@ -548,7 +548,6 @@ app_specs() -> #{enable => true} } }}, - emqx_eviction_agent, emqx_node_rebalance ]. diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl index 55542d320..3980b4a45 100644 --- a/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl @@ -15,7 +15,7 @@ [emqtt_connect_many/2, stop_many/1, case_specific_node_name/3] ). --define(START_APPS, [emqx, emqx_eviction_agent, emqx_node_rebalance]). +-define(START_APPS, [emqx, emqx_node_rebalance]). all() -> emqx_common_test_helpers:all(?MODULE). diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl index 4c0d13788..d27f6d6d3 100644 --- a/apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl @@ -70,7 +70,7 @@ init_per_testcase(Case, Config) -> case_specific_node_name(?MODULE, Case, '_recipient') ] end, - ClusterNodes = start_cluster(Config, NodeNames, [emqx, emqx_eviction_agent, emqx_node_rebalance]), + ClusterNodes = start_cluster(Config, NodeNames, [emqx, emqx_node_rebalance]), ok = snabbkaffe:start_trace(), [{cluster_nodes, ClusterNodes} | Config]. diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_purge_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_purge_SUITE.erl index 31844c5d0..0daeac106 100644 --- a/apps/emqx_node_rebalance/test/emqx_node_rebalance_purge_SUITE.erl +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_purge_SUITE.erl @@ -117,7 +117,6 @@ app_specs() -> config => #{delayed => #{enable => true}} }}, - emqx_eviction_agent, emqx_node_rebalance ]. 
diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_status_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_status_SUITE.erl index 888e63beb..6a7f20c4e 100644 --- a/apps/emqx_node_rebalance/test/emqx_node_rebalance_status_SUITE.erl +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_status_SUITE.erl @@ -32,7 +32,6 @@ init_per_suite(Config) -> Apps = [ emqx_conf, emqx, - emqx_eviction_agent, emqx_node_rebalance ], Cluster = [ diff --git a/apps/emqx_opentelemetry/src/emqx_opentelemetry.app.src b/apps/emqx_opentelemetry/src/emqx_opentelemetry.app.src index 81631b03a..cb7c7d32a 100644 --- a/apps/emqx_opentelemetry/src/emqx_opentelemetry.app.src +++ b/apps/emqx_opentelemetry/src/emqx_opentelemetry.app.src @@ -1,6 +1,6 @@ {application, emqx_opentelemetry, [ {description, "OpenTelemetry for EMQX Broker"}, - {vsn, "0.2.4"}, + {vsn, "0.2.5"}, {registered, []}, {mod, {emqx_otel_app, []}}, {applications, [ diff --git a/apps/emqx_opentelemetry/src/emqx_otel_cpu_sup.erl b/apps/emqx_opentelemetry/src/emqx_otel_cpu_sup.erl index fc67831be..54f88ad99 100644 --- a/apps/emqx_opentelemetry/src/emqx_otel_cpu_sup.erl +++ b/apps/emqx_opentelemetry/src/emqx_otel_cpu_sup.erl @@ -119,7 +119,9 @@ code_change(_OldVsn, State, _Extra) -> refresh(#{interval := Interval} = State) -> NState = case cpu_sup:util([]) of - {all, U, I, _} -> + {all, Use, Idle, _} -> + U = floor(Use * 100) / 100, + I = ceil(Idle * 100) / 100, State#{'cpu.use' => U, 'cpu.idle' => I}; _ -> State#{'cpu.use' => 0, 'cpu.idle' => 0} diff --git a/apps/emqx_prometheus/src/emqx_prometheus.erl b/apps/emqx_prometheus/src/emqx_prometheus.erl index 8556e82d3..450033f18 100644 --- a/apps/emqx_prometheus/src/emqx_prometheus.erl +++ b/apps/emqx_prometheus/src/emqx_prometheus.erl @@ -37,6 +37,7 @@ -include_lib("public_key/include/public_key.hrl"). -include_lib("prometheus/include/prometheus_model.hrl"). -include_lib("emqx/include/logger.hrl"). +-include_lib("emqx_durable_storage/include/emqx_ds_metrics.hrl"). -import( prometheus_model_helpers, @@ -212,11 +213,30 @@ collect_mf(?PROMETHEUS_DEFAULT_REGISTRY, Callback) -> ok = add_collect_family(Callback, cert_metric_meta(), ?MG(cert_data, RawData)), ok = add_collect_family(Callback, mria_metric_meta(), ?MG(mria_data, RawData)), + ok = maybe_add_ds_collect_family(Callback, RawData), ok = maybe_license_add_collect_family(Callback, RawData), ok; collect_mf(_Registry, _Callback) -> ok. +maybe_add_ds_collect_family(Callback, RawData) -> + case emqx_persistent_message:is_persistence_enabled() of + true -> + add_collect_family( + Callback, emqx_ds_builtin_metrics:prometheus_meta(), ?MG(ds_data, RawData) + ); + false -> + ok + end. + +maybe_collect_ds_data(Mode) -> + case emqx_persistent_message:is_persistence_enabled() of + true -> + #{ds_data => emqx_ds_builtin_metrics:prometheus_collect(Mode)}; + false -> + #{} + end. 
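+%% Durable storage (DS) metrics are gated on message persistence: both the
+%% metric family declarations and the collected data are added only when
+%% `emqx_persistent_message:is_persistence_enabled()' returns true, so the
+%% Prometheus output is unchanged for deployments without persistence.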
+ %% @private collect(<<"json">>) -> RawData = emqx_prometheus_cluster:raw_data(?MODULE, ?GET_PROM_DATA_MODE()), @@ -251,7 +271,7 @@ add_collect_family(Name, Data, Callback, Type) -> %% behaviour fetch_from_local_node(Mode) -> - {node(), #{ + {node(), (maybe_collect_ds_data(Mode))#{ stats_data => stats_data(Mode), vm_data => vm_data(Mode), cluster_data => cluster_data(Mode), @@ -480,7 +500,19 @@ emqx_collect(K = emqx_mria_lag, D) -> gauge_metrics(?MG(K, D, [])); emqx_collect(K = emqx_mria_bootstrap_time, D) -> gauge_metrics(?MG(K, D, [])); emqx_collect(K = emqx_mria_bootstrap_num_keys, D) -> gauge_metrics(?MG(K, D, [])); emqx_collect(K = emqx_mria_message_queue_len, D) -> gauge_metrics(?MG(K, D, [])); -emqx_collect(K = emqx_mria_replayq_len, D) -> gauge_metrics(?MG(K, D, [])). +emqx_collect(K = emqx_mria_replayq_len, D) -> gauge_metrics(?MG(K, D, [])); +%% DS +emqx_collect(K = ?DS_EGRESS_BATCHES, D) -> counter_metrics(?MG(K, D, [])); +emqx_collect(K = ?DS_EGRESS_BATCHES_RETRY, D) -> counter_metrics(?MG(K, D, [])); +emqx_collect(K = ?DS_EGRESS_BATCHES_FAILED, D) -> counter_metrics(?MG(K, D, [])); +emqx_collect(K = ?DS_EGRESS_MESSAGES, D) -> counter_metrics(?MG(K, D, [])); +emqx_collect(K = ?DS_EGRESS_BYTES, D) -> counter_metrics(?MG(K, D, [])); +emqx_collect(K = ?DS_EGRESS_FLUSH_TIME, D) -> gauge_metrics(?MG(K, D, [])); +emqx_collect(K = ?DS_STORE_BATCH_TIME, D) -> gauge_metrics(?MG(K, D, [])); +emqx_collect(K = ?DS_BUILTIN_NEXT_TIME, D) -> gauge_metrics(?MG(K, D, [])); +emqx_collect(K = ?DS_LTS_SEEK_COUNTER, D) -> counter_metrics(?MG(K, D, [])); +emqx_collect(K = ?DS_LTS_NEXT_COUNTER, D) -> counter_metrics(?MG(K, D, [])); +emqx_collect(K = ?DS_LTS_COLLISION_COUNTER, D) -> counter_metrics(?MG(K, D, [])). %%-------------------------------------------------------------------- %% Indicators diff --git a/apps/emqx_resource/src/emqx_resource.app.src b/apps/emqx_resource/src/emqx_resource.app.src index 272dd4e08..913cc5e8c 100644 --- a/apps/emqx_resource/src/emqx_resource.app.src +++ b/apps/emqx_resource/src/emqx_resource.app.src @@ -1,7 +1,7 @@ %% -*- mode: erlang -*- {application, emqx_resource, [ {description, "Manager for all external resources"}, - {vsn, "0.1.28"}, + {vsn, "0.1.29"}, {registered, []}, {mod, {emqx_resource_app, []}}, {applications, [ diff --git a/apps/emqx_resource/src/emqx_resource_buffer_worker.erl b/apps/emqx_resource/src/emqx_resource_buffer_worker.erl index 37f9369ff..e35453c94 100644 --- a/apps/emqx_resource/src/emqx_resource_buffer_worker.erl +++ b/apps/emqx_resource/src/emqx_resource_buffer_worker.erl @@ -64,8 +64,10 @@ -define(COLLECT_REQ_LIMIT, 1000). -define(SEND_REQ(FROM, REQUEST), {'$send_req', FROM, REQUEST}). --define(QUERY(FROM, REQUEST, SENT, EXPIRE_AT), {query, FROM, REQUEST, SENT, EXPIRE_AT}). --define(SIMPLE_QUERY(FROM, REQUEST), ?QUERY(FROM, REQUEST, false, infinity)). +-define(QUERY(FROM, REQUEST, SENT, EXPIRE_AT, TRACE_CTX), + {query, FROM, REQUEST, SENT, EXPIRE_AT, TRACE_CTX} +). +-define(SIMPLE_QUERY(FROM, REQUEST, TRACE_CTX), ?QUERY(FROM, REQUEST, false, infinity, TRACE_CTX)). -define(REPLY(FROM, SENT, RESULT), {reply, FROM, SENT, RESULT}). -define(INFLIGHT_ITEM(Ref, BatchOrQuery, IsRetriable, AsyncWorkerMRef), {Ref, BatchOrQuery, IsRetriable, AsyncWorkerMRef} @@ -77,7 +79,10 @@ -type id() :: binary(). -type index() :: pos_integer(). -type expire_at() :: infinity | integer(). --type queue_query() :: ?QUERY(reply_fun(), request(), HasBeenSent :: boolean(), expire_at()). +-type trace_context() :: map() | undefined. 
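+%% The trace context carries rule-engine information (rule id, client id,
+%% rule trigger time, `stop_action_after_render') down to the buffer worker;
+%% it is `undefined' for queries that do not originate from a rule.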
+-type queue_query() :: ?QUERY( + reply_fun(), request(), HasBeenSent :: boolean(), expire_at(), TraceCtx :: trace_context() +). -type request() :: term(). -type request_from() :: undefined | gen_statem:from(). -type timeout_ms() :: emqx_schema:timeout_duration_ms(). @@ -154,7 +159,10 @@ simple_sync_query(Id, Request, QueryOpts0) -> emqx_resource_metrics:matched_inc(Id), Ref = make_request_ref(), ReplyTo = maps:get(reply_to, QueryOpts0, undefined), - Result = call_query(force_sync, Id, Index, Ref, ?SIMPLE_QUERY(ReplyTo, Request), QueryOpts), + TraceCtx = maps:get(trace_ctx, QueryOpts0, undefined), + Result = call_query( + force_sync, Id, Index, Ref, ?SIMPLE_QUERY(ReplyTo, Request, TraceCtx), QueryOpts + ), _ = handle_query_result(Id, Result, _HasBeenSent = false), Result. @@ -167,8 +175,9 @@ simple_async_query(Id, Request, QueryOpts0) -> emqx_resource_metrics:matched_inc(Id), Ref = make_request_ref(), ReplyTo = maps:get(reply_to, QueryOpts0, undefined), + TraceCtx = maps:get(trace_ctx, QueryOpts0, undefined), Result = call_query( - async_if_possible, Id, Index, Ref, ?SIMPLE_QUERY(ReplyTo, Request), QueryOpts + async_if_possible, Id, Index, Ref, ?SIMPLE_QUERY(ReplyTo, Request, TraceCtx), QueryOpts ), _ = handle_query_result(Id, Result, _HasBeenSent = false), Result. @@ -439,10 +448,10 @@ retry_inflight_sync(Ref, QueryOrBatch, Data0) -> Result = call_query(force_sync, Id, Index, Ref, QueryOrBatch, QueryOpts), {ShouldAck, PostFn, DeltaCounters} = case QueryOrBatch of - ?QUERY(ReplyTo, _, HasBeenSent, _ExpireAt) -> + ?QUERY(ReplyTo, _, HasBeenSent, _ExpireAt, _TraceCtx) -> Reply = ?REPLY(ReplyTo, HasBeenSent, Result), reply_caller_defer_metrics(Id, Reply, QueryOpts); - [?QUERY(_, _, _, _) | _] = Batch -> + [?QUERY(_, _, _, _, _) | _] = Batch -> batch_reply_caller_defer_metrics(Id, Result, Batch, QueryOpts) end, Data1 = aggregate_counters(Data0, DeltaCounters), @@ -501,11 +510,13 @@ collect_and_enqueue_query_requests(Request0, Data0) -> ReplyFun = maps:get(async_reply_fun, Opts, undefined), HasBeenSent = false, ExpireAt = maps:get(expire_at, Opts), - ?QUERY(ReplyFun, Req, HasBeenSent, ExpireAt); + TraceCtx = maps:get(trace_ctx, Opts, undefined), + ?QUERY(ReplyFun, Req, HasBeenSent, ExpireAt, TraceCtx); (?SEND_REQ(ReplyTo, {query, Req, Opts})) -> HasBeenSent = false, ExpireAt = maps:get(expire_at, Opts), - ?QUERY(ReplyTo, Req, HasBeenSent, ExpireAt) + TraceCtx = maps:get(trace_ctx, Opts, undefined), + ?QUERY(ReplyTo, Req, HasBeenSent, ExpireAt, TraceCtx) end, Requests ), @@ -515,7 +526,7 @@ collect_and_enqueue_query_requests(Request0, Data0) -> reply_overflown([]) -> ok; -reply_overflown([?QUERY(ReplyTo, _Req, _HasBeenSent, _ExpireAt) | More]) -> +reply_overflown([?QUERY(ReplyTo, _Req, _HasBeenSent, _ExpireAt, _TraceCtx) | More]) -> do_reply_caller(ReplyTo, {error, buffer_overflow}), reply_overflown(More). @@ -572,7 +583,11 @@ flush(Data0) -> {keep_state, Data1}; {_, false} -> ?tp(buffer_worker_flush_before_pop, #{}), - {Q1, QAckRef, Batch} = replayq:pop(Q0, #{count_limit => BatchSize}), + PopOpts = #{ + count_limit => BatchSize, + stop_before => {fun stop_batching/2, initial_state} + }, + {Q1, QAckRef, Batch} = replayq:pop(Q0, PopOpts), Data2 = Data1#{queue := Q1}, ?tp(buffer_worker_flush_before_sieve_expired, #{}), Now = now_(), @@ -608,6 +623,23 @@ flush(Data0) -> end end. 
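+%% Callback for the `stop_before' option passed to replayq:pop/2 above: it
+%% compares the `stop_action_after_render' flag of each popped query against
+%% the first one, so a single batch never mixes queries with and without
+%% that flag.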
+stop_batching(Query, initial_state) -> + get_stop_flag(Query); +stop_batching(Query, PrevStopFlag) -> + case get_stop_flag(Query) =:= PrevStopFlag of + true -> + PrevStopFlag; + false -> + %% We stop because we don't want a batch with mixed values for the + %% stop_action_after_render option + true + end. + +get_stop_flag(?QUERY(_, _, _, _, #{stop_action_after_render := true})) -> + stop_action_after_render; +get_stop_flag(_) -> + no_stop_action_after_render. + -spec do_flush(data(), #{ is_batch := boolean(), batch := [queue_query()], @@ -630,7 +662,7 @@ do_flush( inflight_tid := InflightTID } = Data0, %% unwrap when not batching (i.e., batch size == 1) - [?QUERY(ReplyTo, _, HasBeenSent, _ExpireAt) = Request] = Batch, + [?QUERY(ReplyTo, _, HasBeenSent, _ExpireAt, _TraceCtx) = Request] = Batch, QueryOpts = #{inflight_tid => InflightTID, simple_query => false}, Result = call_query(async_if_possible, Id, Index, Ref, Request, QueryOpts), Reply = ?REPLY(ReplyTo, HasBeenSent, Result), @@ -824,14 +856,14 @@ batch_reply_caller_defer_metrics(Id, BatchResult, Batch, QueryOpts) -> expand_batch_reply(BatchResults, Batch) when is_list(BatchResults) -> lists:map( - fun({?QUERY(FROM, _REQUEST, SENT, _EXPIRE_AT), Result}) -> + fun({?QUERY(FROM, _REQUEST, SENT, _EXPIRE_AT, _TraceCtx), Result}) -> ?REPLY(FROM, SENT, Result) end, lists:zip(Batch, BatchResults) ); expand_batch_reply(BatchResult, Batch) -> lists:map( - fun(?QUERY(FROM, _REQUEST, SENT, _EXPIRE_AT)) -> + fun(?QUERY(FROM, _REQUEST, SENT, _EXPIRE_AT, _TraceCtx)) -> ?REPLY(FROM, SENT, BatchResult) end, Batch @@ -880,7 +912,7 @@ reply_dropped(_ReplyTo, _Result) -> -spec batch_reply_dropped([queue_query()], {error, late_reply | request_expired}) -> ok. batch_reply_dropped(Batch, Result) -> lists:foreach( - fun(?QUERY(ReplyTo, _CoreReq, _HasBeenSent, _ExpireAt)) -> + fun(?QUERY(ReplyTo, _CoreReq, _HasBeenSent, _ExpireAt, _TraceCtx)) -> reply_dropped(ReplyTo, Result) end, Batch @@ -992,7 +1024,29 @@ handle_query_async_result_pure(Id, {error, Reason} = Error, HasBeenSent) -> handle_query_async_result_pure(_Id, {ok, Pid}, _HasBeenSent) when is_pid(Pid) -> {ack, fun() -> ok end, #{}}; handle_query_async_result_pure(_Id, ok, _HasBeenSent) -> - {ack, fun() -> ok end, #{}}. + {ack, fun() -> ok end, #{}}; +handle_query_async_result_pure(Id, Results, HasBeenSent) when is_list(Results) -> + All = fun(L) -> + case L of + {ok, Pid} -> is_pid(Pid); + _ -> false + end + end, + case lists:all(All, Results) of + true -> + {ack, fun() -> ok end, #{}}; + false -> + PostFn = fun() -> + ?SLOG(error, #{ + id => Id, + msg => "async_batch_send_error", + reason => Results, + has_been_sent => HasBeenSent + }), + ok + end, + {nack, PostFn, #{}} + end. -spec aggregate_counters(data(), counters()) -> data(). aggregate_counters(Data = #{counters := OldCounters}, DeltaCounters) -> @@ -1093,11 +1147,66 @@ call_query(QM, Id, Index, Ref, Query, QueryOpts) -> {ok, _Group, #{status := ?status_connecting, error := unhealthy_target}} -> {error, {unrecoverable_error, unhealthy_target}}; {ok, _Group, Resource} -> - do_call_query(QM, Id, Index, Ref, Query, QueryOpts, Resource); + QueryResult = + try + set_rule_id_trace_meta_data(Query), + do_call_query(QM, Id, Index, Ref, Query, QueryOpts, Resource) + after + unset_rule_id_trace_meta_data() + end, + QueryResult; {error, not_found} -> ?RESOURCE_ERROR(not_found, "resource not found") end.
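+%% Attach rule/trace information from the queries to the logger process
+%% metadata for the duration of the resource call (see the try ... after in
+%% call_query/6 above), so log events emitted by the driver are attributed
+%% to the originating rule(s) and client(s).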
+set_rule_id_trace_meta_data(Requests) when is_list(Requests) -> + %% Get the rule ids from requests + RuleIDs = lists:foldl(fun collect_rule_id/2, #{}, Requests), + ClientIDs = lists:foldl(fun collect_client_id/2, #{}, Requests), + RuleTriggerTimes = lists:foldl(fun collect_rule_trigger_times/2, [], Requests), + StopAfterRenderVal = + case Requests of + %% We know that the batch is not mixed since we prevent this by + %% using a stop_before callback in the replayq:pop call + [?QUERY(_, _, _, _, #{stop_action_after_render := true}) | _] -> + true; + [?QUERY(_, _, _, _, _TraceCTX) | _] -> + false + end, + logger:update_process_metadata(#{ + rule_ids => RuleIDs, + client_ids => ClientIDs, + rule_trigger_times => RuleTriggerTimes, + stop_action_after_render => StopAfterRenderVal + }), + ok; +set_rule_id_trace_meta_data(Request) -> + set_rule_id_trace_meta_data([Request]), + ok. + +collect_rule_id(?QUERY(_, _, _, _, #{rule_id := RuleId}), Acc) -> + Acc#{RuleId => true}; +collect_rule_id(?QUERY(_, _, _, _, _), Acc) -> + Acc. + +collect_client_id(?QUERY(_, _, _, _, #{clientid := ClientId}), Acc) -> + Acc#{ClientId => true}; +collect_client_id(?QUERY(_, _, _, _, _), Acc) -> + Acc. + +collect_rule_trigger_times(?QUERY(_, _, _, _, #{rule_trigger_time := Time}), Acc) -> + [Time | Acc]; +collect_rule_trigger_times(?QUERY(_, _, _, _, _), Acc) -> + Acc. + +unset_rule_id_trace_meta_data() -> + logger:update_process_metadata(#{ + rule_ids => #{}, + client_ids => #{}, + stop_action_after_render => false, + rule_trigger_times => [] + }). + %% action:kafka_producer:myproducer1:connector:kafka_producer:mykafkaclient1 extract_connector_id(Id) when is_binary(Id) -> case binary:split(Id, <<":">>, [global]) of @@ -1208,7 +1317,15 @@ do_call_query(_QM, _Id, _Index, _Ref, _Query, _QueryOpts, _Data) -> ).
apply_query_fun( - sync, Mod, Id, _Index, _Ref, ?QUERY(_, Request, _, _) = _Query, ResSt, Channels, QueryOpts + sync, + Mod, + Id, + _Index, + _Ref, + ?QUERY(_, Request, _, _, _TraceCtx) = _Query, + ResSt, + Channels, + QueryOpts ) -> ?tp(call_query, #{id => Id, mod => Mod, query => _Query, res_st => ResSt, call_mode => sync}), maybe_reply_to( @@ -1227,7 +1344,15 @@ apply_query_fun( QueryOpts ); apply_query_fun( - async, Mod, Id, Index, Ref, ?QUERY(_, Request, _, _) = Query, ResSt, Channels, QueryOpts + async, + Mod, + Id, + Index, + Ref, + ?QUERY(_, Request, _, _, _TraceCtx) = Query, + ResSt, + Channels, + QueryOpts ) -> ?tp(call_query_async, #{ id => Id, mod => Mod, query => Query, res_st => ResSt, call_mode => async @@ -1268,7 +1393,7 @@ apply_query_fun( Id, _Index, _Ref, - [?QUERY(_, FirstRequest, _, _) | _] = Batch, + [?QUERY(_, FirstRequest, _, _, _) | _] = Batch, ResSt, Channels, QueryOpts @@ -1276,7 +1401,9 @@ apply_query_fun( ?tp(call_batch_query, #{ id => Id, mod => Mod, batch => Batch, res_st => ResSt, call_mode => sync }), - Requests = lists:map(fun(?QUERY(_ReplyTo, Request, _, _ExpireAt)) -> Request end, Batch), + Requests = lists:map( + fun(?QUERY(_ReplyTo, Request, _, _ExpireAt, _TraceCtx)) -> Request end, Batch + ), maybe_reply_to( ?APPLY_RESOURCE( call_batch_query, @@ -1298,7 +1425,7 @@ apply_query_fun( Id, Index, Ref, - [?QUERY(_, FirstRequest, _, _) | _] = Batch, + [?QUERY(_, FirstRequest, _, _, _) | _] = Batch, ResSt, Channels, QueryOpts @@ -1321,7 +1448,7 @@ apply_query_fun( min_batch => minimize(Batch) }, Requests = lists:map( - fun(?QUERY(_ReplyTo, Request, _, _ExpireAt)) -> Request end, Batch + fun(?QUERY(_ReplyTo, Request, _, _ExpireAt, _TraceCtx)) -> Request end, Batch ), IsRetriable = false, AsyncWorkerMRef = undefined, @@ -1367,7 +1494,7 @@ handle_async_reply1( inflight_tid := InflightTID, resource_id := Id, buffer_worker := BufferWorkerPid, - min_query := ?QUERY(ReplyTo, _, _, ExpireAt) = _Query + min_query := ?QUERY(ReplyTo, _, _, ExpireAt, _TraceCtx) = _Query } = ReplyContext, Result ) -> @@ -1399,7 +1526,7 @@ do_handle_async_reply( request_ref := Ref, buffer_worker := BufferWorkerPid, inflight_tid := InflightTID, - min_query := ?QUERY(ReplyTo, _, Sent, _ExpireAt) = _Query + min_query := ?QUERY(ReplyTo, _, Sent, _ExpireAt, _TraceCtx) = _Query }, Result ) -> @@ -1486,13 +1613,13 @@ handle_async_batch_reply2([Inflight], ReplyContext, Results0, Now) -> %% So we just take the original flag from the ReplyContext batch %% and put it back to the batch found in inflight table %% which must have already been set to `false` - [?QUERY(_ReplyTo, _, HasBeenSent, _ExpireAt) | _] = Batch, + [?QUERY(_ReplyTo, _, HasBeenSent, _ExpireAt, _TraceCtx) | _] = Batch, {RealNotExpired0, RealExpired, Results} = sieve_expired_requests_with_results(RealBatch, Now, Results0), RealNotExpired = lists:map( - fun(?QUERY(ReplyTo, CoreReq, _HasBeenSent, ExpireAt)) -> - ?QUERY(ReplyTo, CoreReq, HasBeenSent, ExpireAt) + fun(?QUERY(ReplyTo, CoreReq, _HasBeenSent, ExpireAt, TraceCtx)) -> + ?QUERY(ReplyTo, CoreReq, HasBeenSent, ExpireAt, TraceCtx) end, RealNotExpired0 ), @@ -1678,7 +1805,10 @@ inflight_get_first_retriable(InflightTID, Now) -> case ets:select(InflightTID, MatchSpec, _Limit = 1) of '$end_of_table' -> none; - {[{Ref, Query = ?QUERY(_ReplyTo, _CoreReq, _HasBeenSent, ExpireAt)}], _Continuation} -> + { + [{Ref, Query = ?QUERY(_ReplyTo, _CoreReq, _HasBeenSent, ExpireAt, _TraceCtx)}], + _Continuation + } -> case is_expired(ExpireAt, Now) of true -> {expired, Ref, [Query]}; @@ -1714,7 
+1844,7 @@ inflight_append(undefined, _InflightItem) -> ok; inflight_append( InflightTID, - ?INFLIGHT_ITEM(Ref, [?QUERY(_, _, _, _) | _] = Batch0, IsRetriable, AsyncWorkerMRef) + ?INFLIGHT_ITEM(Ref, [?QUERY(_, _, _, _, _) | _] = Batch0, IsRetriable, AsyncWorkerMRef) ) -> Batch = mark_as_sent(Batch0), InflightItem = ?INFLIGHT_ITEM(Ref, Batch, IsRetriable, AsyncWorkerMRef), @@ -1726,7 +1856,10 @@ inflight_append( inflight_append( InflightTID, ?INFLIGHT_ITEM( - Ref, ?QUERY(_ReplyTo, _Req, _HasBeenSent, _ExpireAt) = Query0, IsRetriable, AsyncWorkerMRef + Ref, + ?QUERY(_ReplyTo, _Req, _HasBeenSent, _ExpireAt, _TraceCtx) = Query0, + IsRetriable, + AsyncWorkerMRef ) ) -> Query = mark_as_sent(Query0), @@ -1790,9 +1923,13 @@ ack_inflight(undefined, _Ref, _BufferWorkerPid) -> ack_inflight(InflightTID, Ref, BufferWorkerPid) -> {Count, Removed} = case ets:take(InflightTID, Ref) of - [?INFLIGHT_ITEM(Ref, ?QUERY(_, _, _, _), _IsRetriable, _AsyncWorkerMRef)] -> + [?INFLIGHT_ITEM(Ref, ?QUERY(_, _, _, _, _), _IsRetriable, _AsyncWorkerMRef)] -> {1, true}; - [?INFLIGHT_ITEM(Ref, [?QUERY(_, _, _, _) | _] = Batch, _IsRetriable, _AsyncWorkerMRef)] -> + [ + ?INFLIGHT_ITEM( + Ref, [?QUERY(_, _, _, _, _) | _] = Batch, _IsRetriable, _AsyncWorkerMRef + ) + ] -> {length(Batch), true}; [] -> {0, false} @@ -1942,9 +2079,9 @@ do_collect_requests(Acc, Count, Limit) -> mark_as_sent(Batch) when is_list(Batch) -> lists:map(fun mark_as_sent/1, Batch); -mark_as_sent(?QUERY(ReplyTo, Req, _HasBeenSent, ExpireAt)) -> +mark_as_sent(?QUERY(ReplyTo, Req, _HasBeenSent, ExpireAt, TraceCtx)) -> HasBeenSent = true, - ?QUERY(ReplyTo, Req, HasBeenSent, ExpireAt). + ?QUERY(ReplyTo, Req, HasBeenSent, ExpireAt, TraceCtx). is_unrecoverable_error({error, {unrecoverable_error, _}}) -> true; @@ -1967,7 +2104,7 @@ is_async_return(_) -> sieve_expired_requests(Batch, Now) -> lists:partition( - fun(?QUERY(_ReplyTo, _CoreReq, _HasBeenSent, ExpireAt)) -> + fun(?QUERY(_ReplyTo, _CoreReq, _HasBeenSent, ExpireAt, _TraceCtx)) -> not is_expired(ExpireAt, Now) end, Batch @@ -1978,7 +2115,7 @@ sieve_expired_requests_with_results(Batch, Now, Results) when is_list(Results) - {RevNotExpiredBatch, RevNotExpiredResults, ExpiredBatch} = lists:foldl( fun( - {?QUERY(_ReplyTo, _CoreReq, _HasBeenSent, ExpireAt) = Query, Result}, + {?QUERY(_ReplyTo, _CoreReq, _HasBeenSent, ExpireAt, _TraceCtx) = Query, Result}, {NotExpAcc, ResAcc, ExpAcc} ) -> case not is_expired(ExpireAt, Now) of @@ -2026,15 +2163,16 @@ ensure_expire_at(#{timeout := TimeoutMS} = Opts) -> Opts#{expire_at => ExpireAt}. %% no need to keep the request for async reply handler -minimize(?QUERY(_, _, _, _) = Q) -> +minimize(?QUERY(_, _, _, _, _) = Q) -> do_minimize(Q); minimize(L) when is_list(L) -> lists:map(fun do_minimize/1, L). -ifdef(TEST). -do_minimize(?QUERY(_ReplyTo, _Req, _Sent, _ExpireAt) = Query) -> Query. +do_minimize(?QUERY(_ReplyTo, _Req, _Sent, _ExpireAt, _TraceCtx) = Query) -> Query. -else. -do_minimize(?QUERY(ReplyTo, _Req, Sent, ExpireAt)) -> ?QUERY(ReplyTo, [], Sent, ExpireAt). +do_minimize(?QUERY(ReplyTo, _Req, Sent, ExpireAt, TraceCtx)) -> + ?QUERY(ReplyTo, [], Sent, ExpireAt, TraceCtx). -endif. 
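+%% Minimization drops the request payload before the query is handed to the
+%% async reply handler, while the reply-to, sent flag, expiry and trace
+%% context fields are kept intact.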
%% To avoid message loss due to misconfigurations, we adjust diff --git a/apps/emqx_resource/src/emqx_resource_manager.erl b/apps/emqx_resource/src/emqx_resource_manager.erl index 60bc3c7e9..6d9ad50e4 100644 --- a/apps/emqx_resource/src/emqx_resource_manager.erl +++ b/apps/emqx_resource/src/emqx_resource_manager.erl @@ -60,6 +60,9 @@ % Behaviour -export([init/1, callback_mode/0, handle_event/4, terminate/3]). +%% Internal exports. +-export([worker_resource_health_check/1, worker_channel_health_check/2]). + % State record -record(data, { id, @@ -73,7 +76,27 @@ state, error, pid, - added_channels, + added_channels = #{}, + %% Reference to process performing resource health check. + hc_workers = #{ + resource => #{}, + channel => #{ + pending => [], + previous_status => #{} + } + } :: #{ + resource := #{{pid(), reference()} => true}, + channel := #{ + {pid(), reference()} => channel_id(), + pending := [channel_id()], + previous_status := #{channel_id() => channel_status_map()} + } + }, + %% Callers waiting on health check + hc_pending_callers = #{resource => [], channel => #{}} :: #{ + resource := [gen_server:from()], + channel := #{channel_id() => [gen_server:from()]} + }, extra }). -type data() :: #data{}. @@ -96,6 +119,12 @@ -define(state_disconnected, disconnected). -define(state_stopped, stopped). +-type state() :: + ?state_stopped + | ?state_disconnected + | ?state_connecting + | ?state_connected. + -define(IS_STATUS(ST), ST =:= ?status_connecting; ST =:= ?status_connected; ST =:= ?status_disconnected ). @@ -153,13 +182,13 @@ create(ResId, Group, ResourceType, Config, Opts) -> case SpawnBufferWorkers andalso lists:member(QueryMode, [sync, async]) of true -> %% start resource workers as the query type requires them - ok = emqx_resource_buffer_worker_sup:start_workers(ResId, Opts), - case maps:get(start_after_created, Opts, ?START_AFTER_CREATED) of - true -> - wait_for_ready(ResId, maps:get(start_timeout, Opts, ?START_TIMEOUT)); - false -> - ok - end; + ok = emqx_resource_buffer_worker_sup:start_workers(ResId, Opts); + false -> + ok + end, + case maps:get(start_after_created, Opts, ?START_AFTER_CREATED) of + true -> + wait_for_ready(ResId, maps:get(start_timeout, Opts, ?START_TIMEOUT)); false -> ok end. @@ -328,6 +357,7 @@ add_channel(ResId, ChannelId, Config) -> Result = safe_call(ResId, {add_channel, ChannelId, Config}, ?T_OPERATION), %% Wait for health_check to finish _ = health_check(ResId), + _ = channel_health_check(ResId, ChannelId), Result. remove_channel(ResId, ChannelId) -> @@ -455,7 +485,7 @@ handle_event({call, From}, {remove, ClearMetrics}, _State, Data) -> handle_event({call, From}, lookup, _State, #data{group = Group} = Data) -> Reply = {ok, Group, data_record_to_external_map(Data)}, {keep_state_and_data, [{reply, From, Reply}]}; -% Called when doing a manually health check. +% Called when doing a manual health check. 
handle_event({call, From}, health_check, ?state_stopped, _Data) -> Actions = [{reply, From, {error, resource_is_stopped}}], {keep_state_and_data, Actions}; @@ -463,9 +493,9 @@ handle_event({call, From}, {channel_health_check, _}, ?state_stopped, _Data) -> Actions = [{reply, From, {error, resource_is_stopped}}], {keep_state_and_data, Actions}; handle_event({call, From}, health_check, _State, Data) -> - handle_manually_health_check(From, Data); + handle_manual_resource_health_check(From, Data); handle_event({call, From}, {channel_health_check, ChannelId}, _State, Data) -> - handle_manually_channel_health_check(From, Data, ChannelId); + handle_manual_channel_health_check(From, Data, ChannelId); % State: CONNECTING handle_event(enter, _OldState, ?state_connecting = State, Data) -> ok = log_status_consistency(State, Data), @@ -473,7 +503,7 @@ handle_event(enter, _OldState, ?state_connecting = State, Data) -> handle_event(internal, start_resource, ?state_connecting, Data) -> start_resource(Data, undefined); handle_event(state_timeout, health_check, ?state_connecting, Data) -> - handle_connecting_health_check(Data); + start_resource_health_check(Data); handle_event( {call, From}, {remove_channel, ChannelId}, ?state_connecting = _State, Data ) -> @@ -487,7 +517,7 @@ handle_event(enter, _OldState, ?state_connected = State, Data) -> ?tp(resource_connected_enter, #{}), {keep_state_and_data, health_check_actions(Data)}; handle_event(state_timeout, health_check, ?state_connected, Data) -> - handle_connected_health_check(Data); + start_resource_health_check(Data); handle_event( {call, From}, {add_channel, ChannelId, Config}, ?state_connected = _State, Data ) -> @@ -523,6 +553,24 @@ handle_event( ) -> Channels = emqx_resource:call_get_channels(Data#data.id, Data#data.mod), {keep_state_and_data, {reply, From, {ok, Channels}}}; +handle_event( + info, + {'DOWN', Ref, process, Pid, Res}, + State0, + Data0 = #data{hc_workers = #{resource := RHCWorkers}} +) when + is_map_key({Pid, Ref}, RHCWorkers) +-> + handle_resource_health_check_worker_down(State0, Data0, {Pid, Ref}, Res); +handle_event( + info, + {'DOWN', Ref, process, Pid, Res}, + _State, + Data0 = #data{hc_workers = #{channel := CHCWorkers}} +) when + is_map_key({Pid, Ref}, CHCWorkers) +-> + handle_channel_health_check_worker_down(Data0, {Pid, Ref}, Res); % Ignore all other events handle_event(EventType, EventData, State, Data) -> ?SLOG( @@ -538,7 +586,7 @@ handle_event(EventType, EventData, State, Data) -> keep_state_and_data. log_status_consistency(Status, #data{status = Status} = Data) -> - log_cache_consistency(read_cache(Data#data.id), Data); + log_cache_consistency(read_cache(Data#data.id), remove_runtime_data(Data)); log_status_consistency(Status, Data) -> ?tp(warning, "inconsistent_status", #{ status => Status, @@ -835,86 +883,166 @@ handle_not_connected_and_not_connecting_remove_channel(From, ChannelId, Data) -> _ = maybe_clear_alarm(ChannelId), {keep_state, update_state(NewData, Data), [{reply, From, ok}]}. -handle_manually_health_check(From, Data) -> - with_health_check( - Data, - fun(Status, UpdatedData) -> - Actions = [{reply, From, {ok, Status}}], - {next_state, Status, channels_health_check(Status, UpdatedData), Actions} - end - ). 
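+%% Health checks now run in spawned, monitored worker processes instead of
+%% blocking the resource manager. Callers that request a check while one is
+%% already in flight are parked in `hc_pending_callers' and all receive the
+%% result when the worker's 'DOWN' message is handled.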
+handle_manual_resource_health_check(From, Data0 = #data{hc_workers = #{resource := HCWorkers}}) when + map_size(HCWorkers) > 0 +-> + %% ongoing health check + #data{hc_pending_callers = Pending0 = #{resource := RPending0}} = Data0, + Pending = Pending0#{resource := [From | RPending0]}, + Data = Data0#data{hc_pending_callers = Pending}, + {keep_state, Data}; +handle_manual_resource_health_check(From, Data0) -> + #data{hc_pending_callers = Pending0 = #{resource := RPending0}} = Data0, + Pending = Pending0#{resource := [From | RPending0]}, + Data = Data0#data{hc_pending_callers = Pending}, + start_resource_health_check(Data). -handle_manually_channel_health_check(From, #data{state = undefined}, _ChannelId) -> +reply_pending_resource_health_check_callers(Status, Data0 = #data{hc_pending_callers = Pending0}) -> + #{resource := RPending} = Pending0, + Actions = [{reply, From, {ok, Status}} || From <- RPending], + Data = Data0#data{hc_pending_callers = Pending0#{resource := []}}, + {Actions, Data}. + +start_resource_health_check(#data{state = undefined} = Data) -> + %% No resource running, thus disconnected. + %% A health check spawned while the state is undefined can only happen when someone + %% manually asks for a health check and the resource could not initialize or has not + %% had enough time to do so. Let's continue as if we were in `?state_connecting'. + continue_resource_health_check_not_connected(?status_disconnected, Data); +start_resource_health_check(#data{hc_workers = #{resource := HCWorkers}}) when + map_size(HCWorkers) > 0 +-> + %% Already ongoing + keep_state_and_data; +start_resource_health_check(#data{} = Data0) -> + #data{hc_workers = HCWorkers0 = #{resource := RHCWorkers0}} = Data0, + WorkerRef = {_Pid, _Ref} = spawn_resource_health_check_worker(Data0), + HCWorkers = HCWorkers0#{resource := RHCWorkers0#{WorkerRef => true}}, + Data = Data0#data{hc_workers = HCWorkers}, + {keep_state, Data}. + +-spec spawn_resource_health_check_worker(data()) -> {pid(), reference()}. +spawn_resource_health_check_worker(#data{} = Data) -> + spawn_monitor(?MODULE, worker_resource_health_check, [Data]). + +%% separated so it can be spec'ed and placate dialyzer tantrums... +-spec worker_resource_health_check(data()) -> no_return(). +worker_resource_health_check(Data) -> + HCRes = emqx_resource:call_health_check(Data#data.id, Data#data.mod, Data#data.state), + exit({ok, HCRes}). + +handle_resource_health_check_worker_down(CurrentState, Data0, WorkerRef, ExitResult) -> + #data{hc_workers = HCWorkers0 = #{resource := RHCWorkers0}} = Data0, + HCWorkers = HCWorkers0#{resource := maps:remove(WorkerRef, RHCWorkers0)}, + Data1 = Data0#data{hc_workers = HCWorkers}, + case ExitResult of + {ok, HCRes} -> + continue_with_health_check(Data1, CurrentState, HCRes); + _ -> + %% Unexpected: `emqx_resource:call_health_check' catches all exceptions. + continue_with_health_check(Data1, CurrentState, {error, ExitResult}) + end.
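+%% Common continuation once a resource health check worker has exited: parse
+%% the result, raise or clear alarms, update the cached state, and then
+%% branch on whether the manager was `?state_connected' or in one of the
+%% not-connected states.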
+ +continue_with_health_check(#data{} = Data0, CurrentState, HCRes) -> + #data{ + id = ResId, + error = PrevError + } = Data0, + {NewStatus, NewState, Err} = parse_health_check_result(HCRes, Data0), + _ = maybe_alarm(NewStatus, ResId, Err, PrevError), + ok = maybe_resume_resource_workers(ResId, NewStatus), + Data1 = Data0#data{ + state = NewState, status = NewStatus, error = Err + }, + Data = update_state(Data1, Data0), + case CurrentState of + ?state_connected -> + continue_resource_health_check_connected(NewStatus, Data); + _ -> + %% `?state_connecting' | `?state_disconnected' | `?state_stopped' + continue_resource_health_check_not_connected(NewStatus, Data) + end. + +%% Continuation to be used when the current resource state is `?state_connected'. +continue_resource_health_check_connected(NewStatus, Data0) -> + case NewStatus of + ?status_connected -> + {Replies, Data1} = reply_pending_resource_health_check_callers(NewStatus, Data0), + Data2 = channels_health_check(?status_connected, Data1), + Data = update_state(Data2, Data0), + Actions = Replies ++ health_check_actions(Data), + {keep_state, Data, Actions}; + _ -> + ?SLOG(warning, #{ + msg => "health_check_failed", + id => Data0#data.id, + status => NewStatus + }), + %% Note: works because, coincidentally, channel/resource status is a + %% subset of resource manager state... But there should be a conversion + %% between the two here, as resource manager also has `stopped', which is + %% not a valid status at the time of writing. + {Replies, Data} = reply_pending_resource_health_check_callers(NewStatus, Data0), + {next_state, NewStatus, channels_health_check(NewStatus, Data), Replies} + end. + +%% Continuation to be used when the current resource state is not `?state_connected'. +continue_resource_health_check_not_connected(NewStatus, Data0) -> + {Replies, Data} = reply_pending_resource_health_check_callers(NewStatus, Data0), + case NewStatus of + ?status_connected -> + {next_state, ?state_connected, channels_health_check(?status_connected, Data), Replies}; + ?status_connecting -> + Actions = Replies ++ health_check_actions(Data), + {next_state, ?status_connecting, channels_health_check(?status_connecting, Data), + Actions}; + ?status_disconnected -> + {next_state, ?state_disconnected, channels_health_check(?status_disconnected, Data), + Replies} + end. + +handle_manual_channel_health_check(From, #data{state = undefined}, _ChannelId) -> {keep_state_and_data, [{reply, From, channel_status({error, resource_disconnected})}]}; -handle_manually_channel_health_check( +handle_manual_channel_health_check( + From, + #data{ + added_channels = Channels, + hc_pending_callers = #{channel := CPending0} = Pending0, + hc_workers = #{channel := #{previous_status := PreviousStatus}} + } = Data0, + ChannelId +) when + is_map_key(ChannelId, Channels), + is_map_key(ChannelId, PreviousStatus) +-> + %% Ongoing health check. + CPending = maps:update_with( + ChannelId, + fun(OtherCallers) -> + [From | OtherCallers] + end, + [From], + CPending0 + ), + Pending = Pending0#{channel := CPending}, + Data = Data0#data{hc_pending_callers = Pending}, + {keep_state, Data}; +handle_manual_channel_health_check( From, #data{added_channels = Channels} = _Data, ChannelId ) when is_map_key(ChannelId, Channels) -> + %% No ongoing health check: reply with current status. 
{keep_state_and_data, [{reply, From, maps:get(ChannelId, Channels)}]}; -handle_manually_channel_health_check( +handle_manual_channel_health_check( From, _Data, _ChannelId ) -> {keep_state_and_data, [{reply, From, channel_status({error, channel_not_found})}]}. -get_channel_status_channel_added(#data{id = ResId, mod = Mod, state = State}, ChannelId) -> - RawStatus = emqx_resource:call_channel_health_check(ResId, ChannelId, Mod, State), - channel_status(RawStatus). - -handle_connecting_health_check(Data) -> - with_health_check( - Data, - fun - (?status_connected, UpdatedData) -> - {next_state, ?state_connected, - channels_health_check(?status_connected, UpdatedData)}; - (?status_connecting, UpdatedData) -> - {keep_state, channels_health_check(?status_connecting, UpdatedData), - health_check_actions(UpdatedData)}; - (?status_disconnected, UpdatedData) -> - {next_state, ?state_disconnected, - channels_health_check(?status_disconnected, UpdatedData)} - end - ). - -handle_connected_health_check(Data) -> - with_health_check( - Data, - fun - (?status_connected, UpdatedData0) -> - UpdatedData1 = channels_health_check(?status_connected, UpdatedData0), - {keep_state, UpdatedData1, health_check_actions(UpdatedData1)}; - (Status, UpdatedData) -> - ?SLOG(warning, #{ - msg => "health_check_failed", - id => Data#data.id, - status => Status - }), - %% Note: works because, coincidentally, channel/resource status is a - %% subset of resource manager state... But there should be a conversion - %% between the two here, as resource manager also has `stopped', which is - %% not a valid status at the time of writing. - {next_state, Status, channels_health_check(Status, UpdatedData)} - end - ). - -with_health_check(#data{state = undefined} = Data, Func) -> - Func(disconnected, Data); -with_health_check(#data{error = PrevError} = Data, Func) -> - ResId = Data#data.id, - HCRes = emqx_resource:call_health_check(Data#data.id, Data#data.mod, Data#data.state), - {Status, NewState, Err} = parse_health_check_result(HCRes, Data), - _ = maybe_alarm(Status, ResId, Err, PrevError), - ok = maybe_resume_resource_workers(ResId, Status), - UpdatedData = Data#data{ - state = NewState, status = Status, error = Err - }, - Func(Status, update_state(UpdatedData, Data)). - -spec channels_health_check(resource_status(), data()) -> data(). channels_health_check(?status_connected = _ConnectorStatus, Data0) -> Channels = maps:to_list(Data0#data.added_channels), @@ -930,7 +1058,7 @@ channels_health_check(?status_connected = _ConnectorStatus, Data0) -> get_config_for_channels(Data0, ChannelsNotAdded), Data1 = add_channels_in_list(ChannelsNotAddedWithConfigs, Data0), %% Now that we have done the adding, we can get the status of all channels - Data2 = channel_status_for_all_channels(Data1), + Data2 = trigger_health_check_for_added_channels(Data1), update_state(Data2, Data0); channels_health_check(?status_connecting = _ConnectorStatus, Data0) -> %% Whenever the resource is connecting: @@ -1026,41 +1154,117 @@ resource_not_connected_channel_error_msg(ResourceStatus, ChannelId, Data1) -> ) ). -channel_status_for_all_channels(Data) -> - Channels = maps:to_list(Data#data.added_channels), - AddedChannelsWithOldAndNewStatus = [ - {ChannelId, OldStatus, get_channel_status_channel_added(Data, ChannelId)} - || {ChannelId, OldStatus} <- Channels, +%% Currently, we only call resource channel health checks when the underlying resource is +%% `?status_connected'. +-spec trigger_health_check_for_added_channels(data()) -> data(). 
+trigger_health_check_for_added_channels(Data0 = #data{hc_workers = HCWorkers0}) -> + #{channel := CHCWorkers0} = HCWorkers0, + PreviousStatus = maps:from_list([ + {ChannelId, OldStatus} + || {ChannelId, OldStatus} <- maps:to_list(Data0#data.added_channels), channel_status_is_channel_added(OldStatus) - ], + ]), + ChannelsToCheck = maps:keys(PreviousStatus), + case ChannelsToCheck of + [] -> + %% Nothing to do. + Data0; + [ChannelId | Rest] -> + %% Shooting one check at a time. We could increase concurrency in the future. + CHCWorkers = CHCWorkers0#{pending := Rest, previous_status := PreviousStatus}, + Data1 = Data0#data{hc_workers = HCWorkers0#{channel := CHCWorkers}}, + start_channel_health_check(Data1, ChannelId) + end. + +-spec continue_channel_health_check_connected(data()) -> data(). +continue_channel_health_check_connected(Data0) -> + #data{hc_workers = HCWorkers0} = Data0, + #{channel := #{previous_status := PreviousStatus} = CHCWorkers0} = HCWorkers0, + CHCWorkers = CHCWorkers0#{previous_status := #{}}, + Data1 = Data0#data{hc_workers = HCWorkers0#{channel := CHCWorkers}}, %% Remove the added channels with a status different from connected or connecting + CheckedChannels = [ + {ChannelId, NewStatus} + || {ChannelId, NewStatus} <- maps:to_list(Data0#data.added_channels), + is_map_key(ChannelId, PreviousStatus) + ], ChannelsToRemove = [ ChannelId - || {ChannelId, _, NewStatus} <- AddedChannelsWithOldAndNewStatus, + || {ChannelId, NewStatus} <- CheckedChannels, not channel_status_is_channel_added(NewStatus) ], - Data1 = remove_channels_in_list(ChannelsToRemove, Data, true), + Data = remove_channels_in_list(ChannelsToRemove, Data1, true), %% Raise/clear alarms lists:foreach( fun - ({ID, _OldStatus, #{status := ?status_connected}}) -> + ({ID, #{status := ?status_connected}}) -> _ = maybe_clear_alarm(ID); - ({ID, OldStatus, NewStatus}) -> + ({ID, NewStatus}) -> + OldStatus = maps:get(ID, PreviousStatus), _ = maybe_alarm(NewStatus, ID, NewStatus, OldStatus) end, - AddedChannelsWithOldAndNewStatus + CheckedChannels ), - %% Update the ChannelsMap - ChannelsMap = Data1#data.added_channels, - NewChannelsMap = - lists:foldl( - fun({ChannelId, _, NewStatus}, Acc) -> - maps:put(ChannelId, NewStatus, Acc) - end, - ChannelsMap, - AddedChannelsWithOldAndNewStatus - ), - Data1#data{added_channels = NewChannelsMap}. + Data. + +-spec start_channel_health_check(data(), channel_id()) -> data(). +start_channel_health_check(#data{} = Data0, ChannelId) -> + #data{hc_workers = HCWorkers0 = #{channel := CHCWorkers0}} = Data0, + WorkerRef = {_Pid, _Ref} = spawn_channel_health_check_worker(Data0, ChannelId), + HCWorkers = HCWorkers0#{channel := CHCWorkers0#{WorkerRef => ChannelId}}, + Data0#data{hc_workers = HCWorkers}. + +-spec spawn_channel_health_check_worker(data(), channel_id()) -> {pid(), reference()}. +spawn_channel_health_check_worker(#data{} = Data, ChannelId) -> + spawn_monitor(?MODULE, worker_channel_health_check, [Data, ChannelId]). + +%% separated so it can be spec'ed and placate dialyzer tantrums... +-spec worker_channel_health_check(data(), channel_id()) -> no_return(). +worker_channel_health_check(Data, ChannelId) -> + #data{id = ResId, mod = Mod, state = State} = Data, + RawStatus = emqx_resource:call_channel_health_check(ResId, ChannelId, Mod, State), + exit({ok, channel_status(RawStatus)}). + +-spec handle_channel_health_check_worker_down( + data(), {pid(), reference()}, {ok, channel_status_map()} +) -> + gen_statem:event_handler_result(state(), data()).
+handle_channel_health_check_worker_down(Data0, WorkerRef, ExitResult) -> + #data{ + hc_workers = HCWorkers0 = #{channel := CHCWorkers0}, + added_channels = AddedChannels0 + } = Data0, + {ChannelId, CHCWorkers1} = maps:take(WorkerRef, CHCWorkers0), + case ExitResult of + {ok, NewStatus} -> + %% `emqx_resource:call_channel_health_check' catches all exceptions. + AddedChannels = maps:put(ChannelId, NewStatus, AddedChannels0) + end, + Data1 = Data0#data{added_channels = AddedChannels}, + {Replies, Data2} = reply_pending_channel_health_check_callers(ChannelId, NewStatus, Data1), + case CHCWorkers1 of + #{pending := [NextChannelId | Rest]} -> + CHCWorkers = CHCWorkers1#{pending := Rest}, + HCWorkers = HCWorkers0#{channel := CHCWorkers}, + Data3 = Data2#data{hc_workers = HCWorkers}, + Data = start_channel_health_check(Data3, NextChannelId), + {keep_state, update_state(Data, Data0), Replies}; + #{pending := []} -> + HCWorkers = HCWorkers0#{channel := CHCWorkers1}, + Data3 = Data2#data{hc_workers = HCWorkers}, + Data = continue_channel_health_check_connected(Data3), + {keep_state, update_state(Data, Data0), Replies} + end. + +reply_pending_channel_health_check_callers( + ChannelId, Status, Data0 = #data{hc_pending_callers = Pending0} +) -> + #{channel := CPending0} = Pending0, + Pending = maps:get(ChannelId, CPending0, []), + Actions = [{reply, From, Status} || From <- Pending], + CPending = maps:remove(ChannelId, CPending0), + Data = Data0#data{hc_pending_callers = Pending0#{channel := CPending}}, + {Actions, Data}. get_config_for_channels(Data0, ChannelsWithoutConfig) -> ResId = Data0#data.id, @@ -1097,9 +1301,21 @@ update_state(Data) -> update_state(DataWas, DataWas) -> DataWas; update_state(Data, _DataWas) -> - _ = insert_cache(Data#data.id, Data), + _ = insert_cache(Data#data.id, remove_runtime_data(Data)), Data. +remove_runtime_data(#data{} = Data0) -> + Data0#data{ + hc_workers = #{ + resource => #{}, + channel => #{pending => [], previous_status => #{}} + }, + hc_pending_callers = #{ + resource => [], + channel => #{} + } + }. + health_check_interval(Opts) -> maps:get(health_check_interval, Opts, ?HEALTHCHECK_INTERVAL). diff --git a/apps/emqx_resource/test/emqx_connector_demo.erl b/apps/emqx_resource/test/emqx_connector_demo.erl index 93f6b661b..754727e8c 100644 --- a/apps/emqx_resource/test/emqx_connector_demo.erl +++ b/apps/emqx_resource/test/emqx_connector_demo.erl @@ -18,6 +18,7 @@ -include_lib("typerefl/include/types.hrl"). -include_lib("snabbkaffe/include/snabbkaffe.hrl"). +-include_lib("emqx_resource/include/emqx_resource.hrl"). -behaviour(emqx_resource). @@ -30,7 +31,12 @@ on_query_async/4, on_batch_query/3, on_batch_query_async/4, - on_get_status/2 + on_get_status/2, + + on_add_channel/4, + on_remove_channel/3, + on_get_channels/1, + on_get_channel_status/3 ]). -export([counter_loop/0, set_callback_mode/1]). @@ -39,6 +45,7 @@ -export([roots/0]). -define(CM_KEY, {?MODULE, callback_mode}). +-define(PT_CHAN_KEY(CONN_RES_ID), {?MODULE, chans, CONN_RES_ID}). roots() -> [ @@ -70,12 +77,14 @@ on_start(InstId, #{name := Name} = Opts) -> {ok, Opts#{ id => InstId, stop_error => StopError, + channels => #{}, pid => spawn_counter_process(Name, Register) }}. on_stop(_InstId, #{stop_error := true}) -> {error, stop_error}; -on_stop(_InstId, #{pid := Pid}) -> +on_stop(InstId, #{pid := Pid}) -> + persistent_term:erase(?PT_CHAN_KEY(InstId)), stop_counter_process(Pid). 
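+%% The demo connector tracks its channels in a persistent_term entry keyed by
+%% the connector resource id, so on_get_channels/1 can answer without the
+%% counter process; the entry is erased in on_stop/2 above.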
on_query(_InstId, get_state, State) -> @@ -276,15 +285,47 @@ batch_individual_reply({async, ReplyFunAndArgs}, InstId, Batch, State) -> on_get_status(_InstId, #{health_check_error := true}) -> ?tp(connector_demo_health_check_error, #{}), - disconnected; + ?status_disconnected; on_get_status(_InstId, State = #{health_check_error := {msg, Message}}) -> ?tp(connector_demo_health_check_error, #{}), - {disconnected, State, Message}; + {?status_disconnected, State, Message}; +on_get_status(_InstId, #{pid := Pid, health_check_error := {delay, Delay}}) -> + ?tp(connector_demo_health_check_delay, #{}), + timer:sleep(Delay), + case is_process_alive(Pid) of + true -> ?status_connected; + false -> ?status_disconnected + end; on_get_status(_InstId, #{pid := Pid}) -> timer:sleep(300), case is_process_alive(Pid) of - true -> connected; - false -> disconnected + true -> ?status_connected; + false -> ?status_disconnected + end. + +on_add_channel(ConnResId, ConnSt0, ChanId, ChanCfg) -> + ConnSt = emqx_utils_maps:deep_put([channels, ChanId], ConnSt0, ChanCfg), + do_add_channel(ConnResId, ChanId, ChanCfg), + {ok, ConnSt}. + +on_remove_channel(ConnResId, ConnSt0, ChanId) -> + ConnSt = emqx_utils_maps:deep_remove([channels, ChanId], ConnSt0), + do_remove_channel(ConnResId, ChanId), + {ok, ConnSt}. + +on_get_channels(ConnResId) -> + persistent_term:get(?PT_CHAN_KEY(ConnResId), []). + +on_get_channel_status(_ConnResId, ChanId, #{channels := Chans}) -> + case Chans of + #{ChanId := #{health_check_delay := Delay}} -> + ?tp(connector_demo_channel_health_check_delay, #{}), + timer:sleep(Delay), + ?status_connected; + #{ChanId := _ChanCfg} -> + ?status_connected; + #{} -> + ?status_disconnected end. spawn_counter_process(Name, Register) -> @@ -447,3 +488,11 @@ make_random_reply(N) -> 3 -> {error, {unrecoverable_error, N}} end. + +do_add_channel(ConnResId, ChanId, ChanCfg) -> + Chans = persistent_term:get(?PT_CHAN_KEY(ConnResId), []), + persistent_term:put(?PT_CHAN_KEY(ConnResId), [{ChanId, ChanCfg} | Chans]). + +do_remove_channel(ConnResId, ChanId) -> + Chans = persistent_term:get(?PT_CHAN_KEY(ConnResId), []), + persistent_term:put(?PT_CHAN_KEY(ConnResId), proplists:delete(ChanId, Chans)). diff --git a/apps/emqx_resource/test/emqx_resource_SUITE.erl b/apps/emqx_resource/test/emqx_resource_SUITE.erl index fa9f7e7c9..05a2f711d 100644 --- a/apps/emqx_resource/test/emqx_resource_SUITE.erl +++ b/apps/emqx_resource/test/emqx_resource_SUITE.erl @@ -52,12 +52,20 @@ end_per_testcase(_, _Config) -> init_per_suite(Config) -> code:ensure_loaded(?TEST_RESOURCE), - ok = emqx_common_test_helpers:start_apps([emqx_conf]), - {ok, _} = application:ensure_all_started(emqx_resource), - Config. + Apps = emqx_cth_suite:start( + [ + emqx, + emqx_conf, + emqx_resource + ], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), + [{apps, Apps} | Config]. -end_per_suite(_Config) -> - ok = emqx_common_test_helpers:stop_apps([emqx_resource, emqx_conf]). +end_per_suite(Config) -> + Apps = proplists:get_value(apps, Config), + emqx_cth_suite:stop(Apps), + ok. %%------------------------------------------------------------------------------ %% Tests @@ -115,10 +123,7 @@ t_create_remove(_) -> ?assertNot(is_process_alive(Pid)) end, - fun(Trace) -> - ?assertEqual([], ?of_kind("inconsistent_status", Trace)), - ?assertEqual([], ?of_kind("inconsistent_cache", Trace)) - end + [log_consistency_prop()] ). 
t_create_remove_local(_) -> @@ -174,10 +179,7 @@ t_create_remove_local(_) -> ?assertNot(is_process_alive(Pid)) end, - fun(Trace) -> - ?assertEqual([], ?of_kind("inconsistent_status", Trace)), - ?assertEqual([], ?of_kind("inconsistent_cache", Trace)) - end + [log_consistency_prop()] ). t_do_not_start_after_created(_) -> @@ -219,10 +221,7 @@ t_do_not_start_after_created(_) -> ?assertNot(is_process_alive(Pid2)) end, - fun(Trace) -> - ?assertEqual([], ?of_kind("inconsistent_status", Trace)), - ?assertEqual([], ?of_kind("inconsistent_cache", Trace)) - end + [log_consistency_prop()] ). t_query(_) -> @@ -278,10 +277,9 @@ t_batch_query_counter(_) -> fun(Result, Trace) -> ?assertMatch({ok, 0}, Result), QueryTrace = ?of_kind(call_batch_query, Trace), - ?assertMatch([#{batch := [{query, _, get_counter, _, _}]}], QueryTrace) + ?assertMatch([#{batch := [{query, _, get_counter, _, _, _}]}], QueryTrace) end ), - NMsgs = 1_000, ?check_trace( ?TRACE_OPTS, @@ -341,7 +339,7 @@ t_query_counter_async_query(_) -> fun(Trace) -> %% the callback_mode of 'emqx_connector_demo' is 'always_sync'. QueryTrace = ?of_kind(call_query, Trace), - ?assertMatch([#{query := {query, _, {inc_counter, 1}, _, _}} | _], QueryTrace) + ?assertMatch([#{query := {query, _, {inc_counter, 1}, _, _, _}} | _], QueryTrace) end ), %% simple query ignores the query_mode and batching settings in the resource_worker @@ -352,7 +350,7 @@ t_query_counter_async_query(_) -> ?assertMatch({ok, 1000}, Result), %% the callback_mode if 'emqx_connector_demo' is 'always_sync'. QueryTrace = ?of_kind(call_query, Trace), - ?assertMatch([#{query := {query, _, get_counter, _, _}}], QueryTrace) + ?assertMatch([#{query := {query, _, get_counter, _, _, _}}], QueryTrace) end ), #{counters := C} = emqx_resource:get_metrics(?ID), @@ -398,7 +396,7 @@ t_query_counter_async_callback(_) -> end, fun(Trace) -> QueryTrace = ?of_kind(call_query_async, Trace), - ?assertMatch([#{query := {query, _, {inc_counter, 1}, _, _}} | _], QueryTrace) + ?assertMatch([#{query := {query, _, {inc_counter, 1}, _, _, _}} | _], QueryTrace) end ), @@ -409,7 +407,7 @@ t_query_counter_async_callback(_) -> fun(Result, Trace) -> ?assertMatch({ok, 1000}, Result), QueryTrace = ?of_kind(call_query, Trace), - ?assertMatch([#{query := {query, _, get_counter, _, _}}], QueryTrace) + ?assertMatch([#{query := {query, _, get_counter, _, _, _}}], QueryTrace) end ), #{counters := C} = emqx_resource:get_metrics(?ID), @@ -481,7 +479,7 @@ t_query_counter_async_inflight(_) -> ), fun(Trace) -> QueryTrace = ?of_kind(call_query_async, Trace), - ?assertMatch([#{query := {query, _, {inc_counter, 1}, _, _}} | _], QueryTrace) + ?assertMatch([#{query := {query, _, {inc_counter, 1}, _, _, _}} | _], QueryTrace) end ), tap_metrics(?LINE), @@ -538,7 +536,7 @@ t_query_counter_async_inflight(_) -> end, fun(Trace) -> QueryTrace = ?of_kind(call_query_async, Trace), - ?assertMatch([#{query := {query, _, {inc_counter, _}, _, _}} | _], QueryTrace), + ?assertMatch([#{query := {query, _, {inc_counter, _}, _, _, _}} | _], QueryTrace), ?assertEqual(WindowSize + Num + 1, ets:info(Tab0, size), #{tab => ets:tab2list(Tab0)}), tap_metrics(?LINE), ok @@ -558,7 +556,7 @@ t_query_counter_async_inflight(_) -> ), fun(Trace) -> QueryTrace = ?of_kind(call_query_async, Trace), - ?assertMatch([#{query := {query, _, {inc_counter, 1}, _, _}} | _], QueryTrace) + ?assertMatch([#{query := {query, _, {inc_counter, 1}, _, _, _}} | _], QueryTrace) end ), @@ -670,8 +668,8 @@ t_query_counter_async_inflight_batch(_) -> || Event = #{ ?snk_kind := 
call_batch_query_async, batch := [ - {query, _, {inc_counter, 1}, _, _}, - {query, _, {inc_counter, 1}, _, _} + {query, _, {inc_counter, 1}, _, _, _}, + {query, _, {inc_counter, 1}, _, _, _} ] } <- Trace @@ -755,7 +753,7 @@ t_query_counter_async_inflight_batch(_) -> fun(Trace) -> QueryTrace = ?of_kind(call_batch_query_async, Trace), ?assertMatch( - [#{batch := [{query, _, {inc_counter, _}, _, _} | _]} | _], + [#{batch := [{query, _, {inc_counter, _}, _, _, _} | _]} | _], QueryTrace ) end @@ -780,7 +778,7 @@ t_query_counter_async_inflight_batch(_) -> fun(Trace) -> QueryTrace = ?of_kind(call_batch_query_async, Trace), ?assertMatch( - [#{batch := [{query, _, {inc_counter, _}, _, _} | _]} | _], + [#{batch := [{query, _, {inc_counter, _}, _, _, _} | _]} | _], QueryTrace ) end @@ -855,14 +853,12 @@ t_healthy_timeout(_) -> ), ?assertEqual(ok, emqx_resource:remove_local(?ID)) end, - fun(Trace) -> - ?assertEqual([], ?of_kind("inconsistent_status", Trace)), - ?assertEqual([], ?of_kind("inconsistent_cache", Trace)) - end + [log_consistency_prop()] ). t_healthy(_) -> ?check_trace( + #{timetrap => 10_000}, begin ?assertMatch( {ok, _}, @@ -873,10 +869,13 @@ t_healthy(_) -> #{name => test_resource} ) ), + ct:pal("getting state"), {ok, #{pid := Pid}} = emqx_resource:query(?ID, get_state), timer:sleep(300), + ct:pal("setting state as `connecting`"), emqx_resource:set_resource_status_connecting(?ID), + ct:pal("health check"), ?assertEqual({ok, connected}, emqx_resource:health_check(?ID)), ?assertMatch( [#{status := connected}], @@ -894,10 +893,7 @@ t_healthy(_) -> ?assertEqual(ok, emqx_resource:remove_local(?ID)) end, - fun(Trace) -> - ?assertEqual([], ?of_kind("inconsistent_status", Trace)), - ?assertEqual([], ?of_kind("inconsistent_cache", Trace)) - end + [log_consistency_prop()] ). t_unhealthy_target(_) -> @@ -1005,11 +1001,7 @@ t_stop_start(_) -> ?assertEqual(ok, emqx_resource:stop(?ID)), ?assertEqual(0, emqx_resource_metrics:inflight_get(?ID)) end, - - fun(Trace) -> - ?assertEqual([], ?of_kind("inconsistent_status", Trace)), - ?assertEqual([], ?of_kind("inconsistent_cache", Trace)) - end + [log_consistency_prop()] ). t_stop_start_local(_) -> @@ -1064,10 +1056,7 @@ t_stop_start_local(_) -> ?assert(is_process_alive(Pid1)) end, - fun(Trace) -> - ?assertEqual([], ?of_kind("inconsistent_status", Trace)), - ?assertEqual([], ?of_kind("inconsistent_cache", Trace)) - end + [log_consistency_prop()] ). t_list_filter(_) -> @@ -1269,10 +1258,7 @@ t_health_check_disconnected(_) -> emqx_resource:health_check(?ID) ) end, - fun(Trace) -> - ?assertEqual([], ?of_kind("inconsistent_status", Trace)), - ?assertEqual([], ?of_kind("inconsistent_cache", Trace)) - end + [log_consistency_prop()] ). 
t_unblock_only_required_buffer_workers(_) -> @@ -2064,7 +2050,7 @@ do_t_expiration_before_sending(QueryMode) -> end, fun(Trace) -> ?assertMatch( - [#{batch := [{query, _, {inc_counter, 99}, _, _}]}], + [#{batch := [{query, _, {inc_counter, 99}, _, _, _}]}], ?of_kind(buffer_worker_flush_all_expired, Trace) ), Metrics = tap_metrics(?LINE), @@ -2180,7 +2166,7 @@ do_t_expiration_before_sending_partial_batch(QueryMode) -> #{ ?snk_kind := handle_async_reply, action := ack, - batch_or_query := [{query, _, {inc_counter, 99}, _, _}] + batch_or_query := [{query, _, {inc_counter, 99}, _, _, _}] }, 10 * TimeoutMS ); @@ -2202,8 +2188,8 @@ do_t_expiration_before_sending_partial_batch(QueryMode) -> ?assertMatch( [ #{ - expired := [{query, _, {inc_counter, 199}, _, _}], - not_expired := [{query, _, {inc_counter, 99}, _, _}] + expired := [{query, _, {inc_counter, 199}, _, _, _}], + not_expired := [{query, _, {inc_counter, 99}, _, _, _}] } ], ?of_kind(buffer_worker_flush_potentially_partial, Trace) @@ -2316,7 +2302,7 @@ do_t_expiration_async_after_reply(IsBatch) -> #{?snk_kind := delay}, #{ ?snk_kind := handle_async_reply_enter, - batch_or_query := [{query, _, {inc_counter, 199}, _, _} | _] + batch_or_query := [{query, _, {inc_counter, 199}, _, _, _} | _] } ), @@ -2359,8 +2345,8 @@ do_t_expiration_async_after_reply(IsBatch) -> [ #{ expired := [ - {query, _, {inc_counter, 199}, _, _}, - {query, _, {inc_counter, 299}, _, _} + {query, _, {inc_counter, 199}, _, _, _}, + {query, _, {inc_counter, 299}, _, _, _} ] } ], @@ -2378,8 +2364,8 @@ do_t_expiration_async_after_reply(IsBatch) -> single -> ?assertMatch( [ - #{expired := [{query, _, {inc_counter, 199}, _, _}]}, - #{expired := [{query, _, {inc_counter, 299}, _, _}]} + #{expired := [{query, _, {inc_counter, 199}, _, _, _}]}, + #{expired := [{query, _, {inc_counter, 299}, _, _, _}]} ], ?of_kind(handle_async_reply_expired, Trace) ) @@ -2430,7 +2416,7 @@ t_expiration_batch_all_expired_after_reply(_Config) -> #{?snk_kind := delay}, #{ ?snk_kind := handle_async_reply_enter, - batch_or_query := [{query, _, {inc_counter, 199}, _, _} | _] + batch_or_query := [{query, _, {inc_counter, 199}, _, _, _} | _] } ), @@ -2464,8 +2450,8 @@ t_expiration_batch_all_expired_after_reply(_Config) -> [ #{ expired := [ - {query, _, {inc_counter, 199}, _, _}, - {query, _, {inc_counter, 299}, _, _} + {query, _, {inc_counter, 199}, _, _, _}, + {query, _, {inc_counter, 299}, _, _, _} ] } ], @@ -2591,7 +2577,7 @@ do_t_expiration_retry() -> end, fun(Trace) -> ?assertMatch( - [#{expired := [{query, _, {inc_counter, 1}, _, _}]}], + [#{expired := [{query, _, {inc_counter, 1}, _, _, _}]}], ?of_kind(buffer_worker_retry_expired, Trace) ), Metrics = tap_metrics(?LINE), @@ -2668,8 +2654,8 @@ t_expiration_retry_batch_multiple_times(_Config) -> fun(Trace) -> ?assertMatch( [ - #{expired := [{query, _, {inc_counter, 1}, _, _}]}, - #{expired := [{query, _, {inc_counter, 2}, _, _}]} + #{expired := [{query, _, {inc_counter, 1}, _, _, _}]}, + #{expired := [{query, _, {inc_counter, 2}, _, _, _}]} ], ?of_kind(buffer_worker_retry_expired, Trace) ), @@ -3116,6 +3102,93 @@ t_telemetry_handler_crash(_Config) -> ), ok. 
+t_non_blocking_resource_health_check(_Config) ->
+    ?check_trace(
+        begin
+            {ok, _} =
+                create(
+                    ?ID,
+                    ?DEFAULT_RESOURCE_GROUP,
+                    ?TEST_RESOURCE,
+                    #{name => test_resource, health_check_error => {delay, 1_000}},
+                    #{health_check_interval => 100}
+                ),
+            %% concurrently attempt to health check the resource; should do it only once
+            %% for all callers
+            NumCallers = 20,
+            Expected = lists:duplicate(NumCallers, {ok, connected}),
+            ?assertEqual(
+                Expected,
+                emqx_utils:pmap(
+                    fun(_) -> emqx_resource:health_check(?ID) end,
+                    lists:seq(1, NumCallers)
+                )
+            ),
+
+            NumCallers
+        end,
+        [
+            log_consistency_prop(),
+            fun(NumCallers, Trace) ->
+                %% shouldn't have one health check per caller
+                SubTrace = ?of_kind(connector_demo_health_check_delay, Trace),
+                ?assertMatch([_ | _], SubTrace),
+                ?assert(length(SubTrace) < (NumCallers div 2), #{trace => Trace}),
+                ok
+            end
+        ]
+    ),
+    ok.
+
+t_non_blocking_channel_health_check(_Config) ->
+    ?check_trace(
+        begin
+            {ok, _} =
+                create(
+                    ?ID,
+                    ?DEFAULT_RESOURCE_GROUP,
+                    ?TEST_RESOURCE,
+                    #{name => test_resource, health_check_error => {delay, 500}},
+                    #{health_check_interval => 100}
+                ),
+            ChanId = <<"chan">>,
+            ok =
+                emqx_resource_manager:add_channel(
+                    ?ID,
+                    ChanId,
+                    #{health_check_delay => 500}
+                ),
+
+            %% concurrently attempt to health check the channel; should do it only once
+            %% for all callers
+            NumCallers = 20,
+            Expected = lists:duplicate(
+                NumCallers,
+                #{error => undefined, status => connected}
+            ),
+            ?assertEqual(
+                Expected,
+                emqx_utils:pmap(
+                    fun(_) -> emqx_resource_manager:channel_health_check(?ID, ChanId) end,
+                    lists:seq(1, NumCallers)
+                )
+            ),
+
+            NumCallers
+        end,
+        [
+            log_consistency_prop(),
+            fun(NumCallers, Trace) ->
+                %% shouldn't have one health check per caller
+                SubTrace = ?of_kind(connector_demo_channel_health_check_delay, Trace),
+                ?assertMatch([_ | _], SubTrace),
+                ?assert(length(SubTrace) < (NumCallers div 2), #{trace => Trace}),
+                ok
+            end
+        ]
+    ),
+    ok.
+
 %%------------------------------------------------------------------------------
 %% Helpers
 %%------------------------------------------------------------------------------
@@ -3272,7 +3345,6 @@ wait_n_events(NEvents, Timeout, EventName) ->
     end.
 
 assert_sync_retry_fail_then_succeed_inflight(Trace) ->
-    ct:pal(" ~p", [Trace]),
     ?assert(
         ?strict_causality(
             #{?snk_kind := buffer_worker_flush_nack, ref := _Ref},
@@ -3292,7 +3364,6 @@ assert_sync_retry_fail_then_succeed_inflight(Trace) ->
     ok.
 
 assert_async_retry_fail_then_succeed_inflight(Trace) ->
-    ct:pal(" ~p", [Trace]),
     ?assert(
         ?strict_causality(
             #{?snk_kind := handle_async_reply, action := nack},
@@ -3373,3 +3444,10 @@ create(Id, Group, Type, Config) ->
 
 create(Id, Group, Type, Config, Opts) ->
     emqx_resource:create_local(Id, Group, Type, Config, Opts).
+
+log_consistency_prop() ->
+    {"check state and cache consistency", fun ?MODULE:log_consistency_prop/1}.
+log_consistency_prop(Trace) ->
+    ?assertEqual([], ?of_kind("inconsistent_status", Trace)),
+    ?assertEqual([], ?of_kind("inconsistent_cache", Trace)),
+    ok.
diff --git a/apps/emqx_retainer/src/emqx_retainer.app.src b/apps/emqx_retainer/src/emqx_retainer.app.src
index 248cc9310..4a8b3cdc3 100644
--- a/apps/emqx_retainer/src/emqx_retainer.app.src
+++ b/apps/emqx_retainer/src/emqx_retainer.app.src
@@ -2,7 +2,7 @@
 {application, emqx_retainer, [
     {description, "EMQX Retainer"},
     % strict semver, bump manually!
- {vsn, "5.0.21"}, + {vsn, "5.0.22"}, {modules, []}, {registered, [emqx_retainer_sup]}, {applications, [kernel, stdlib, emqx, emqx_ctl]}, diff --git a/apps/emqx_retainer/src/emqx_retainer_mnesia.erl b/apps/emqx_retainer/src/emqx_retainer_mnesia.erl index bdc1f2c67..7e2a73a09 100644 --- a/apps/emqx_retainer/src/emqx_retainer_mnesia.erl +++ b/apps/emqx_retainer/src/emqx_retainer_mnesia.erl @@ -17,6 +17,7 @@ -module(emqx_retainer_mnesia). -behaviour(emqx_retainer). +-behaviour(emqx_db_backup). -include("emqx_retainer.hrl"). -include_lib("emqx/include/logger.hrl"). @@ -54,6 +55,8 @@ -export([populate_index_meta/0]). -export([reindex/3]). +-export([backup_tables/0]). + -record(retained_message, {topic, msg, expiry_time}). -record(retained_index, {key, expiry_time}). -record(retained_index_meta, {key, read_indices, write_indices, reindexing, extra}). @@ -73,6 +76,12 @@ topics() -> [emqx_topic:join(I) || I <- mnesia:dirty_all_keys(?TAB_MESSAGE)]. +%%-------------------------------------------------------------------- +%% Data backup +%%-------------------------------------------------------------------- +backup_tables() -> + [?TAB_MESSAGE]. + %%-------------------------------------------------------------------- %% emqx_retainer callbacks %%-------------------------------------------------------------------- diff --git a/apps/emqx_rule_engine/rebar.config b/apps/emqx_rule_engine/rebar.config index 07c53d3e3..0f00f15c6 100644 --- a/apps/emqx_rule_engine/rebar.config +++ b/apps/emqx_rule_engine/rebar.config @@ -2,7 +2,16 @@ {deps, [ {emqx, {path, "../emqx"}}, - {emqx_utils, {path, "../emqx_utils"}} + {emqx_utils, {path, "../emqx_utils"}}, + {emqx_modules, {path, "../emqx_modules"}} +]}. + +{profiles, [ + {test, [ + {deps, [ + {emqx_bridge_http, {path, "../emqx_bridge_http"}} + ]} + ]} ]}. {erl_opts, [ diff --git a/apps/emqx_rule_engine/src/emqx_rule_api_schema.erl b/apps/emqx_rule_engine/src/emqx_rule_api_schema.erl index d82951124..f4685222c 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_api_schema.erl +++ b/apps/emqx_rule_engine/src/emqx_rule_api_schema.erl @@ -26,7 +26,7 @@ -export([namespace/0, roots/0, fields/1]). --type tag() :: rule_creation | rule_test | rule_engine. +-type tag() :: rule_creation | rule_test | rule_engine | rule_apply_test. -spec check_params(map(), tag()) -> {ok, map()} | {error, term()}. check_params(Params, Tag) -> @@ -54,7 +54,8 @@ roots() -> {"rule_creation", sc(ref("rule_creation"), #{desc => ?DESC("root_rule_creation")})}, {"rule_info", sc(ref("rule_info"), #{desc => ?DESC("root_rule_info")})}, {"rule_events", sc(ref("rule_events"), #{desc => ?DESC("root_rule_events")})}, - {"rule_test", sc(ref("rule_test"), #{desc => ?DESC("root_rule_test")})} + {"rule_test", sc(ref("rule_test"), #{desc => ?DESC("root_rule_test")})}, + {"rule_apply_test", sc(ref("rule_apply_test"), #{desc => ?DESC("root_apply_rule_test")})} ]. 
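%% Illustrative only (not part of this changeset): a body that is expected
%% to validate against the new "rule_apply_test" root above, via the
%% existing check_params/2 entry point (topic and payload are made up):
%%
%%   Params = #{
%%       <<"context">> => #{
%%           <<"event_type">> => <<"message_publish">>,
%%           <<"topic">> => <<"t/1">>,
%%           <<"payload">> => <<"{\"msg\": \"hello\"}">>
%%       },
%%       <<"stop_action_after_template_rendering">> => true
%%   },
%%   {ok, _Checked} = emqx_rule_api_schema:check_params(Params, rule_apply_test).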
fields("rule_engine") -> @@ -101,29 +102,22 @@ fields("rule_events") -> ]; fields("rule_test") -> [ - {"context", - sc( - hoconsc:union([ - ref("ctx_pub"), - ref("ctx_sub"), - ref("ctx_unsub"), - ref("ctx_delivered"), - ref("ctx_acked"), - ref("ctx_dropped"), - ref("ctx_connected"), - ref("ctx_disconnected"), - ref("ctx_connack"), - ref("ctx_check_authz_complete"), - ref("ctx_bridge_mqtt"), - ref("ctx_delivery_dropped") - ]), - #{ - desc => ?DESC("test_context"), - default => #{} - } - )}, + rule_input_message_context(), {"sql", sc(binary(), #{desc => ?DESC("test_sql"), required => true})} ]; +fields("rule_apply_test") -> + [ + rule_input_message_context(), + {"stop_action_after_template_rendering", + sc( + typerefl:boolean(), + #{ + desc => + ?DESC("stop_action_after_template_render"), + default => true + } + )} + ]; fields("metrics") -> [ {"matched", @@ -315,6 +309,29 @@ fields("ctx_delivery_dropped") -> | msg_event_common_fields() ]. +rule_input_message_context() -> + {"context", + sc( + hoconsc:union([ + ref("ctx_pub"), + ref("ctx_sub"), + ref("ctx_unsub"), + ref("ctx_delivered"), + ref("ctx_acked"), + ref("ctx_dropped"), + ref("ctx_connected"), + ref("ctx_disconnected"), + ref("ctx_connack"), + ref("ctx_check_authz_complete"), + ref("ctx_bridge_mqtt"), + ref("ctx_delivery_dropped") + ]), + #{ + desc => ?DESC("test_context"), + default => #{} + } + )}. + qos() -> {"qos", sc(emqx_schema:qos(), #{desc => ?DESC("event_qos")})}. diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine.app.src b/apps/emqx_rule_engine/src/emqx_rule_engine.app.src index 1768141ae..1fed922dd 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_engine.app.src +++ b/apps/emqx_rule_engine/src/emqx_rule_engine.app.src @@ -17,7 +17,9 @@ %% rule_engine should wait for bridge connector start, %% it's will check action/connector ref's exist. emqx_bridge, - emqx_connector + emqx_connector, + %% Needed to start the tracing functionality + emqx_modules ]}, {mod, {emqx_rule_engine_app, []}}, {env, []}, diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine_api.erl b/apps/emqx_rule_engine/src/emqx_rule_engine_api.erl index 354e40c5f..d203dd915 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_engine_api.erl +++ b/apps/emqx_rule_engine/src/emqx_rule_engine_api.erl @@ -37,6 +37,7 @@ '/rule_test'/2, '/rules'/2, '/rules/:id'/2, + '/rules/:id/test'/2, '/rules/:id/metrics'/2, '/rules/:id/metrics/reset'/2 ]). @@ -145,6 +146,7 @@ paths() -> "/rule_test", "/rules", "/rules/:id", + "/rules/:id/test", "/rules/:id/metrics", "/rules/:id/metrics/reset" ]. @@ -161,6 +163,9 @@ rule_creation_schema() -> rule_test_schema() -> ref(emqx_rule_api_schema, "rule_test"). +rule_apply_test_schema() -> + ref(emqx_rule_api_schema, "rule_apply_test"). + rule_info_schema() -> ref(emqx_rule_api_schema, "rule_info"). @@ -258,6 +263,21 @@ schema("/rules/:id") -> } } }; +schema("/rules/:id/test") -> + #{ + 'operationId' => '/rules/:id/test', + post => #{ + tags => [<<"rules">>], + description => ?DESC("api11"), + summary => <<"Apply a rule for testing">>, + 'requestBody' => rule_apply_test_schema(), + responses => #{ + 400 => error_schema('BAD_REQUEST', "Invalid Parameters"), + 412 => error_schema('NOT_MATCH', "SQL Not Match"), + 200 => <<"Rule Applied">> + } + } + }; schema("/rules/:id/metrics") -> #{ 'operationId' => '/rules/:id/metrics', @@ -392,6 +412,24 @@ param_path_id() -> end ). 
+'/rules/:id/test'(post, #{body := Params, bindings := #{id := RuleId}}) -> + ?CHECK_PARAMS( + Params, + rule_apply_test, + begin + case emqx_rule_sqltester:apply_rule(RuleId, CheckedParams) of + {ok, Result} -> + {200, Result}; + {error, {parse_error, Reason}} -> + {400, #{code => 'BAD_REQUEST', message => err_msg(Reason)}}; + {error, nomatch} -> + {412, #{code => 'NOT_MATCH', message => <<"SQL Not Match">>}}; + {error, Reason} -> + {400, #{code => 'BAD_REQUEST', message => err_msg(Reason)}} + end + end + ). + '/rules/:id'(get, #{bindings := #{id := Id}}) -> case emqx_rule_engine:get_rule(Id) of {ok, Rule} -> diff --git a/apps/emqx_rule_engine/src/emqx_rule_funcs.erl b/apps/emqx_rule_engine/src/emqx_rule_funcs.erl index ac7f66597..4e28efb5f 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_funcs.erl +++ b/apps/emqx_rule_engine/src/emqx_rule_funcs.erl @@ -132,6 +132,8 @@ %% String Funcs -export([ + coalesce/1, + coalesce/2, lower/1, ltrim/1, reverse/1, @@ -143,6 +145,7 @@ upper/1, split/2, split/3, + concat/1, concat/2, tokens/2, tokens/3, @@ -199,7 +202,8 @@ -export([ md5/1, sha/1, - sha256/1 + sha256/1, + hash/2 ]). %% zip Funcs @@ -707,24 +711,11 @@ map(Map = #{}) -> map(Data) -> error(badarg, [Data]). -bin2hexstr(Bin) when is_binary(Bin) -> - emqx_utils:bin_to_hexstr(Bin, upper); -%% If Bin is a bitstring which is not divisible by 8, we pad it and then do the -%% conversion -bin2hexstr(Bin) when is_bitstring(Bin), (8 - (bit_size(Bin) rem 8)) >= 4 -> - PadSize = 8 - (bit_size(Bin) rem 8), - Padding = <<0:PadSize>>, - BinToConvert = <>, - <<_FirstByte:8, HexStr/binary>> = emqx_utils:bin_to_hexstr(BinToConvert, upper), - HexStr; -bin2hexstr(Bin) when is_bitstring(Bin) -> - PadSize = 8 - (bit_size(Bin) rem 8), - Padding = <<0:PadSize>>, - BinToConvert = <>, - emqx_utils:bin_to_hexstr(BinToConvert, upper). +bin2hexstr(Bin) -> + emqx_variform_bif:bin2hexstr(Bin). -hexstr2bin(Str) when is_binary(Str) -> - emqx_utils:hexstr_to_bin(Str). +hexstr2bin(Str) -> + emqx_variform_bif:hexstr2bin(Str). %%------------------------------------------------------------------------------ %% NULL Funcs @@ -768,130 +759,67 @@ is_array(_) -> false. %% String Funcs %%------------------------------------------------------------------------------ -lower(S) when is_binary(S) -> - string:lowercase(S). +coalesce(List) -> emqx_variform_bif:coalesce(List). -ltrim(S) when is_binary(S) -> - string:trim(S, leading). +coalesce(A, B) -> emqx_variform_bif:coalesce(A, B). -reverse(S) when is_binary(S) -> - iolist_to_binary(string:reverse(S)). +lower(S) -> emqx_variform_bif:lower(S). -rtrim(S) when is_binary(S) -> - string:trim(S, trailing). +ltrim(S) -> emqx_variform_bif:ltrim(S). -strlen(S) when is_binary(S) -> - string:length(S). +reverse(S) -> emqx_variform_bif:reverse(S). -substr(S, Start) when is_binary(S), is_integer(Start) -> - string:slice(S, Start). +rtrim(S) -> emqx_variform_bif:rtrim(S). -substr(S, Start, Length) when - is_binary(S), - is_integer(Start), - is_integer(Length) --> - string:slice(S, Start, Length). +strlen(S) -> emqx_variform_bif:strlen(S). -trim(S) when is_binary(S) -> - string:trim(S). +substr(S, Start) -> emqx_variform_bif:substr(S, Start). -upper(S) when is_binary(S) -> - string:uppercase(S). +substr(S, Start, Length) -> emqx_variform_bif:substr(S, Start, Length). -split(S, P) when is_binary(S), is_binary(P) -> - [R || R <- string:split(S, P, all), R =/= <<>> andalso R =/= ""]. +trim(S) -> emqx_variform_bif:trim(S). 
-split(S, P, <<"notrim">>) -> - string:split(S, P, all); -split(S, P, <<"leading_notrim">>) -> - string:split(S, P, leading); -split(S, P, <<"leading">>) when is_binary(S), is_binary(P) -> - [R || R <- string:split(S, P, leading), R =/= <<>> andalso R =/= ""]; -split(S, P, <<"trailing_notrim">>) -> - string:split(S, P, trailing); -split(S, P, <<"trailing">>) when is_binary(S), is_binary(P) -> - [R || R <- string:split(S, P, trailing), R =/= <<>> andalso R =/= ""]. +upper(S) -> emqx_variform_bif:upper(S). -tokens(S, Separators) -> - [list_to_binary(R) || R <- string:lexemes(binary_to_list(S), binary_to_list(Separators))]. +split(S, P) -> emqx_variform_bif:split(S, P). -tokens(S, Separators, <<"nocrlf">>) -> - [ - list_to_binary(R) - || R <- string:lexemes(binary_to_list(S), binary_to_list(Separators) ++ [$\r, $\n, [$\r, $\n]]) - ]. +split(S, P, Position) -> emqx_variform_bif:split(S, P, Position). -%% implicit convert args to strings, and then do concatenation -concat(S1, S2) -> - unicode:characters_to_binary([str(S1), str(S2)], unicode). +tokens(S, Separators) -> emqx_variform_bif:tokens(S, Separators). -sprintf_s(Format, Args) when is_list(Args) -> - erlang:iolist_to_binary(io_lib:format(binary_to_list(Format), Args)). +tokens(S, Separators, NoCRLF) -> emqx_variform_bif:tokens(S, Separators, NoCRLF). -pad(S, Len) when is_binary(S), is_integer(Len) -> - iolist_to_binary(string:pad(S, Len, trailing)). +concat(S1, S2) -> emqx_variform_bif:concat(S1, S2). -pad(S, Len, <<"trailing">>) when is_binary(S), is_integer(Len) -> - iolist_to_binary(string:pad(S, Len, trailing)); -pad(S, Len, <<"both">>) when is_binary(S), is_integer(Len) -> - iolist_to_binary(string:pad(S, Len, both)); -pad(S, Len, <<"leading">>) when is_binary(S), is_integer(Len) -> - iolist_to_binary(string:pad(S, Len, leading)). +concat(List) -> emqx_variform_bif:concat(List). -pad(S, Len, <<"trailing">>, Char) when is_binary(S), is_integer(Len), is_binary(Char) -> - Chars = unicode:characters_to_list(Char, utf8), - iolist_to_binary(string:pad(S, Len, trailing, Chars)); -pad(S, Len, <<"both">>, Char) when is_binary(S), is_integer(Len), is_binary(Char) -> - Chars = unicode:characters_to_list(Char, utf8), - iolist_to_binary(string:pad(S, Len, both, Chars)); -pad(S, Len, <<"leading">>, Char) when is_binary(S), is_integer(Len), is_binary(Char) -> - Chars = unicode:characters_to_list(Char, utf8), - iolist_to_binary(string:pad(S, Len, leading, Chars)). +sprintf_s(Format, Args) -> emqx_variform_bif:sprintf_s(Format, Args). -replace(SrcStr, P, RepStr) when is_binary(SrcStr), is_binary(P), is_binary(RepStr) -> - iolist_to_binary(string:replace(SrcStr, P, RepStr, all)). +pad(S, Len) -> emqx_variform_bif:pad(S, Len). -replace(SrcStr, P, RepStr, <<"all">>) when is_binary(SrcStr), is_binary(P), is_binary(RepStr) -> - iolist_to_binary(string:replace(SrcStr, P, RepStr, all)); -replace(SrcStr, P, RepStr, <<"trailing">>) when - is_binary(SrcStr), is_binary(P), is_binary(RepStr) --> - iolist_to_binary(string:replace(SrcStr, P, RepStr, trailing)); -replace(SrcStr, P, RepStr, <<"leading">>) when is_binary(SrcStr), is_binary(P), is_binary(RepStr) -> - iolist_to_binary(string:replace(SrcStr, P, RepStr, leading)). +pad(S, Len, Position) -> emqx_variform_bif:pad(S, Len, Position). -regex_match(Str, RE) -> - case re:run(Str, RE, [global, {capture, none}]) of - match -> true; - nomatch -> false - end. +pad(S, Len, Position, Char) -> emqx_variform_bif:pad(S, Len, Position, Char). 
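%% Illustrative only: among the functions delegated to emqx_variform_bif in
%% this change, coalesce/1 and coalesce/2 are new to rule SQL; coalesce
%% picks the first value that is not empty, e.g. (field names made up):
%%
%%   SELECT coalesce(payload.nickname, payload.username) AS name FROM "t/#"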
-regex_replace(SrcStr, RE, RepStr) -> - re:replace(SrcStr, RE, RepStr, [global, {return, binary}]). +replace(SrcStr, Pattern, RepStr) -> emqx_variform_bif:replace(SrcStr, Pattern, RepStr). -ascii(Char) when is_binary(Char) -> - [FirstC | _] = binary_to_list(Char), - FirstC. +replace(SrcStr, Pattern, RepStr, Position) -> + emqx_variform_bif:replace(SrcStr, Pattern, RepStr, Position). -find(S, P) when is_binary(S), is_binary(P) -> - find_s(S, P, leading). +regex_match(Str, RE) -> emqx_variform_bif:regex_match(Str, RE). -find(S, P, <<"trailing">>) when is_binary(S), is_binary(P) -> - find_s(S, P, trailing); -find(S, P, <<"leading">>) when is_binary(S), is_binary(P) -> - find_s(S, P, leading). +regex_replace(SrcStr, RE, RepStr) -> emqx_variform_bif:regex_replace(SrcStr, RE, RepStr). -find_s(S, P, Dir) -> - case string:find(S, P, Dir) of - nomatch -> <<"">>; - SubStr -> SubStr - end. +ascii(Char) -> emqx_variform_bif:ascii(Char). + +find(S, P) -> emqx_variform_bif:find(S, P). + +find(S, P, Position) -> emqx_variform_bif:find(S, P, Position). + +join_to_string(Str) -> emqx_variform_bif:join_to_string(Str). + +join_to_string(Sep, List) -> emqx_variform_bif:join_to_string(Sep, List). -join_to_string(List) when is_list(List) -> - join_to_string(<<", ">>, List). -join_to_string(Sep, List) when is_list(List), is_binary(Sep) -> - iolist_to_binary(lists:join(Sep, [str(Item) || Item <- List])). join_to_sql_values_string(List) -> QuotedList = [ @@ -938,137 +866,7 @@ jq(FilterProgram, JSONBin) -> ]) ). -unescape(Bin) when is_binary(Bin) -> - UnicodeList = unicode:characters_to_list(Bin, utf8), - UnescapedUnicodeList = unescape_string(UnicodeList), - UnescapedUTF8Bin = unicode:characters_to_binary(UnescapedUnicodeList, utf32, utf8), - case UnescapedUTF8Bin of - Out when is_binary(Out) -> - Out; - Error -> - throw({invalid_unicode_character, Error}) - end. - -unescape_string(Input) -> unescape_string(Input, []). - -unescape_string([], Acc) -> - lists:reverse(Acc); -unescape_string([$\\, $\\ | Rest], Acc) -> - unescape_string(Rest, [$\\ | Acc]); -unescape_string([$\\, $n | Rest], Acc) -> - unescape_string(Rest, [$\n | Acc]); -unescape_string([$\\, $t | Rest], Acc) -> - unescape_string(Rest, [$\t | Acc]); -unescape_string([$\\, $r | Rest], Acc) -> - unescape_string(Rest, [$\r | Acc]); -unescape_string([$\\, $b | Rest], Acc) -> - unescape_string(Rest, [$\b | Acc]); -unescape_string([$\\, $f | Rest], Acc) -> - unescape_string(Rest, [$\f | Acc]); -unescape_string([$\\, $v | Rest], Acc) -> - unescape_string(Rest, [$\v | Acc]); -unescape_string([$\\, $' | Rest], Acc) -> - unescape_string(Rest, [$\' | Acc]); -unescape_string([$\\, $" | Rest], Acc) -> - unescape_string(Rest, [$\" | Acc]); -unescape_string([$\\, $? | Rest], Acc) -> - unescape_string(Rest, [$\? 
| Acc]); -unescape_string([$\\, $a | Rest], Acc) -> - unescape_string(Rest, [$\a | Acc]); -%% Start of HEX escape code -unescape_string([$\\, $x | [$0 | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$1 | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$2 | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$3 | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$4 | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$5 | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$6 | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$7 | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$8 | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$9 | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$A | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$B | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$C | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$D | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$E | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$F | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$a | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$b | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$c | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$d | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$e | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -unescape_string([$\\, $x | [$f | _] = HexStringStart], Acc) -> - unescape_handle_hex_string(HexStringStart, Acc); -%% We treat all other escape sequences as not valid input to leave room for -%% extending the function to support more escape codes -unescape_string([$\\, X | _Rest], _Acc) -> - erlang:throw({unrecognized_escape_sequence, list_to_binary([$\\, X])}); -unescape_string([First | Rest], Acc) -> - unescape_string(Rest, [First | Acc]). - -unescape_handle_hex_string(HexStringStart, Acc) -> - {RemainingString, Num} = parse_hex_string(HexStringStart), - unescape_string(RemainingString, [Num | Acc]). - -parse_hex_string(SeqStartingWithHexDigit) -> - parse_hex_string(SeqStartingWithHexDigit, []). 
- -parse_hex_string([], Acc) -> - ReversedAcc = lists:reverse(Acc), - {[], list_to_integer(ReversedAcc, 16)}; -parse_hex_string([First | Rest] = String, Acc) -> - case is_hex_digit(First) of - true -> - parse_hex_string(Rest, [First | Acc]); - false -> - ReversedAcc = lists:reverse(Acc), - {String, list_to_integer(ReversedAcc, 16)} - end. - -is_hex_digit($0) -> true; -is_hex_digit($1) -> true; -is_hex_digit($2) -> true; -is_hex_digit($3) -> true; -is_hex_digit($4) -> true; -is_hex_digit($5) -> true; -is_hex_digit($6) -> true; -is_hex_digit($7) -> true; -is_hex_digit($8) -> true; -is_hex_digit($9) -> true; -is_hex_digit($A) -> true; -is_hex_digit($B) -> true; -is_hex_digit($C) -> true; -is_hex_digit($D) -> true; -is_hex_digit($E) -> true; -is_hex_digit($F) -> true; -is_hex_digit($a) -> true; -is_hex_digit($b) -> true; -is_hex_digit($c) -> true; -is_hex_digit($d) -> true; -is_hex_digit($e) -> true; -is_hex_digit($f) -> true; -is_hex_digit(_) -> false. +unescape(Str) -> emqx_variform_bif:unescape(Str). %%------------------------------------------------------------------------------ %% Array Funcs @@ -1095,6 +893,10 @@ last(List) when is_list(List) -> contains(Elm, List) when is_list(List) -> lists:member(Elm, List). +%%------------------------------------------------------------------------------ +%% Map Funcs +%%------------------------------------------------------------------------------ + map_new() -> #{}. @@ -1187,7 +989,7 @@ sha256(S) when is_binary(S) -> hash(sha256, S). hash(Type, Data) -> - emqx_utils:bin_to_hexstr(crypto:hash(Type, Data), lower). + emqx_variform_bif:hash(Type, Data). %%------------------------------------------------------------------------------ %% gzip Funcs diff --git a/apps/emqx_rule_engine/src/emqx_rule_runtime.erl b/apps/emqx_rule_engine/src/emqx_rule_runtime.erl index f51908772..f99341a9b 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_runtime.erl +++ b/apps/emqx_rule_engine/src/emqx_rule_runtime.erl @@ -69,6 +69,14 @@ apply_rule_discard_result(Rule, Columns, Envs) -> ok. 
apply_rule(Rule = #{id := RuleID}, Columns, Envs) -> + set_process_trace_metadata(RuleID, Columns), + trace_rule_sql( + "rule_activated", + #{ + input => Columns, environment => Envs + }, + debug + ), ok = emqx_metrics_worker:inc(rule_metrics, RuleID, 'matched'), clear_rule_payload(), try @@ -77,48 +85,90 @@ apply_rule(Rule = #{id := RuleID}, Columns, Envs) -> %% ignore the errors if select or match failed _:Reason = {select_and_transform_error, Error} -> ok = emqx_metrics_worker:inc(rule_metrics, RuleID, 'failed.exception'), - ?SLOG(warning, #{ - msg => "SELECT_clause_exception", - rule_id => RuleID, - reason => Error - }), + trace_rule_sql( + "SELECT_clause_exception", + #{ + reason => Error + }, + warning + ), {error, Reason}; _:Reason = {match_conditions_error, Error} -> ok = emqx_metrics_worker:inc(rule_metrics, RuleID, 'failed.exception'), - ?SLOG(warning, #{ - msg => "WHERE_clause_exception", - rule_id => RuleID, - reason => Error - }), + trace_rule_sql( + "WHERE_clause_exception", + #{ + reason => Error + }, + warning + ), {error, Reason}; _:Reason = {select_and_collect_error, Error} -> ok = emqx_metrics_worker:inc(rule_metrics, RuleID, 'failed.exception'), - ?SLOG(warning, #{ - msg => "FOREACH_clause_exception", - rule_id => RuleID, - reason => Error - }), + trace_rule_sql( + "FOREACH_clause_exception", + #{ + reason => Error + }, + warning + ), {error, Reason}; _:Reason = {match_incase_error, Error} -> ok = emqx_metrics_worker:inc(rule_metrics, RuleID, 'failed.exception'), - ?SLOG(warning, #{ - msg => "INCASE_clause_exception", - rule_id => RuleID, - reason => Error - }), + trace_rule_sql( + "INCASE_clause_exception", + #{ + reason => Error + }, + warning + ), {error, Reason}; Class:Error:StkTrace -> ok = emqx_metrics_worker:inc(rule_metrics, RuleID, 'failed.exception'), - ?SLOG(error, #{ - msg => "apply_rule_failed", - rule_id => RuleID, - exception => Class, - reason => Error, - stacktrace => StkTrace - }), + trace_rule_sql( + "apply_rule_failed", + #{ + exception => Class, + reason => Error, + stacktrace => StkTrace + }, + error + ), {error, {Error, StkTrace}} + after + reset_process_trace_metadata(Columns) end. +set_process_trace_metadata(RuleID, #{clientid := ClientID} = Columns) -> + logger:update_process_metadata(#{ + clientid => ClientID + }), + set_process_trace_metadata(RuleID, maps:remove(clientid, Columns)); +set_process_trace_metadata(RuleID, Columns) -> + EventTimestamp = + case Columns of + #{timestamp := Timestamp} -> + Timestamp; + _ -> + erlang:system_time(millisecond) + end, + logger:update_process_metadata(#{ + rule_id => RuleID, + rule_trigger_time => EventTimestamp + }). + +reset_process_trace_metadata(#{clientid := _ClientID}) -> + Meta = logger:get_process_metadata(), + Meta1 = maps:remove(clientid, Meta), + Meta2 = maps:remove(rule_id, Meta1), + Meta3 = maps:remove(rule_trigger_time, Meta2), + logger:set_process_metadata(Meta3); +reset_process_trace_metadata(_) -> + Meta = logger:get_process_metadata(), + Meta1 = maps:remove(rule_id, Meta), + Meta2 = maps:remove(rule_trigger_time, Meta1), + logger:set_process_metadata(Meta2). 
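%% A more compact equivalent of the two reset clauses above would be
%% maps:without/2 (illustrative sketch only, not what this change does):
%%
%%   reset_process_trace_metadata(Columns) ->
%%       Keys = [rule_id, rule_trigger_time] ++
%%           [clientid || is_map_key(clientid, Columns)],
%%       logger:set_process_metadata(
%%           maps:without(Keys, logger:get_process_metadata())
%%       ).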
+ do_apply_rule( #{ id := RuleId, @@ -136,13 +186,18 @@ do_apply_rule( {ok, ColumnsAndSelected, FinalCollection} -> case FinalCollection of [] -> + trace_rule_sql("SQL_yielded_no_result"), ok = emqx_metrics_worker:inc(rule_metrics, RuleId, 'failed.no_result'); _ -> + trace_rule_sql( + "SQL_yielded_result", #{result => FinalCollection}, debug + ), ok = emqx_metrics_worker:inc(rule_metrics, RuleId, 'passed') end, NewEnvs = maps:merge(ColumnsAndSelected, Envs), {ok, [handle_action_list(RuleId, Actions, Coll, NewEnvs) || Coll <- FinalCollection]}; false -> + trace_rule_sql("SQL_yielded_no_result"), ok = emqx_metrics_worker:inc(rule_metrics, RuleId, 'failed.no_result'), {error, nomatch} end; @@ -159,9 +214,11 @@ do_apply_rule( ) -> case evaluate_select(Fields, Columns, Conditions) of {ok, Selected} -> + trace_rule_sql("SQL_yielded_result", #{result => Selected}, debug), ok = emqx_metrics_worker:inc(rule_metrics, RuleId, 'passed'), {ok, handle_action_list(RuleId, Actions, Selected, maps:merge(Columns, Envs))}; false -> + trace_rule_sql("SQL_yielded_no_result"), ok = emqx_metrics_worker:inc(rule_metrics, RuleId, 'failed.no_result'), {error, nomatch} end. @@ -346,36 +403,39 @@ handle_action_list(RuleId, Actions, Selected, Envs) -> handle_action(RuleId, ActId, Selected, Envs) -> ok = emqx_metrics_worker:inc(rule_metrics, RuleId, 'actions.total'), try - do_handle_action(RuleId, ActId, Selected, Envs) + Result = do_handle_action(RuleId, ActId, Selected, Envs), + Result catch throw:out_of_service -> ok = emqx_metrics_worker:inc(rule_metrics, RuleId, 'actions.failed'), ok = emqx_metrics_worker:inc( rule_metrics, RuleId, 'actions.failed.out_of_service' ), - ?SLOG(warning, #{msg => "out_of_service", action => ActId}); + trace_action(ActId, "out_of_service", #{}, warning); Err:Reason:ST -> ok = emqx_metrics_worker:inc(rule_metrics, RuleId, 'actions.failed'), ok = emqx_metrics_worker:inc(rule_metrics, RuleId, 'actions.failed.unknown'), - ?SLOG(error, #{ - msg => "action_failed", - action => ActId, - exception => Err, - reason => Reason, - stacktrace => ST - }) + trace_action( + ActId, + "action_failed", + #{ + exception => Err, + reason => Reason, + stacktrace => ST + }, + error + ) end. -define(IS_RES_DOWN(R), R == stopped; R == not_connected; R == not_found; R == unhealthy_target). 
-do_handle_action(RuleId, {bridge, BridgeType, BridgeName, ResId}, Selected, _Envs) -> - ?TRACE( - "BRIDGE", - "bridge_action", - #{bridge_id => emqx_bridge_resource:bridge_id(BridgeType, BridgeName)} - ), - ReplyTo = {fun ?MODULE:inc_action_metrics/2, [RuleId], #{reply_dropped => true}}, +do_handle_action(RuleId, {bridge, BridgeType, BridgeName, ResId} = Action, Selected, _Envs) -> + trace_action_bridge("BRIDGE", Action, "bridge_action", #{}, debug), + {TraceCtx, IncCtx} = do_handle_action_get_trace_inc_metrics_context(RuleId, Action), + ReplyTo = {fun ?MODULE:inc_action_metrics/2, [IncCtx], #{reply_dropped => true}}, case - emqx_bridge:send_message(BridgeType, BridgeName, ResId, Selected, #{reply_to => ReplyTo}) + emqx_bridge:send_message(BridgeType, BridgeName, ResId, Selected, #{ + reply_to => ReplyTo, trace_ctx => TraceCtx + }) of {error, Reason} when Reason == bridge_not_found; Reason == bridge_stopped -> throw(out_of_service); @@ -386,22 +446,19 @@ do_handle_action(RuleId, {bridge, BridgeType, BridgeName, ResId}, Selected, _Env end; do_handle_action( RuleId, - {bridge_v2, BridgeType, BridgeName}, + {bridge_v2, BridgeType, BridgeName} = Action, Selected, _Envs ) -> - ?TRACE( - "BRIDGE", - "bridge_action", - #{bridge_id => {bridge_v2, BridgeType, BridgeName}} - ), - ReplyTo = {fun ?MODULE:inc_action_metrics/2, [RuleId], #{reply_dropped => true}}, + trace_action_bridge("BRIDGE", Action, "bridge_action", #{}, debug), + {TraceCtx, IncCtx} = do_handle_action_get_trace_inc_metrics_context(RuleId, Action), + ReplyTo = {fun ?MODULE:inc_action_metrics/2, [IncCtx], #{reply_dropped => true}}, case emqx_bridge_v2:send_message( BridgeType, BridgeName, Selected, - #{reply_to => ReplyTo} + #{reply_to => ReplyTo, trace_ctx => TraceCtx} ) of {error, Reason} when Reason == bridge_not_found; Reason == bridge_stopped -> @@ -412,12 +469,75 @@ do_handle_action( Result end; do_handle_action(RuleId, #{mod := Mod, func := Func} = Action, Selected, Envs) -> + trace_action(Action, "call_action_function"), %% the function can also throw 'out_of_service' Args = maps:get(args, Action, []), Result = Mod:Func(Selected, Envs, Args), - inc_action_metrics(RuleId, Result), + {_, IncCtx} = do_handle_action_get_trace_inc_metrics_context(RuleId, Action), + inc_action_metrics(IncCtx, Result), Result. +do_handle_action_get_trace_inc_metrics_context(RuleID, Action) -> + case {emqx_trace:list(), logger:get_process_metadata()} of + {[], #{stop_action_after_render := true}} -> + %% Even if there is no trace we still need to pass + %% stop_action_after_render in the trace meta data so that the + %% action will be stopped. + { + #{ + stop_action_after_render => true + }, + #{ + rule_id => RuleID, + action_id => Action + } + }; + {[], _} -> + %% As a performance/memory optimization, we don't create any trace + %% context if there are no trace patterns. + {undefined, #{ + rule_id => RuleID, + action_id => Action + }}; + {_List, TraceMeta} -> + Ctx = do_handle_action_get_trace_inc_metrics_context_unconditionally(Action, TraceMeta), + {maps:remove(action_id, Ctx), Ctx} + end. 
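%% For reference, the {TraceCtx, IncCtx} pairs produced above (ids are made
%% up, illustrative only): with no trace patterns installed the trace
%% context is undefined and only the metrics context is kept, e.g.
%%   {undefined, #{rule_id => <<"r1">>, action_id => {bridge_v2, http, <<"a1">>}}};
%% with stop_action_after_render set in the process metadata (and still no
%% traces) the trace context degenerates to #{stop_action_after_render => true}.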
+ +do_handle_action_get_trace_inc_metrics_context_unconditionally(Action, TraceMeta) -> + StopAfterRender = maps:get(stop_action_after_render, TraceMeta, false), + case TraceMeta of + #{ + rule_id := RuleID, + clientid := ClientID, + rule_trigger_time := Timestamp + } -> + #{ + rule_id => RuleID, + clientid => ClientID, + action_id => Action, + stop_action_after_render => StopAfterRender, + rule_trigger_time => Timestamp + }; + #{ + rule_id := RuleID, + rule_trigger_time := Timestamp + } -> + #{ + rule_id => RuleID, + action_id => Action, + stop_action_after_render => StopAfterRender, + rule_trigger_time => Timestamp + } + end. + +action_info({bridge, BridgeType, BridgeName, _ResId}) -> + #{type => BridgeType, name => BridgeName}; +action_info({bridge_v2, BridgeType, BridgeName}) -> + #{type => BridgeType, name => BridgeName}; +action_info(FuncInfoMap) -> + FuncInfoMap. + eval({Op, _} = Exp, Context) when is_list(Context) andalso (Op == path orelse Op == var) -> case Context of [Columns] -> @@ -596,21 +716,46 @@ nested_put(Alias, Val, Columns0) -> Columns = ensure_decoded_payload(Alias, Columns0), emqx_rule_maps:nested_put(Alias, Val, Columns). -inc_action_metrics(RuleId, Result) -> - _ = do_inc_action_metrics(RuleId, Result), +inc_action_metrics(TraceCtx, Result) -> + _ = do_inc_action_metrics(TraceCtx, Result), Result. -do_inc_action_metrics(RuleId, {error, {recoverable_error, _}}) -> - emqx_metrics_worker:inc(rule_metrics, RuleId, 'actions.failed.out_of_service'); -do_inc_action_metrics(RuleId, {error, {unrecoverable_error, _}}) -> +do_inc_action_metrics( + #{rule_id := RuleId, action_id := ActId} = TraceContext, + {error, {unrecoverable_error, {action_stopped_after_template_rendering, Explanation}} = _Reason} +) -> + TraceContext1 = maps:remove(action_id, TraceContext), + trace_action( + ActId, + "action_stopped_after_template_rendering", + maps:merge(#{reason => Explanation}, TraceContext1) + ), emqx_metrics_worker:inc(rule_metrics, RuleId, 'actions.failed'), emqx_metrics_worker:inc(rule_metrics, RuleId, 'actions.failed.unknown'); -do_inc_action_metrics(RuleId, R) -> +do_inc_action_metrics( + #{rule_id := RuleId, action_id := ActId} = TraceContext, + {error, {recoverable_error, _}} +) -> + TraceContext1 = maps:remove(action_id, TraceContext), + trace_action(ActId, "out_of_service", TraceContext1), + emqx_metrics_worker:inc(rule_metrics, RuleId, 'actions.failed.out_of_service'); +do_inc_action_metrics( + #{rule_id := RuleId, action_id := ActId} = TraceContext, + {error, {unrecoverable_error, _} = Reason} +) -> + TraceContext1 = maps:remove(action_id, TraceContext), + trace_action(ActId, "action_failed", maps:merge(#{reason => Reason}, TraceContext1)), + emqx_metrics_worker:inc(rule_metrics, RuleId, 'actions.failed'), + emqx_metrics_worker:inc(rule_metrics, RuleId, 'actions.failed.unknown'); +do_inc_action_metrics(#{rule_id := RuleId, action_id := ActId} = TraceContext, R) -> + TraceContext1 = maps:remove(action_id, TraceContext), case is_ok_result(R) of false -> + trace_action(ActId, "action_failed", maps:merge(#{reason => R}, TraceContext1)), emqx_metrics_worker:inc(rule_metrics, RuleId, 'actions.failed'), emqx_metrics_worker:inc(rule_metrics, RuleId, 'actions.failed.unknown'); true -> + trace_action(ActId, "action_success", maps:merge(#{result => R}, TraceContext1)), emqx_metrics_worker:inc(rule_metrics, RuleId, 'actions.success') end. @@ -658,3 +803,39 @@ parse_function_name(Module, Name) when is_binary(Name) -> end; parse_function_name(_Module, Name) when is_atom(Name) -> Name. 
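%% Summary of how do_inc_action_metrics/2 above maps an action result to
%% rule metrics (illustrative, derived from the clauses in this change):
%%   {error, {recoverable_error, _}}   -> 'actions.failed.out_of_service'
%%   {error, {unrecoverable_error, _}} -> 'actions.failed' + 'actions.failed.unknown'
%%   any other non-ok result           -> 'actions.failed' + 'actions.failed.unknown'
%%   ok results                        -> 'actions.success'
%% Each failure path now also emits a corresponding trace_action entry.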
+
+trace_action(ActId, Message) ->
+    trace_action_bridge("ACTION", ActId, Message).
+
+trace_action(ActId, Message, Extra) ->
+    trace_action_bridge("ACTION", ActId, Message, Extra, debug).
+
+trace_action(ActId, Message, Extra, Level) ->
+    trace_action_bridge("ACTION", ActId, Message, Extra, Level).
+
+trace_action_bridge(Tag, ActId, Message) ->
+    trace_action_bridge(Tag, ActId, Message, #{}, debug).
+
+trace_action_bridge(Tag, ActId, Message, Extra, Level) ->
+    ?TRACE(
+        Level,
+        Tag,
+        Message,
+        maps:merge(
+            #{
+                action_info => action_info(ActId)
+            },
+            Extra
+        )
+    ).
+
+trace_rule_sql(Message) ->
+    trace_rule_sql(Message, #{}, debug).
+
+trace_rule_sql(Message, Extra, Level) ->
+    ?TRACE(
+        Level,
+        "RULE_SQL_EXEC",
+        Message,
+        Extra
+    ).
diff --git a/apps/emqx_rule_engine/src/emqx_rule_sqltester.erl b/apps/emqx_rule_engine/src/emqx_rule_sqltester.erl
index 8212e3385..83f29eef3 100644
--- a/apps/emqx_rule_engine/src/emqx_rule_sqltester.erl
+++ b/apps/emqx_rule_engine/src/emqx_rule_sqltester.erl
@@ -20,9 +20,73 @@
     test/1,
     get_selected_data/3,
     %% Some SQL functions return different results in the test environment
-    is_test_runtime_env/0
+    is_test_runtime_env/0,
+    apply_rule/2
 ]).
 
+apply_rule(
+    RuleId,
+    #{
+        context := Context,
+        stop_action_after_template_rendering := StopAfterRender
+    }
+) ->
+    {ok, Rule} = emqx_rule_engine:get_rule(RuleId),
+    InTopic = get_in_topic(Context),
+    EventTopics = maps:get(from, Rule, []),
+    case lists:all(fun is_publish_topic/1, EventTopics) of
+        true ->
+            %% test if the topic matches the topic filters in the rule
+            case emqx_topic:match_any(InTopic, EventTopics) of
+                true ->
+                    do_apply_matched_rule(
+                        Rule,
+                        Context,
+                        StopAfterRender
+                    );
+                false ->
+                    {error, nomatch}
+            end;
+        false ->
+            case lists:member(InTopic, EventTopics) of
+                true ->
+                    %% the rule is for both publish and events, test it directly
+                    do_apply_matched_rule(Rule, Context, StopAfterRender);
+                false ->
+                    {error, nomatch}
+            end
+    end.
+
+do_apply_matched_rule(Rule, Context, StopAfterRender) ->
+    update_process_trace_metadata(StopAfterRender),
+    ApplyRuleRes = emqx_rule_runtime:apply_rule(
+        Rule,
+        Context,
+        apply_rule_environment()
+    ),
+    reset_trace_process_metadata(StopAfterRender),
+    ApplyRuleRes.
+
+update_process_trace_metadata(true = _StopAfterRender) ->
+    logger:update_process_metadata(#{
+        stop_action_after_render => true
+    });
+update_process_trace_metadata(false = _StopAfterRender) ->
+    ok.
+
+reset_trace_process_metadata(true = _StopAfterRender) ->
+    Meta = logger:get_process_metadata(),
+    NewMeta = maps:remove(stop_action_after_render, Meta),
+    logger:set_process_metadata(NewMeta);
+reset_trace_process_metadata(false = _StopAfterRender) ->
+    ok.
+
+%% At the time of writing, the environment passed to the apply rule function
+%% is not used at all for normal actions. When it is used for custom
+%% functions, it is first merged with the context, so there does not seem to
+%% be any need to set it to anything other than the empty map.
+apply_rule_environment() -> #{}.
+
 -spec test(#{sql := binary(), context := map()}) -> {ok, map() | list()} | {error, term()}.
 test(#{sql := Sql, context := Context}) ->
     case emqx_rule_sqlparser:parse(Sql) of
diff --git a/apps/emqx_rule_engine/test/emqx_rule_engine_SUITE.erl b/apps/emqx_rule_engine/test/emqx_rule_engine_SUITE.erl
index 76cc23c0d..b0ca00a0e 100644
--- a/apps/emqx_rule_engine/test/emqx_rule_engine_SUITE.erl
+++ b/apps/emqx_rule_engine/test/emqx_rule_engine_SUITE.erl
@@ -43,7 +43,8 @@ all() ->
         {group, metrics},
         {group, metrics_simple},
         {group, metrics_fail},
-        {group, metrics_fail_simple}
+        {group, metrics_fail_simple},
+        {group, tracing}
     ].
 
 suite() ->
@@ -142,6 +143,9 @@ groups() ->
         {metrics_fail_simple, [], [
             t_rule_metrics_sync_fail,
             t_rule_metrics_async_fail
+        ]},
+        {tracing, [], [
+            t_trace_rule_id
         ]}
     ].
 
@@ -160,7 +164,7 @@ init_per_suite(Config) ->
     Config.
 
 end_per_suite(_Config) ->
-    emqx_common_test_helpers:stop_apps([emqx_conf, emqx_rule_engine]),
+    emqx_common_test_helpers:stop_apps([emqx_conf, emqx_rule_engine, emqx_auth, emqx_bridge]),
     ok.
 
 set_special_configs(emqx_auth) ->
@@ -3632,6 +3636,111 @@ create_bridge(Type, Name, Config) ->
     {ok, _Bridge} = emqx_bridge:create(Type, Name, Config),
     emqx_bridge_resource:bridge_id(Type, Name).
 
+create_rule(Name, SQL) ->
+    Rule = emqx_rule_engine_SUITE:make_simple_rule(Name, SQL),
+    {ok, _} = emqx_rule_engine:create_rule(Rule).
+
+emqtt_client_config() ->
+    [
+        {host, "localhost"},
+        {clientid, <<"client">>},
+        {username, <<"testuser">>},
+        {password, <<"pass">>}
+    ].
+
+filesync(Name, Type) ->
+    ct:sleep(50),
+    filesync(Name, Type, 5).
+
+%% sometimes the handler process is not started yet.
+filesync(Name, Type, 0) ->
+    ct:fail("Handler process not started ~p ~p", [Name, Type]);
+filesync(Name0, Type, Retry) ->
+    Name =
+        case is_binary(Name0) of
+            true -> Name0;
+            false -> list_to_binary(Name0)
+        end,
+    try
+        Handler = binary_to_atom(<<"trace_", (atom_to_binary(Type))/binary, "_", Name/binary>>),
+        ok = logger_disk_log_h:filesync(Handler)
+    catch
+        E:R ->
+            ct:pal("Filesync error:~p ~p~n", [{Name, Type, Retry}, {E, R}]),
+            ct:sleep(100),
+            filesync(Name, Type, Retry - 1)
+    end.
+
+t_trace_rule_id(_Config) ->
+    %% Start MQTT Client
+    emqx_trace_SUITE:reload(),
+    {ok, T} = emqtt:start_link(emqtt_client_config()),
+    emqtt:connect(T),
+    %% Create rules
+    create_rule(
+        <<"test_rule_id_1">>,
+        <<"select 1 as rule_number from \"rule_1_topic\"">>
+    ),
+    create_rule(
+        <<"test_rule_id_2">>,
+        <<"select 2 as rule_number from \"rule_2_topic\"">>
+    ),
+    %% Start tracing
+    ok = emqx_trace_handler:install(
+        "CLI-RULE-1", ruleid, <<"test_rule_id_1">>, all, "tmp/rule_trace_1.log"
+    ),
+    ok = emqx_trace_handler:install(
+        "CLI-RULE-2", ruleid, <<"test_rule_id_2">>, all, "tmp/rule_trace_2.log"
+    ),
+    emqx_trace:check(),
+    ok = filesync("CLI-RULE-1", ruleid),
+    ok = filesync("CLI-RULE-2", ruleid),
+
+    %% Verify that the tracing files exist
+    ?assert(filelib:is_regular("tmp/rule_trace_1.log")),
+    ?assert(filelib:is_regular("tmp/rule_trace_2.log")),
+
+    %% Get current traces
+    ?assertMatch(
+        [
+            #{
+                type := ruleid,
+                filter := <<"test_rule_id_1">>,
+                level := debug,
+                dst := "tmp/rule_trace_1.log",
+                name := <<"CLI-RULE-1">>
+            },
+            #{
+                type := ruleid,
+                filter := <<"test_rule_id_2">>,
+                name := <<"CLI-RULE-2">>,
+                level := debug,
+                dst := "tmp/rule_trace_2.log"
+            }
+        ],
+        emqx_trace_handler:running()
+    ),
+
+    %% Trigger rule
+    emqtt:publish(T, <<"rule_1_topic">>, <<"my_traced_message">>),
+    ?retry(
+        100,
+        5,
+        begin
+            ok = filesync("CLI-RULE-1", ruleid),
+            {ok, Bin} = file:read_file("tmp/rule_trace_1.log"),
+            ?assertNotEqual(nomatch, binary:match(Bin, [<<"my_traced_message">>]))
+        end
+    ),
+    ok = filesync("CLI-RULE-2", ruleid),
+    ?assert(filelib:file_size("tmp/rule_trace_2.log") =:= 0),
+
+    %% Stop tracing
+    ok = emqx_trace_handler:uninstall(ruleid, <<"CLI-RULE-1">>),
+    ok = emqx_trace_handler:uninstall(ruleid, <<"CLI-RULE-2">>),
+    ?assertEqual([], emqx_trace_handler:running()),
+    emqtt:disconnect(T).
+
 %%------------------------------------------------------------------------------
 %% Internal helpers
 %%------------------------------------------------------------------------------
diff --git a/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_apply_SUITE.erl b/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_apply_SUITE.erl
new file mode 100644
index 000000000..52fa1a2e5
--- /dev/null
+++ b/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_apply_SUITE.erl
@@ -0,0 +1,391 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_rule_engine_api_rule_apply_SUITE).
+
+-compile(nowarn_export_all).
+-compile(export_all).
+
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+-define(CONF_DEFAULT, <<"rule_engine {rules {}}">>).
+
+all() ->
+    emqx_common_test_helpers:all(?MODULE).
+ +init_per_suite(Config) -> + application:load(emqx_conf), + AppsToStart = [ + emqx, + emqx_conf, + emqx_connector, + emqx_bridge, + emqx_bridge_http, + emqx_rule_engine, + emqx_modules + ], + %% I don't know why we need to stop the apps and then start them but if we + %% don't do this and other suites run before this suite the test cases will + %% fail as it seems like the connector silently refuses to start. + ok = emqx_cth_suite:stop(AppsToStart), + Apps = emqx_cth_suite:start( + AppsToStart, + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), + emqx_mgmt_api_test_util:init_suite(), + [{apps, Apps} | Config]. + +end_per_suite(Config) -> + Apps = ?config(apps, Config), + emqx_mgmt_api_test_util:end_suite(), + ok = emqx_cth_suite:stop(Apps), + ok. + +init_per_testcase(_Case, Config) -> + emqx_bridge_http_test_lib:init_http_success_server(Config). + +end_per_testcase(_TestCase, _Config) -> + ok = emqx_bridge_http_connector_test_server:stop(), + emqx_bridge_v2_testlib:delete_all_bridges(), + emqx_bridge_v2_testlib:delete_all_connectors(), + emqx_common_test_helpers:call_janitor(), + ok. + +t_basic_apply_rule_trace_ruleid(Config) -> + basic_apply_rule_test_helper(Config, ruleid, false). + +t_basic_apply_rule_trace_clientid(Config) -> + basic_apply_rule_test_helper(Config, clientid, false). + +t_basic_apply_rule_trace_ruleid_stop_after_render(Config) -> + basic_apply_rule_test_helper(Config, ruleid, true). + +basic_apply_rule_test_helper(Config, TraceType, StopAfterRender) -> + HTTPServerConfig = ?config(http_server, Config), + emqx_bridge_http_test_lib:make_bridge(HTTPServerConfig), + #{status := connected} = emqx_bridge_v2:health_check( + http, emqx_bridge_http_test_lib:bridge_name() + ), + %% Create Rule + RuleTopic = iolist_to_binary([<<"my_rule_topic/">>, atom_to_binary(?FUNCTION_NAME)]), + SQL = <<"SELECT payload.id as id FROM \"", RuleTopic/binary, "\"">>, + {ok, #{<<"id">> := RuleId}} = + emqx_bridge_testlib:create_rule_and_action_http( + http, + RuleTopic, + Config, + #{sql => SQL} + ), + ClientId = <<"c_emqx">>, + %% =================================== + %% Create trace for RuleId + %% =================================== + TraceName = atom_to_binary(?FUNCTION_NAME), + TraceValue = + case TraceType of + ruleid -> + RuleId; + clientid -> + ClientId + end, + create_trace(TraceName, TraceType, TraceValue), + %% =================================== + Context = #{ + clientid => ClientId, + event_type => message_publish, + payload => <<"{\"msg\": \"hello\"}">>, + qos => 1, + topic => RuleTopic, + username => <<"u_emqx">> + }, + Params = #{ + <<"context">> => Context, + <<"stop_action_after_template_rendering">> => StopAfterRender + }, + emqx_trace:check(), + ok = emqx_trace_handler_SUITE:filesync(TraceName, TraceType), + Now = erlang:system_time(second) - 10, + {ok, _} = file:read_file(emqx_trace:log_file(TraceName, Now)), + ?assertMatch({ok, _}, call_apply_rule_api(RuleId, Params)), + ?retry( + _Interval0 = 200, + _NAttempts0 = 20, + begin + Bin = read_rule_trace_file(TraceName, TraceType, Now), + io:format("THELOG:~n~s", [Bin]), + ?assertNotEqual(nomatch, binary:match(Bin, [<<"rule_activated">>])), + ?assertNotEqual(nomatch, binary:match(Bin, [<<"SQL_yielded_result">>])), + ?assertNotEqual(nomatch, binary:match(Bin, [<<"bridge_action">>])), + ?assertNotEqual(nomatch, binary:match(Bin, [<<"action_template_rendered">>])), + ?assertNotEqual(nomatch, binary:match(Bin, [<<"QUERY_ASYNC">>])) + end + ), + case StopAfterRender of + true -> + ?retry( + _Interval0 = 200, + _NAttempts0 = 20, + 
begin + Bin = read_rule_trace_file(TraceName, TraceType, Now), + io:format("THELOG2:~n~s", [Bin]), + ?assertNotEqual( + nomatch, binary:match(Bin, [<<"action_stopped_after_template_rendering">>]) + ) + end + ); + false -> + ?retry( + _Interval0 = 200, + _NAttempts0 = 20, + begin + Bin = read_rule_trace_file(TraceName, TraceType, Now), + io:format("THELOG3:~n~s", [Bin]), + ?assertNotEqual(nomatch, binary:match(Bin, [<<"action_success">>])) + end + ) + end, + %% Check that rule_trigger_time meta field is present in all log entries + Log0 = read_rule_trace_file(TraceName, TraceType, Now), + Log1 = binary:split(Log0, <<"\n">>, [global, trim]), + Log2 = lists:join(<<",\n">>, Log1), + Log3 = iolist_to_binary(["[", Log2, "]"]), + {ok, LogEntries} = emqx_utils_json:safe_decode(Log3, [return_maps]), + [#{<<"meta">> := #{<<"rule_trigger_time">> := RuleTriggerTime}} | _] = LogEntries, + [ + ?assert( + (maps:get(<<"rule_trigger_time">>, Meta, no_time) =:= RuleTriggerTime) orelse + (lists:member(RuleTriggerTime, maps:get(<<"rule_trigger_times">>, Meta, []))) + ) + || #{<<"meta">> := Meta} <- LogEntries + ], + emqx_trace:delete(TraceName), + ok. + +create_trace(TraceName, TraceType, TraceValue) -> + Now = erlang:system_time(second) - 10, + Start = Now, + End = Now + 60, + Trace = #{ + name => TraceName, + type => TraceType, + TraceType => TraceValue, + start_at => Start, + end_at => End, + formatter => json + }, + {ok, _} = emqx_trace:create(Trace). + +t_apply_rule_test_batch_separation_stop_after_render(_Config) -> + MeckOpts = [passthrough, no_link, no_history, non_strict], + catch meck:new(emqx_connector_info, MeckOpts), + meck:expect( + emqx_connector_info, + hard_coded_test_connector_info_modules, + 0, + [emqx_rule_engine_test_connector_info] + ), + emqx_connector_info:clean_cache(), + catch meck:new(emqx_action_info, MeckOpts), + meck:expect( + emqx_action_info, + hard_coded_test_action_info_modules, + 0, + [emqx_rule_engine_test_action_info] + ), + emqx_action_info:clean_cache(), + {ok, _} = emqx_connector:create(rule_engine_test, ?FUNCTION_NAME, #{}), + Name = atom_to_binary(?FUNCTION_NAME), + ActionConf = + #{ + <<"connector">> => Name, + <<"parameters">> => + #{ + <<"values">> => + #{ + <<"send_to_pid">> => emqx_utils:bin_to_hexstr( + term_to_binary(self()), upper + ) + } + }, + <<"resource_opts">> => #{ + <<"batch_size">> => 1000, + <<"batch_time">> => 500 + } + }, + {ok, _} = emqx_bridge_v2:create( + rule_engine_test, + ?FUNCTION_NAME, + ActionConf + ), + SQL = <<"SELECT payload.is_stop_after_render as stop_after_render FROM \"", Name/binary, "\"">>, + {ok, RuleID} = create_rule_with_action( + rule_engine_test, + ?FUNCTION_NAME, + SQL + ), + create_trace(Name, ruleid, RuleID), + emqx_trace:check(), + ok = emqx_trace_handler_SUITE:filesync(Name, ruleid), + Now = erlang:system_time(second) - 10, + %% Stop + ParmsStopAfterRender = apply_rule_parms(true, Name), + ParmsNoStopAfterRender = apply_rule_parms(false, Name), + %% Check that batching is working + Count = 200, + CountMsgFun = + fun + CountMsgFunRec(0 = _CurCount, GotBatchWithAtLeastTwo) -> + GotBatchWithAtLeastTwo; + CountMsgFunRec(CurCount, GotBatchWithAtLeastTwo) -> + receive + List -> + Len = length(List), + CountMsgFunRec(CurCount - Len, GotBatchWithAtLeastTwo orelse (Len > 1)) + end + end, + lists:foreach( + fun(_) -> + {ok, _} = call_apply_rule_api(RuleID, ParmsStopAfterRender) + end, + lists:seq(1, Count) + ), + %% We should get the messages and at least one batch with more than 1 + true = CountMsgFun(Count, false), + %% We should 
check that we don't get any mixed batch + CheckBatchesFun = + fun + CheckBatchesFunRec(0 = _CurCount) -> + ok; + CheckBatchesFunRec(CurCount) -> + receive + [{_, #{<<"stop_after_render">> := StopValue}} | _] = List -> + [ + ?assertMatch(#{<<"stop_after_render">> := StopValue}, Msg) + || {_, Msg} <- List + ], + Len = length(List), + CheckBatchesFunRec(CurCount - Len) + end + end, + lists:foreach( + fun(_) -> + case rand:normal() < 0 of + true -> + {ok, _} = call_apply_rule_api(RuleID, ParmsStopAfterRender); + false -> + {ok, _} = call_apply_rule_api(RuleID, ParmsNoStopAfterRender) + end + end, + lists:seq(1, Count) + ), + CheckBatchesFun(Count), + %% Just check that the log file is created as expected + ?retry( + _Interval0 = 200, + _NAttempts0 = 20, + begin + Bin = read_rule_trace_file(Name, ruleid, Now), + ?assertNotEqual(nomatch, binary:match(Bin, [<<"action_success">>])), + ?assertNotEqual( + nomatch, binary:match(Bin, [<<"action_stopped_after_template_rendering">>]) + ) + end + ), + %% Cleanup + ok = emqx_trace:delete(Name), + ok = emqx_rule_engine:delete_rule(RuleID), + ok = emqx_bridge_v2:remove(rule_engine_test, ?FUNCTION_NAME), + ok = emqx_connector:remove(rule_engine_test, ?FUNCTION_NAME), + [_, _] = meck:unload(), + ok. + +apply_rule_parms(StopAfterRender, Name) -> + Payload = #{<<"is_stop_after_render">> => StopAfterRender}, + Context = #{ + clientid => Name, + event_type => message_publish, + payload => emqx_utils_json:encode(Payload), + qos => 1, + topic => Name, + username => <<"u_emqx">> + }, + #{ + <<"context">> => Context, + <<"stop_action_after_template_rendering">> => StopAfterRender + }. + +create_rule_with_action(ActionType, ActionName, SQL) -> + BridgeId = emqx_bridge_resource:bridge_id(ActionType, ActionName), + Params = #{ + enable => true, + sql => SQL, + actions => [BridgeId] + }, + Path = emqx_mgmt_api_test_util:api_path(["rules"]), + AuthHeader = emqx_mgmt_api_test_util:auth_header_(), + ct:pal("rule action params: ~p", [Params]), + case emqx_mgmt_api_test_util:request_api(post, Path, "", AuthHeader, Params) of + {ok, Res0} -> + #{<<"id">> := RuleId} = emqx_utils_json:decode(Res0, [return_maps]), + {ok, RuleId}; + Error -> + Error + end. + +%% Helper Functions + +call_apply_rule_api(RuleId, Params) -> + Method = post, + Path = emqx_mgmt_api_test_util:api_path(["rules", RuleId, "test"]), + Res = request(Method, Path, Params), + Res. + +request(Method, Path, Params) -> + AuthHeader = emqx_mgmt_api_test_util:auth_header_(), + Opts = #{return_all => true}, + case emqx_mgmt_api_test_util:request_api(Method, Path, "", AuthHeader, Params, Opts) of + {ok, {Status, Headers, Body0}} -> + Body = maybe_json_decode(Body0), + {ok, {Status, Headers, Body}}; + {error, {Status, Headers, Body0}} -> + Body = + case emqx_utils_json:safe_decode(Body0, [return_maps]) of + {ok, Decoded0 = #{<<"message">> := Msg0}} -> + Msg = maybe_json_decode(Msg0), + Decoded0#{<<"message">> := Msg}; + {ok, Decoded0} -> + Decoded0; + {error, _} -> + Body0 + end, + {error, {Status, Headers, Body}}; + Error -> + Error + end. + +maybe_json_decode(X) -> + case emqx_utils_json:safe_decode(X, [return_maps]) of + {ok, Decoded} -> Decoded; + {error, _} -> X + end. + +read_rule_trace_file(TraceName, TraceType, From) -> + emqx_trace:check(), + ok = emqx_trace_handler_SUITE:filesync(TraceName, TraceType), + {ok, Bin} = file:read_file(emqx_trace:log_file(TraceName, From)), + Bin. 
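As a side note, here is a minimal sketch of driving the same `rules/:id/test` endpoint from outside these suites. The module name is hypothetical; it reuses only the test-utility calls that appear in the suite above, and assumes a running node with the management API started:

```erlang
-module(apply_rule_example).
-export([apply_once/3]).

%% Apply RuleId once against a synthetic message-publish context,
%% optionally stopping its actions right after template rendering.
apply_once(RuleId, Topic, StopAfterRender) ->
    Params = #{
        <<"context">> => #{
            event_type => message_publish,
            topic => Topic,
            payload => <<"{}">>,
            qos => 1,
            clientid => <<"c1">>,
            username => <<"u_emqx">>
        },
        <<"stop_action_after_template_rendering">> => StopAfterRender
    },
    Path = emqx_mgmt_api_test_util:api_path(["rules", RuleId, "test"]),
    AuthHeader = emqx_mgmt_api_test_util:auth_header_(),
    emqx_mgmt_api_test_util:request_api(post, Path, "", AuthHeader, Params).
```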
diff --git a/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_test_SUITE.erl b/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_test_SUITE.erl index 7f74cc7d7..8b47669da 100644 --- a/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_test_SUITE.erl +++ b/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_test_SUITE.erl @@ -30,11 +30,11 @@ all() -> init_per_suite(Config) -> application:load(emqx_conf), ok = emqx_common_test_helpers:load_config(emqx_rule_engine_schema, ?CONF_DEFAULT), - ok = emqx_common_test_helpers:start_apps([emqx_conf, emqx_rule_engine]), + ok = emqx_common_test_helpers:start_apps([emqx_conf, emqx_rule_engine, emqx_modules]), Config. end_per_suite(_Config) -> - emqx_common_test_helpers:stop_apps([emqx_conf, emqx_rule_engine]), + emqx_common_test_helpers:stop_apps([emqx_conf, emqx_rule_engine, emqx_modules]), ok. t_ctx_pub(_) -> diff --git a/apps/emqx_rule_engine/test/emqx_rule_engine_test_action_info.erl b/apps/emqx_rule_engine/test/emqx_rule_engine_test_action_info.erl new file mode 100644 index 000000000..91bbcb442 --- /dev/null +++ b/apps/emqx_rule_engine/test/emqx_rule_engine_test_action_info.erl @@ -0,0 +1,101 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_rule_engine_test_action_info). + +-behaviour(emqx_action_info). + +-export([ + bridge_v1_type_name/0, + action_type_name/0, + connector_type_name/0, + schema_module/0 +]). + +-export([ + namespace/0, + roots/0, + fields/1, + desc/1 +]). + +-define(CONNECTOR_TYPE, rule_engine_test). +-define(ACTION_TYPE, ?CONNECTOR_TYPE). + +bridge_v1_type_name() -> ?ACTION_TYPE. + +action_type_name() -> ?ACTION_TYPE. + +connector_type_name() -> ?ACTION_TYPE. + +schema_module() -> emqx_rule_engine_test_action_info. + +%% ------------------------------------------------------------------------------------------------- +%% Hocon Schema Definitions + +namespace() -> "bridge_test_action_info". + +roots() -> []. 
+ +fields(Field) when + Field == "get_connector"; + Field == "put_connector"; + Field == "post_connector" +-> + Fields = + fields(connector_fields) ++ + emqx_connector_schema:resource_opts_ref(?MODULE, connector_resource_opts), + emqx_connector_schema:api_fields(Field, ?CONNECTOR_TYPE, Fields); +fields(Field) when + Field == "get_bridge_v2"; + Field == "post_bridge_v2"; + Field == "put_bridge_v2" +-> + emqx_bridge_v2_schema:api_fields(Field, ?ACTION_TYPE, fields(rule_engine_test_action)); +fields(action) -> + {?ACTION_TYPE, + hoconsc:mk( + hoconsc:map(name, hoconsc:ref(?MODULE, rule_engine_test_action)), + #{ + desc => <<"Test Action Config">>, + required => false + } + )}; +fields(rule_engine_test_action) -> + emqx_bridge_v2_schema:make_producer_action_schema( + hoconsc:mk( + hoconsc:ref(?MODULE, action_parameters), + #{ + required => true, + desc => undefined + } + ) + ); +fields(action_parameters) -> + [ + {values, + hoconsc:mk( + typerefl:map(), + #{desc => undefined, default => #{}} + )} + ]; +fields("config_connector") -> + emqx_connector_schema:common_fields() ++ + fields(connector_fields) ++ + emqx_connector_schema:resource_opts_ref(?MODULE, connector_resource_opts); +fields(connector_resource_opts) -> + emqx_connector_schema:resource_opts_fields(); +fields("config") -> + emqx_resource_schema:fields("resource_opts") ++ + fields(connector_fields); +fields(connector_fields) -> + [ + {values, + hoconsc:mk( + typerefl:map(), + #{desc => undefined, default => #{}} + )} + ]. +desc(_) -> + undefined. diff --git a/apps/emqx_rule_engine/test/emqx_rule_engine_test_connector.erl b/apps/emqx_rule_engine/test/emqx_rule_engine_test_connector.erl new file mode 100644 index 000000000..c22c5fbd5 --- /dev/null +++ b/apps/emqx_rule_engine/test/emqx_rule_engine_test_connector.erl @@ -0,0 +1,100 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2020-2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- +-module(emqx_rule_engine_test_connector). + +-include_lib("emqx_connector/include/emqx_connector.hrl"). +-include_lib("typerefl/include/types.hrl"). +-include_lib("emqx/include/logger.hrl"). +-include_lib("hocon/include/hoconsc.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-behaviour(emqx_resource). + +%% callbacks of behaviour emqx_resource +-export([ + callback_mode/0, + on_start/2, + on_stop/2, + on_query/3, + on_batch_query/3, + on_get_status/2, + on_add_channel/4, + on_remove_channel/3, + on_get_channels/1, + on_get_channel_status/3 +]). + +%% =================================================================== +callback_mode() -> always_sync. + +on_start( + _InstId, + _Config +) -> + {ok, #{installed_channels => #{}}}. + +on_stop(_InstId, _State) -> + ok. 
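The suites pass the test runner's pid into this connector's config encoded as a hex string; the `on_batch_query/3` callback below decodes it and forwards batches to that pid. A shell-style round-trip sketch of that encoding, using only functions that appear in these diffs:

```erlang
%% Serialize a pid for config, then restore it.
PidBin = emqx_utils:bin_to_hexstr(term_to_binary(self()), upper),
Pid = binary_to_term(emqx_utils:hexstr_to_bin(PidBin)),
true = (Pid =:= self()).
```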
+ +on_add_channel( + _InstId, + #{ + installed_channels := InstalledChannels + } = OldState, + ChannelId, + ChannelConfig +) -> + NewInstalledChannels = maps:put(ChannelId, ChannelConfig, InstalledChannels), + NewState = OldState#{installed_channels => NewInstalledChannels}, + {ok, NewState}. + +on_remove_channel( + _InstId, + OldState, + _ChannelId +) -> + {ok, OldState}. + +on_get_channel_status( + _ResId, + _ChannelId, + _State +) -> + connected. + +on_get_channels(ResId) -> + emqx_bridge_v2:get_channels_for_connector(ResId). + +on_query( + _InstId, + _Query, + _State +) -> + ok. + +on_batch_query( + _InstId, + [{ChannelId, _Req} | _] = Msg, + #{installed_channels := Channels} = _State +) -> + #{parameters := #{values := #{send_to_pid := PidBin}}} = maps:get(ChannelId, Channels), + Pid = binary_to_term(emqx_utils:hexstr_to_bin(PidBin)), + Pid ! Msg, + emqx_trace:rendered_action_template(ChannelId, #{nothing_to_render => ok}), + ok. + +on_get_status(_InstId, _State) -> + connected. diff --git a/apps/emqx_rule_engine/test/emqx_rule_engine_test_connector_info.erl b/apps/emqx_rule_engine/test/emqx_rule_engine_test_connector_info.erl new file mode 100644 index 000000000..1c300bff8 --- /dev/null +++ b/apps/emqx_rule_engine/test/emqx_rule_engine_test_connector_info.erl @@ -0,0 +1,43 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_rule_engine_test_connector_info). + +-behaviour(emqx_connector_info). + +-export([ + type_name/0, + bridge_types/0, + resource_callback_module/0, + config_schema/0, + schema_module/0, + api_schema/1 +]). + +type_name() -> + rule_engine_test. + +bridge_types() -> + [rule_engine_test]. + +resource_callback_module() -> + emqx_rule_engine_test_connector. + +config_schema() -> + {rule_engine_test, + hoconsc:mk( + hoconsc:map(name, hoconsc:ref(emqx_rule_engine_test_action_info, "config_connector")), + #{ + desc => <<"Test Connector Config">>, + required => false + } + )}. + +schema_module() -> + emqx_rule_engine_test_action_info. + +api_schema(Method) -> + emqx_connector_schema:api_ref( + ?MODULE, <<"rule_engine_test">>, Method ++ "_connector" + ). diff --git a/apps/emqx_s3/src/emqx_s3.app.src b/apps/emqx_s3/src/emqx_s3.app.src index 965cb099d..c307f2c9c 100644 --- a/apps/emqx_s3/src/emqx_s3.app.src +++ b/apps/emqx_s3/src/emqx_s3.app.src @@ -1,6 +1,6 @@ {application, emqx_s3, [ {description, "EMQX S3"}, - {vsn, "5.0.14"}, + {vsn, "5.1.0"}, {modules, []}, {registered, [emqx_s3_sup]}, {applications, [ diff --git a/apps/emqx_s3/src/emqx_s3_schema.erl b/apps/emqx_s3/src/emqx_s3_schema.erl index ff8c632bd..1199948d0 100644 --- a/apps/emqx_s3/src/emqx_s3_schema.erl +++ b/apps/emqx_s3/src/emqx_s3_schema.erl @@ -74,7 +74,7 @@ fields(s3_upload) -> [ {bucket, mk( - string(), + emqx_schema:template_str(), #{ desc => ?DESC("bucket"), required => true @@ -82,7 +82,7 @@ fields(s3_upload) -> )}, {key, mk( - string(), + emqx_schema:template_str(), #{ desc => ?DESC("key"), required => true diff --git a/apps/emqx_utils/README.md b/apps/emqx_utils/README.md index f8c386f3d..d03b34c64 100644 --- a/apps/emqx_utils/README.md +++ b/apps/emqx_utils/README.md @@ -16,6 +16,7 @@ handling, data conversions, and more. - `emqx_utils_json`: JSON encoding and decoding - `emqx_utils_maps`: convenience functions for map lookup and manipulation like deep_get etc. 
+- `emqx_metrics`: counters, gauges, slides ## Contributing diff --git a/apps/emqx/src/emqx_metrics_worker.erl b/apps/emqx_utils/src/emqx_metrics_worker.erl similarity index 100% rename from apps/emqx/src/emqx_metrics_worker.erl rename to apps/emqx_utils/src/emqx_metrics_worker.erl diff --git a/apps/emqx_utils/src/emqx_utils_redact.erl b/apps/emqx_utils/src/emqx_utils_redact.erl index 4d3cc7f7b..c830048a9 100644 --- a/apps/emqx_utils/src/emqx_utils_redact.erl +++ b/apps/emqx_utils/src/emqx_utils_redact.erl @@ -65,8 +65,11 @@ redact(Term, Checker) -> redact_headers(Term) -> do_redact_headers(Term). -do_redact(L, Checker) when is_list(L) -> - lists:map(fun(E) -> do_redact(E, Checker) end, L); +do_redact([], _Checker) -> + []; +do_redact([X | Xs], Checker) -> + %% Note: we could be dealing with an improper list + [do_redact(X, Checker) | do_redact(Xs, Checker)]; do_redact(M, Checker) when is_map(M) -> maps:map( fun(K, V) -> @@ -252,6 +255,14 @@ redact2_test_() -> Keys = [secret, passcode], [{case_name(atom, Key), fun() -> Case(Key, Checker) end} || Key <- Keys]. +redact_improper_list_test_() -> + %% improper lists: check that we don't crash + %% may arise when we redact process states with pending `gen' requests + [ + ?_assertEqual([alias | foo], redact([alias | foo])), + ?_assertEqual([1, 2 | foo], redact([1, 2 | foo])) + ]. + deobfuscate_test() -> NewConf0 = #{foo => <<"bar0">>, password => <<"123456">>}, ?assertEqual(NewConf0, deobfuscate(NewConf0, #{foo => <<"bar">>, password => <<"654321">>})), diff --git a/apps/emqx_utils/src/emqx_variform.erl b/apps/emqx_utils/src/emqx_variform.erl new file mode 100644 index 000000000..09a673851 --- /dev/null +++ b/apps/emqx_utils/src/emqx_variform.erl @@ -0,0 +1,300 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +%% @doc This module provides a single-line expression string rendering engine. +%% A predefined set of functions are allowed to be called in the expressions. +%% Only simple string expressions are supported, and no control flow is allowed. +%% However, with the help from the functions, some control flow can be achieved. +%% For example, the `coalesce` function can be used to provide a default value, +%% or used to choose the first non-empty value from a list of variables. +-module(emqx_variform). + +-export([ + inject_allowed_module/1, + inject_allowed_modules/1, + erase_allowed_module/1, + erase_allowed_modules/1 +]). + +-export([render/2, render/3]). +-export([compile/1, decompile/1]). + +-export_type([compiled/0]). + +-type compiled() :: #{expr := string(), form := term()}. +-define(BIF_MOD, emqx_variform_bif). +-define(IS_ALLOWED_MOD(M), + (M =:= ?BIF_MOD orelse + M =:= lists orelse + M =:= maps) +). 
+
+-define(COALESCE_BADARG,
+    throw(#{
+        reason => coalesce_badarg,
+        explain =>
+            "must be an array, or a call to a function which returns an array, "
+            "for example: coalesce([a,b,c]) or coalesce(tokens(var,','))"
+    })
+).
+
+%% @doc Render a variform expression with bindings.
+%% A variform expression is a template string which supports variable substitution
+%% and function calls.
+%%
+%% The function calls are in the form of `module.function(arg1, arg2, ...)` where `module`
+%% is optional, and if not provided, the function is assumed to be in the `emqx_variform_bif` module.
+%% Both module and function must be existing atoms, and only whitelisted functions are allowed.
+%%
+%% A function arg can be a constant string or a number.
+%% Strings can be quoted with single quotes or double quotes, without support of escape characters.
+%% If some special characters are needed, the function `unescape' can be used to convert an
+%% escaped string to raw bytes.
+%% For example, to get the first line of a multi-line string, the expression can be
+%% `coalesce(tokens(variable_name, unescape("\n")))'.
+%%
+%% The bindings argument is a map of variables to their values.
+%%
+%% Inside `coalesce', an unresolved variable renders as an empty string (but never as the
+%% string "undefined"); referencing an unbound variable anywhere else returns an error.
+%% In case of a runtime exception, an error is returned as well.
+-spec render(string(), map()) -> {ok, binary()} | {error, term()}.
+render(Expression, Bindings) ->
+    render(Expression, Bindings, #{}).
+
+render(#{form := Form}, Bindings, Opts) ->
+    eval_as_string(Form, Bindings, Opts);
+render(Expression, Bindings, Opts) ->
+    case compile(Expression) of
+        {ok, Compiled} ->
+            render(Compiled, Bindings, Opts);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+eval_as_string(Expr, Bindings, _Opts) ->
+    try
+        {ok, return_str(eval(Expr, Bindings, #{}))}
+    catch
+        throw:Reason ->
+            {error, Reason};
+        C:E:S ->
+            {error, #{exception => C, reason => E, stack_trace => S}}
+    end.
+
+%% Force the expression to return a binary string.
+return_str(Str) when is_binary(Str) -> Str;
+return_str(Num) when is_integer(Num) -> integer_to_binary(Num);
+return_str(Num) when is_float(Num) -> float_to_binary(Num, [{decimals, 10}, compact]);
+return_str(Other) ->
+    throw(#{
+        reason => bad_return,
+        expected => string,
+        got => Other
+    }).
+
+%% @doc Compile a variform expression.
+-spec compile(string() | binary() | compiled()) -> {ok, compiled()} | {error, any()}.
+compile(#{form := _} = Compiled) ->
+    {ok, Compiled};
+compile(Expression) when is_binary(Expression) ->
+    compile(unicode:characters_to_list(Expression));
+compile(Expression) ->
+    case emqx_variform_scan:string(Expression) of
+        {ok, Tokens, _Line} ->
+            case emqx_variform_parser:parse(Tokens) of
+                {ok, Form} ->
+                    {ok, #{expr => Expression, form => Form}};
+                {error, {_, emqx_variform_parser, Msg}} ->
+                    %% syntax error
+                    {error, lists:flatten(Msg)};
+                {error, Reason} ->
+                    {error, Reason}
+            end;
+        {error, Reason, _Line} ->
+            {error, Reason}
+    end.
+
+decompile(#{expr := Expression}) ->
+    Expression;
+decompile(Expression) ->
+    Expression.
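A shell-style sketch of the API above, with expressions borrowed from the tests further down; `compile/1` lets a caller parse once and render against many bindings:

```erlang
{ok, Compiled} = emqx_variform:compile("coalesce([a, b])"),
{ok, <<"a">>} = emqx_variform:render(Compiled, #{a => <<"a">>, b => <<"b">>}),
{ok, <<"b">>} = emqx_variform:render(Compiled, #{b => <<"b">>}),
%% both variables unbound: coalesce falls back to an empty string
{ok, <<>>} = emqx_variform:render(Compiled, #{}).
```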
+
+eval({str, Str}, _Bindings, _Opts) ->
+    unicode:characters_to_binary(Str);
+eval({integer, Num}, _Bindings, _Opts) ->
+    Num;
+eval({float, Num}, _Bindings, _Opts) ->
+    Num;
+eval({array, Args}, Bindings, Opts) ->
+    eval_loop(Args, Bindings, Opts);
+eval({call, FuncNameStr, Args}, Bindings, Opts) ->
+    {Mod, Fun} = resolve_func_name(FuncNameStr),
+    ok = assert_func_exported(Mod, Fun, length(Args)),
+    case {Mod, Fun} of
+        {?BIF_MOD, coalesce} ->
+            eval_coalesce(Args, Bindings, Opts);
+        _ ->
+            call(Mod, Fun, eval_loop(Args, Bindings, Opts))
+    end;
+eval({var, VarName}, Bindings, Opts) ->
+    resolve_var_value(VarName, Bindings, Opts).
+
+eval_loop([], _, _) -> [];
+eval_loop([H | T], Bindings, Opts) -> [eval(H, Bindings, Opts) | eval_loop(T, Bindings, Opts)].
+
+%% coalesce treats the var_unbound exception as an empty string ''
+eval_coalesce([{array, Args}], Bindings, Opts) ->
+    NewArgs = [lists:map(fun(Arg) -> try_eval(Arg, Bindings, Opts) end, Args)],
+    call(?BIF_MOD, coalesce, NewArgs);
+eval_coalesce([Arg], Bindings, Opts) ->
+    case try_eval(Arg, Bindings, Opts) of
+        List when is_list(List) ->
+            call(?BIF_MOD, coalesce, List);
+        <<>> ->
+            <<>>;
+        _ ->
+            ?COALESCE_BADARG
+    end;
+eval_coalesce(_Args, _Bindings, _Opts) ->
+    ?COALESCE_BADARG.
+
+try_eval(Arg, Bindings, Opts) ->
+    try
+        eval(Arg, Bindings, Opts)
+    catch
+        throw:#{reason := var_unbound} ->
+            <<>>
+    end.
+
+%% Some functions accept an arbitrary number of arguments but are implemented as /1.
+call(Mod, Fun, Args) ->
+    erlang:apply(Mod, Fun, Args).
+
+resolve_func_name(FuncNameStr) ->
+    case string:tokens(FuncNameStr, ".") of
+        [Mod0, Fun0] ->
+            Mod =
+                try
+                    list_to_existing_atom(Mod0)
+                catch
+                    error:badarg ->
+                        throw(#{
+                            reason => unknown_variform_module,
+                            module => Mod0
+                        })
+                end,
+            ok = assert_module_allowed(Mod),
+            Fun =
+                try
+                    list_to_existing_atom(Fun0)
+                catch
+                    error:badarg ->
+                        throw(#{
+                            reason => unknown_variform_function,
+                            function => Fun0
+                        })
+                end,
+            {Mod, Fun};
+        [Fun] ->
+            FuncName =
+                try
+                    list_to_existing_atom(Fun)
+                catch
+                    error:badarg ->
+                        throw(#{
+                            reason => unknown_variform_function,
+                            function => Fun
+                        })
+                end,
+            {?BIF_MOD, FuncName};
+        _ ->
+            throw(#{reason => invalid_function_reference, function => FuncNameStr})
+    end.
+
+%% _Opts can be extended in the future, e.g. to render an unbound var as 'undefined'.
+resolve_var_value(VarName, Bindings, _Opts) ->
+    case emqx_template:lookup_var(split(VarName), Bindings) of
+        {ok, Value} ->
+            Value;
+        {error, _Reason} ->
+            throw(#{
+                var_name => VarName,
+                reason => var_unbound
+            })
+    end.
+
+assert_func_exported(Mod, Fun, Arity) ->
+    ok = try_load(Mod),
+    case erlang:function_exported(Mod, Fun, Arity) of
+        true ->
+            ok;
+        false ->
+            throw(#{
+                reason => unknown_variform_function,
+                module => Mod,
+                function => Fun,
+                arity => Arity
+            })
+    end.
+
+%% Best effort to load the module, because it might not be loaded as a part of the
+%% release modules, e.g. when it comes from a plugin.
+%% Do not call the code server; just try to call a function in the module.
+try_load(Mod) ->
+    try
+        _ = erlang:apply(Mod, module_info, [md5]),
+        ok
+    catch
+        _:_ ->
+            ok
+    end.
+
+assert_module_allowed(Mod) when ?IS_ALLOWED_MOD(Mod) ->
+    ok;
+assert_module_allowed(Mod) ->
+    Allowed = get_allowed_modules(),
+    case lists:member(Mod, Allowed) of
+        true ->
+            ok;
+        false ->
+            throw(#{
+                reason => unallowed_veriform_module,
+                module => Mod
+            })
+    end.
+
+inject_allowed_module(Module) when is_atom(Module) ->
+    inject_allowed_modules([Module]).
+ +inject_allowed_modules(Modules) when is_list(Modules) -> + Allowed0 = get_allowed_modules(), + Allowed = lists:usort(Allowed0 ++ Modules), + persistent_term:put({emqx_variform, allowed_modules}, Allowed). + +erase_allowed_module(Module) when is_atom(Module) -> + erase_allowed_modules([Module]). + +erase_allowed_modules(Modules) when is_list(Modules) -> + Allowed0 = get_allowed_modules(), + Allowed = Allowed0 -- Modules, + persistent_term:put({emqx_variform, allowed_modules}, Allowed). + +get_allowed_modules() -> + persistent_term:get({emqx_variform, allowed_modules}, []). + +split(VarName) -> + lists:map(fun erlang:iolist_to_binary/1, string:tokens(VarName, ".")). diff --git a/apps/emqx_utils/src/emqx_variform_bif.erl b/apps/emqx_utils/src/emqx_variform_bif.erl new file mode 100644 index 000000000..5c598efbd --- /dev/null +++ b/apps/emqx_utils/src/emqx_variform_bif.erl @@ -0,0 +1,525 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2020-2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +%% Predefined functions for variform expressions. +-module(emqx_variform_bif). + +%% String Funcs +-export([ + lower/1, + ltrim/1, + ltrim/2, + reverse/1, + rtrim/1, + rtrim/2, + strlen/1, + substr/2, + substr/3, + trim/1, + trim/2, + upper/1, + split/2, + split/3, + concat/1, + concat/2, + tokens/2, + tokens/3, + sprintf_s/2, + pad/2, + pad/3, + pad/4, + replace/3, + replace/4, + regex_match/2, + regex_replace/3, + regex_extract/2, + ascii/1, + find/2, + find/3, + join_to_string/1, + join_to_string/2, + unescape/1, + any_to_str/1 +]). + +%% Array functions +-export([nth/2]). + +%% Control functions +-export([coalesce/1, coalesce/2]). + +%% Random functions +-export([rand_str/1, rand_int/1]). + +%% Schema-less encod/decode +-export([ + bin2hexstr/1, + hexstr2bin/1, + int2hexstr/1, + base64_encode/1, + base64_decode/1 +]). + +%% Hash functions +-export([hash/2, hash_to_range/3, map_to_range/3]). + +-define(IS_EMPTY(X), (X =:= <<>> orelse X =:= "" orelse X =:= undefined)). + +%%------------------------------------------------------------------------------ +%% String Funcs +%%------------------------------------------------------------------------------ + +%% @doc Return the first non-empty string +coalesce(A, B) when ?IS_EMPTY(A) andalso ?IS_EMPTY(B) -> + <<>>; +coalesce(A, B) when ?IS_EMPTY(A) -> + B; +coalesce(A, _B) -> + A. + +%% @doc Return the first non-empty string +coalesce([]) -> + <<>>; +coalesce([H | T]) -> + coalesce(H, coalesce(T)). + +lower(S) when is_binary(S) -> + string:lowercase(S). + +ltrim(S) when is_binary(S) -> + string:trim(S, leading). + +ltrim(S, Chars) -> + string:trim(S, leading, Chars). + +reverse(S) when is_binary(S) -> + iolist_to_binary(string:reverse(S)). + +rtrim(S) when is_binary(S) -> + string:trim(S, trailing). + +rtrim(S, Chars) when is_binary(S) -> + string:trim(S, trailing, Chars). 
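The `coalesce` clauses above fold pairwise over the list, so the first non-empty value wins; a couple of direct calls as a sketch:

```erlang
%% <<>>, "" and undefined all count as empty.
<<"b">> = emqx_variform_bif:coalesce(<<>>, <<"b">>),
<<"b">> = emqx_variform_bif:coalesce([<<>>, undefined, <<"b">>]),
<<>> = emqx_variform_bif:coalesce([<<>>, <<"">>]).
```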
+ +strlen(S) when is_binary(S) -> + string:length(S). + +substr(S, Start) when is_binary(S), is_integer(Start) -> + string:slice(S, Start). + +substr(S, Start, Length) when + is_binary(S), + is_integer(Start), + is_integer(Length) +-> + string:slice(S, Start, Length). + +trim(S) when is_binary(S) -> + string:trim(S). + +trim(S, Chars) when is_binary(S) -> + string:trim(S, both, Chars). + +upper(S) when is_binary(S) -> + string:uppercase(S). + +split(S, P) when is_binary(S), is_binary(P) -> + [R || R <- string:split(S, P, all), R =/= <<>> andalso R =/= ""]. + +split(S, P, <<"notrim">>) -> + string:split(S, P, all); +split(S, P, <<"leading_notrim">>) -> + string:split(S, P, leading); +split(S, P, <<"leading">>) when is_binary(S), is_binary(P) -> + [R || R <- string:split(S, P, leading), R =/= <<>> andalso R =/= ""]; +split(S, P, <<"trailing_notrim">>) -> + string:split(S, P, trailing); +split(S, P, <<"trailing">>) when is_binary(S), is_binary(P) -> + [R || R <- string:split(S, P, trailing), R =/= <<>> andalso R =/= ""]. + +tokens(S, Separators) -> + [list_to_binary(R) || R <- string:lexemes(binary_to_list(S), binary_to_list(Separators))]. + +tokens(S, Separators, <<"nocrlf">>) -> + [ + list_to_binary(R) + || R <- string:lexemes(binary_to_list(S), binary_to_list(Separators) ++ [$\r, $\n, [$\r, $\n]]) + ]. + +%% implicit convert args to strings, and then do concatenation +concat(S1, S2) -> + concat([S1, S2]). + +%% @doc Concatenate a list of strings. +%% NOTE: it converts non-string elements to Erlang term literals for backward compatibility +concat(List) -> + unicode:characters_to_binary(lists:map(fun any_to_str/1, List), unicode). + +sprintf_s(Format, Args) when is_list(Args) -> + erlang:iolist_to_binary(io_lib:format(binary_to_list(Format), Args)). + +pad(S, Len) when is_binary(S), is_integer(Len) -> + iolist_to_binary(string:pad(S, Len, trailing)). + +pad(S, Len, <<"trailing">>) when is_binary(S), is_integer(Len) -> + iolist_to_binary(string:pad(S, Len, trailing)); +pad(S, Len, <<"both">>) when is_binary(S), is_integer(Len) -> + iolist_to_binary(string:pad(S, Len, both)); +pad(S, Len, <<"leading">>) when is_binary(S), is_integer(Len) -> + iolist_to_binary(string:pad(S, Len, leading)). + +pad(S, Len, <<"trailing">>, Char) when is_binary(S), is_integer(Len), is_binary(Char) -> + Chars = unicode:characters_to_list(Char, utf8), + iolist_to_binary(string:pad(S, Len, trailing, Chars)); +pad(S, Len, <<"both">>, Char) when is_binary(S), is_integer(Len), is_binary(Char) -> + Chars = unicode:characters_to_list(Char, utf8), + iolist_to_binary(string:pad(S, Len, both, Chars)); +pad(S, Len, <<"leading">>, Char) when is_binary(S), is_integer(Len), is_binary(Char) -> + Chars = unicode:characters_to_list(Char, utf8), + iolist_to_binary(string:pad(S, Len, leading, Chars)). + +replace(SrcStr, P, RepStr) when is_binary(SrcStr), is_binary(P), is_binary(RepStr) -> + iolist_to_binary(string:replace(SrcStr, P, RepStr, all)). + +replace(SrcStr, P, RepStr, <<"all">>) when is_binary(SrcStr), is_binary(P), is_binary(RepStr) -> + iolist_to_binary(string:replace(SrcStr, P, RepStr, all)); +replace(SrcStr, P, RepStr, <<"trailing">>) when + is_binary(SrcStr), is_binary(P), is_binary(RepStr) +-> + iolist_to_binary(string:replace(SrcStr, P, RepStr, trailing)); +replace(SrcStr, P, RepStr, <<"leading">>) when is_binary(SrcStr), is_binary(P), is_binary(RepStr) -> + iolist_to_binary(string:replace(SrcStr, P, RepStr, leading)). 
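A quick sketch of the padding and replace clauses above; the expected results are my reading of the underlying `string:pad/4` and `string:replace/4` semantics:

```erlang
%% Left-pad to a fixed width with a custom character, then swap separators.
<<"007">> = emqx_variform_bif:pad(<<"7">>, 3, <<"leading">>, <<"0">>),
<<"a-b-c">> = emqx_variform_bif:replace(<<"a,b,c">>, <<",">>, <<"-">>).
```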
+ +regex_match(Str, RE) -> + case re:run(Str, RE, [global, {capture, none}]) of + match -> true; + nomatch -> false + end. + +regex_replace(SrcStr, RE, RepStr) -> + re:replace(SrcStr, RE, RepStr, [global, {return, binary}]). + +%% @doc Searches the string Str for patterns specified by Regexp. +%% If matches are found, it returns a list of all captured groups from these matches. +%% If no matches are found or there are no groups captured, it returns an empty list. +%% This function can be used to extract parts of a string based on a regular expression, +%% excluding the complete match itself. +%% Examples: +%% ("Number: 12345", "(\\d+)") -> [<<"12345">>] +%% ("Hello, world!", "(\\w+)") -> [<<"Hello">>, <<"world">>] +%% ("No numbers here!", "(\\d+)") -> [] +%% ("Date: 2021-05-20", "(\\d{4})-(\\d{2})-(\\d{2})") -> [<<"2021">>, <<"05">>, <<"20">>] +regex_extract(Str, Regexp) -> + case re:run(Str, Regexp, [{capture, all_but_first, list}]) of + {match, [_ | _] = L} -> lists:map(fun erlang:iolist_to_binary/1, L); + _ -> [] + end. + +ascii(Char) when is_binary(Char) -> + [FirstC | _] = binary_to_list(Char), + FirstC. + +find(S, P) when is_binary(S), is_binary(P) -> + find_s(S, P, leading). + +find(S, P, <<"trailing">>) when is_binary(S), is_binary(P) -> + find_s(S, P, trailing); +find(S, P, <<"leading">>) when is_binary(S), is_binary(P) -> + find_s(S, P, leading). + +find_s(S, P, Dir) -> + case string:find(S, P, Dir) of + nomatch -> <<"">>; + SubStr -> SubStr + end. + +join_to_string(List) when is_list(List) -> + join_to_string(<<", ">>, List). + +join_to_string(Sep, List) when is_list(List), is_binary(Sep) -> + iolist_to_binary(lists:join(Sep, [any_to_str(Item) || Item <- List])). + +unescape(Bin) when is_binary(Bin) -> + UnicodeList = unicode:characters_to_list(Bin, utf8), + UnescapedUnicodeList = unescape_string(UnicodeList), + UnescapedUTF8Bin = unicode:characters_to_binary(UnescapedUnicodeList, utf32, utf8), + case UnescapedUTF8Bin of + Out when is_binary(Out) -> + Out; + Error -> + throw({invalid_unicode_character, Error}) + end. + +nth(N, List) when (is_list(N) orelse is_binary(N)) andalso is_list(List) -> + try binary_to_integer(iolist_to_binary(N)) of + N1 -> + nth(N1, List) + catch + _:_ -> + throw(#{reason => invalid_argument, func => nth, index => N}) + end; +nth(N, List) when is_integer(N) andalso is_list(List) -> + case length(List) of + L when L < N -> <<>>; + _ -> lists:nth(N, List) + end. + +unescape_string(Input) -> unescape_string(Input, []). + +unescape_string([], Acc) -> + lists:reverse(Acc); +unescape_string([$\\, $\\ | Rest], Acc) -> + unescape_string(Rest, [$\\ | Acc]); +unescape_string([$\\, $n | Rest], Acc) -> + unescape_string(Rest, [$\n | Acc]); +unescape_string([$\\, $t | Rest], Acc) -> + unescape_string(Rest, [$\t | Acc]); +unescape_string([$\\, $r | Rest], Acc) -> + unescape_string(Rest, [$\r | Acc]); +unescape_string([$\\, $b | Rest], Acc) -> + unescape_string(Rest, [$\b | Acc]); +unescape_string([$\\, $f | Rest], Acc) -> + unescape_string(Rest, [$\f | Acc]); +unescape_string([$\\, $v | Rest], Acc) -> + unescape_string(Rest, [$\v | Acc]); +unescape_string([$\\, $' | Rest], Acc) -> + unescape_string(Rest, [$\' | Acc]); +unescape_string([$\\, $" | Rest], Acc) -> + unescape_string(Rest, [$\" | Acc]); +unescape_string([$\\, $? | Rest], Acc) -> + unescape_string(Rest, [$\? 
| Acc]); +unescape_string([$\\, $a | Rest], Acc) -> + unescape_string(Rest, [$\a | Acc]); +%% Start of HEX escape code +unescape_string([$\\, $x | [$0 | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$1 | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$2 | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$3 | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$4 | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$5 | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$6 | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$7 | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$8 | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$9 | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$A | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$B | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$C | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$D | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$E | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$F | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$a | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$b | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$c | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$d | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$e | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +unescape_string([$\\, $x | [$f | _] = HexStringStart], Acc) -> + unescape_handle_hex_string(HexStringStart, Acc); +%% We treat all other escape sequences as not valid input to leave room for +%% extending the function to support more escape codes +unescape_string([$\\, X | _Rest], _Acc) -> + erlang:throw({unrecognized_escape_sequence, list_to_binary([$\\, X])}); +unescape_string([First | Rest], Acc) -> + unescape_string(Rest, [First | Acc]). + +unescape_handle_hex_string(HexStringStart, Acc) -> + {RemainingString, Num} = parse_hex_string(HexStringStart), + unescape_string(RemainingString, [Num | Acc]). + +parse_hex_string(SeqStartingWithHexDigit) -> + parse_hex_string(SeqStartingWithHexDigit, []). 
+
+parse_hex_string([], Acc) ->
+    ReversedAcc = lists:reverse(Acc),
+    {[], list_to_integer(ReversedAcc, 16)};
+parse_hex_string([First | Rest] = String, Acc) ->
+    case is_hex_digit(First) of
+        true ->
+            parse_hex_string(Rest, [First | Acc]);
+        false ->
+            ReversedAcc = lists:reverse(Acc),
+            {String, list_to_integer(ReversedAcc, 16)}
+    end.
+
+is_hex_digit($0) -> true;
+is_hex_digit($1) -> true;
+is_hex_digit($2) -> true;
+is_hex_digit($3) -> true;
+is_hex_digit($4) -> true;
+is_hex_digit($5) -> true;
+is_hex_digit($6) -> true;
+is_hex_digit($7) -> true;
+is_hex_digit($8) -> true;
+is_hex_digit($9) -> true;
+is_hex_digit($A) -> true;
+is_hex_digit($B) -> true;
+is_hex_digit($C) -> true;
+is_hex_digit($D) -> true;
+is_hex_digit($E) -> true;
+is_hex_digit($F) -> true;
+is_hex_digit($a) -> true;
+is_hex_digit($b) -> true;
+is_hex_digit($c) -> true;
+is_hex_digit($d) -> true;
+is_hex_digit($e) -> true;
+is_hex_digit($f) -> true;
+is_hex_digit(_) -> false.
+
+%%------------------------------------------------------------------------------
+%% Data Type Conversion Funcs
+%%------------------------------------------------------------------------------
+
+any_to_str(Data) ->
+    emqx_utils_conv:bin(Data).
+
+%%------------------------------------------------------------------------------
+%% Random functions
+%%------------------------------------------------------------------------------
+
+%% @doc Make a random string with urlsafe-base64 charset.
+rand_str(Length) when is_integer(Length) andalso Length > 0 ->
+    RawBytes = erlang:ceil((Length * 3) / 4),
+    RandomData = rand:bytes(RawBytes),
+    urlsafe(binary:part(base64_encode(RandomData), 0, Length));
+rand_str(_) ->
+    throw(#{reason => badarg, function => ?FUNCTION_NAME}).
+
+%% @doc Make a random integer in the range `[1, N]`.
+rand_int(N) when is_integer(N) andalso N >= 1 ->
+    rand:uniform(N);
+rand_int(N) ->
+    throw(#{reason => badarg, function => ?FUNCTION_NAME, expected => "positive integer", got => N}).
+
+%% TODO: call base64:encode(Bin, #{mode => urlsafe, padding => false})
+%% when the oldest OTP to support is 26 or newer.
+urlsafe(Str0) ->
+    Str = replace(Str0, <<"+">>, <<"-">>),
+    replace(Str, <<"/">>, <<"_">>).
+
+%%------------------------------------------------------------------------------
+%% Data encoding
+%%------------------------------------------------------------------------------
+
+%% @doc Encode an integer to hex string, e.g. 15 as 'F'.
+int2hexstr(Int) ->
+    erlang:integer_to_binary(Int, 16).
+
+%% @doc Encode bytes in hex string format.
+bin2hexstr(Bin) when is_binary(Bin) ->
+    emqx_utils:bin_to_hexstr(Bin, upper);
+%% If Bin is a bitstring which is not divisible by 8, we pad it and then do the
+%% conversion
+bin2hexstr(Bin) when is_bitstring(Bin), (8 - (bit_size(Bin) rem 8)) >= 4 ->
+    PadSize = 8 - (bit_size(Bin) rem 8),
+    Padding = <<0:PadSize>>,
+    %% Pad on the left so the numeric value is preserved, then drop the
+    %% hex digit contributed by the padding.
+    BinToConvert = <<Padding/bitstring, Bin/bitstring>>,
+    <<_FirstByte:8, HexStr/binary>> = emqx_utils:bin_to_hexstr(BinToConvert, upper),
+    HexStr;
+bin2hexstr(Bin) when is_bitstring(Bin) ->
+    PadSize = 8 - (bit_size(Bin) rem 8),
+    Padding = <<0:PadSize>>,
+    BinToConvert = <<Padding/bitstring, Bin/bitstring>>,
+    emqx_utils:bin_to_hexstr(BinToConvert, upper).
+
+%% @doc Decode hex string into its original bytes.
+hexstr2bin(Str) when is_binary(Str) ->
+    emqx_utils:hexstr_to_bin(Str).
+
+%% @doc Encode any bytes to base64.
+base64_encode(Bin) ->
+    base64:encode(Bin).
+
+%% @doc Decode base64 encoded string.
+base64_decode(Bin) ->
+    base64:decode(Bin).
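Assuming the left-padding reconstruction of `bin2hexstr/1` above (the binary construction expression was garbled in this hunk), whole bytes round-trip through hex unchanged, and an odd-sized bitstring is padded with leading zero bits:

```erlang
<<"0A0B">> = emqx_variform_bif:bin2hexstr(<<10, 11>>),
<<10, 11>> = emqx_variform_bif:hexstr2bin(<<"0A0B">>),
%% the 4-bit value 2#1010 renders as a single hex digit
<<"A">> = emqx_variform_bif:bin2hexstr(<<10:4>>).
```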
+ +%%------------------------------------------------------------------------------ +%% Hash functions +%%------------------------------------------------------------------------------ + +%% @doc Hash with all available algorithm provided by crypto module. +%% Return hex format string. +%% - md4 | md5 +%% - sha (sha1) +%% - sha224 | sha256 | sha384 | sha512 +%% - sha3_224 | sha3_256 | sha3_384 | sha3_512 +%% - shake128 | shake256 +%% - blake2b | blake2s +hash(<<"sha1">>, Bin) -> + hash(sha, Bin); +hash(Algorithm, Bin) when is_binary(Algorithm) -> + Type = + try + binary_to_existing_atom(Algorithm) + catch + _:_ -> + throw(#{ + reason => unknown_hash_algorithm, + algorithm => Algorithm + }) + end, + hash(Type, Bin); +hash(Type, Bin) when is_atom(Type) -> + %% lower is for backward compatibility + emqx_utils:bin_to_hexstr(crypto:hash(Type, Bin), lower). + +%% @doc Hash binary data to an integer within a specified range [Min, Max] +hash_to_range(Bin, Min, Max) when + is_binary(Bin) andalso + size(Bin) > 0 andalso + is_integer(Min) andalso + is_integer(Max) andalso + Min =< Max +-> + Hash = hash(sha256, Bin), + HashNum = binary_to_integer(Hash, 16), + map_to_range(HashNum, Min, Max); +hash_to_range(_, _, _) -> + throw(#{reason => badarg, function => ?FUNCTION_NAME}). + +map_to_range(Bin, Min, Max) when is_binary(Bin) andalso size(Bin) > 0 -> + HashNum = binary:decode_unsigned(Bin), + map_to_range(HashNum, Min, Max); +map_to_range(Int, Min, Max) when + is_integer(Int) andalso + is_integer(Min) andalso + is_integer(Max) andalso + Min =< Max +-> + Range = Max - Min + 1, + Min + (Int rem Range); +map_to_range(_, _, _) -> + throw(#{reason => badarg, function => ?FUNCTION_NAME}). diff --git a/apps/emqx_utils/src/emqx_variform_parser.yrl b/apps/emqx_utils/src/emqx_variform_parser.yrl new file mode 100644 index 000000000..45d92696b --- /dev/null +++ b/apps/emqx_utils/src/emqx_variform_parser.yrl @@ -0,0 +1,45 @@ +Nonterminals + expr + call_or_var + array + args + arg. + +Terminals + identifier + integer + float + string + '(' ')' + ',' '[' ']'. + +Rootsymbol + expr. + +%% Grammar Rules + +%% Root expression: function call or variable +expr -> call_or_var : '$1'. + +%% Function call or variable +call_or_var -> identifier '(' args ')' : {call, element(3, '$1'), '$3'}. +call_or_var -> identifier : {var, element(3, '$1')}. + +%% Array is like a arg list, but with square brackets +array -> '[' args ']' : {array, '$2'}. + +%% Argument handling +args -> arg : ['$1']. +args -> args ',' arg : '$1' ++ ['$3']. + +%% Arguments can be expressions, arrays, numbers, or strings +arg -> expr : '$1'. +arg -> array : '$1'. +arg -> integer: {integer, element(3, '$1')}. +arg -> float: {float, element(3, '$1')}. +arg -> string : {str, element(3, '$1')}. + +Erlang code. + +%% mute xref warning +-export([return_error/2]). diff --git a/apps/emqx_utils/src/emqx_variform_scan.xrl b/apps/emqx_utils/src/emqx_variform_scan.xrl new file mode 100644 index 000000000..63c9fba29 --- /dev/null +++ b/apps/emqx_utils/src/emqx_variform_scan.xrl @@ -0,0 +1,31 @@ +Definitions. +%% Define regular expressions for tokens +IDENTIFIER = [a-zA-Z][a-zA-Z0-9_.]* +SQ_STRING = \'[^\']*\' +DQ_STRING = \"[^\"]*\" +INTEGER = [+-]?[0-9]+ +FLOAT = [+-]?\\d+\\.\\d+ +LPAREN = \( +RPAREN = \) +LBRACKET = \[ +RBRACKET = \] +COMMA = , +WHITESPACE = [\s\t\n]+ + +Rules. +{WHITESPACE} : skip_token. +{IDENTIFIER} : {token, {identifier, TokenLine, TokenChars}}. +{SQ_STRING} : {token, {string, TokenLine, unquote(TokenChars, $')}}. 
+{DQ_STRING} : {token, {string, TokenLine, unquote(TokenChars, $")}}. +{INTEGER} : {token, {integer, TokenLine, list_to_integer(TokenChars)}}. +{FLOAT} : {token, {float, TokenLine, list_to_float(TokenChars)}}. +{LPAREN} : {token, {'(', TokenLine}}. +{RPAREN} : {token, {')', TokenLine}}. +{LBRACKET} : {token, {'[', TokenLine}}. +{RBRACKET} : {token, {']', TokenLine}}. +{COMMA} : {token, {',', TokenLine}}. + +Erlang code. + +unquote(String, Char) -> + string:trim(String, both, [Char]). diff --git a/apps/emqx/test/emqx_metrics_worker_SUITE.erl b/apps/emqx_utils/test/emqx_metrics_worker_SUITE.erl similarity index 99% rename from apps/emqx/test/emqx_metrics_worker_SUITE.erl rename to apps/emqx_utils/test/emqx_metrics_worker_SUITE.erl index 387e069cf..15866feb0 100644 --- a/apps/emqx/test/emqx_metrics_worker_SUITE.erl +++ b/apps/emqx_utils/test/emqx_metrics_worker_SUITE.erl @@ -31,18 +31,17 @@ suite() -> -define(NAME, ?MODULE). init_per_suite(Config) -> - {ok, _} = emqx_metrics_worker:start_link(?NAME), Config. end_per_suite(_Config) -> - ok = emqx_metrics_worker:stop(?NAME). + ok. init_per_testcase(_, Config) -> - ok = emqx_metrics_worker:stop(?NAME), {ok, _} = emqx_metrics_worker:start_link(?NAME), Config. end_per_testcase(_, _Config) -> + ok = emqx_metrics_worker:stop(?NAME), ok. t_get_metrics(_) -> diff --git a/apps/emqx_utils/test/emqx_variform_bif_tests.erl b/apps/emqx_utils/test/emqx_variform_bif_tests.erl new file mode 100644 index 000000000..92144ff43 --- /dev/null +++ b/apps/emqx_utils/test/emqx_variform_bif_tests.erl @@ -0,0 +1,74 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +%% Most of the functions are tested as rule-engine string funcs +-module(emqx_variform_bif_tests). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). 
+ +regex_extract_test_() -> + [ + ?_assertEqual([<<"12345">>], regex_extract("Order number: 12345", "(\\d+)")), + ?_assertEqual( + [<<"Hello">>, <<"world">>], regex_extract("Hello, world!", "(\\w+).*\s(\\w+)") + ), + ?_assertEqual([], regex_extract("No numbers here!", "(\\d+)")), + ?_assertEqual( + [<<"2021">>, <<"05">>, <<"20">>], + regex_extract("Date: 2021-05-20", "(\\d{4})-(\\d{2})-(\\d{2})") + ), + ?_assertEqual([<<"Hello">>], regex_extract("Hello, world!", "(Hello)")), + ?_assertEqual( + [<<"12">>, <<"34">>], regex_extract("Items: 12, Price: 34", "(\\d+).*\s(\\d+)") + ), + ?_assertEqual( + [<<"john.doe@example.com">>], + regex_extract("Contact: john.doe@example.com", "([\\w\\.]+@[\\w\\.]+)") + ), + ?_assertEqual([], regex_extract("Just some text, nothing more.", "([A-Z]\\d{3})")), + ?_assertEqual( + [<<"admin">>, <<"1234">>], + regex_extract("User: admin, Pass: 1234", "User: (\\w+), Pass: (\\d+)") + ), + ?_assertEqual([], regex_extract("", "(\\d+)")), + ?_assertEqual([], regex_extract("$$$###!!!", "(\\d+)")), + ?_assertEqual([<<"23.1">>], regex_extract("Erlang 23.1 version", "(\\d+\\.\\d+)")), + ?_assertEqual( + [<<"192.168.1.1">>], + regex_extract("Server IP: 192.168.1.1 at port 8080", "(\\d+\\.\\d+\\.\\d+\\.\\d+)") + ) + ]. + +regex_extract(Str, RegEx) -> + emqx_variform_bif:regex_extract(Str, RegEx). + +rand_str_test() -> + ?assertEqual(3, size(emqx_variform_bif:rand_str(3))), + ?assertThrow(#{reason := badarg}, size(emqx_variform_bif:rand_str(0))). + +rand_int_test() -> + N = emqx_variform_bif:rand_int(10), + ?assert(N =< 10 andalso N >= 1), + ?assertThrow(#{reason := badarg}, emqx_variform_bif:rand_int(0)), + ?assertThrow(#{reason := badarg}, emqx_variform_bif:rand_int(-1)). + +base64_encode_decode_test() -> + RandBytes = crypto:strong_rand_bytes(100), + Encoded = emqx_variform_bif:base64_encode(RandBytes), + ?assertEqual(RandBytes, emqx_variform_bif:base64_decode(Encoded)). diff --git a/apps/emqx_utils/test/emqx_variform_tests.erl b/apps/emqx_utils/test/emqx_variform_tests.erl new file mode 100644 index 000000000..5f9a13326 --- /dev/null +++ b/apps/emqx_utils/test/emqx_variform_tests.erl @@ -0,0 +1,220 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_variform_tests). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). + +-define(SYNTAX_ERROR, {error, "syntax error before:" ++ _}). 
+
+render_test_() ->
+    [
+        {"direct var reference", fun() -> ?assertEqual({ok, <<"1">>}, render("a", #{a => 1})) end},
+        {"concat strings", fun() ->
+            ?assertEqual({ok, <<"a,b">>}, render("concat(['a',',','b'])", #{}))
+        end},
+        {"concat empty string", fun() ->
+            ?assertEqual({ok, <<"">>}, render("concat([''])", #{}))
+        end},
+        {"tokens 1st", fun() ->
+            ?assertEqual({ok, <<"a">>}, render("nth(1,tokens(var, ','))", #{var => <<"a,b">>}))
+        end},
+        {"unknown var returns error", fun() ->
+            ?assertMatch({error, #{reason := var_unbound}}, render("var", #{}))
+        end},
+        {"out of range nth index", fun() ->
+            ?assertEqual({ok, <<>>}, render("nth(2, tokens(var, ','))", #{var => <<"a">>}))
+        end},
+        {"string for nth index", fun() ->
+            ?assertEqual({ok, <<"a">>}, render("nth('1', tokens(var, ','))", #{var => <<"a">>}))
+        end},
+        {"not an index number for nth", fun() ->
+            ?assertMatch(
+                {error, #{reason := invalid_argument, func := nth, index := <<"notnum">>}},
+                render("nth('notnum', tokens(var, ','))", #{var => <<"a">>})
+            )
+        end},
+        {"substr", fun() ->
+            ?assertMatch(
+                {ok, <<"b">>},
+                render("substr(var,1)", #{var => <<"ab">>})
+            )
+        end},
+        {"result in integer", fun() ->
+            ?assertMatch(
+                {ok, <<"2">>},
+                render("strlen(var)", #{var => <<"ab">>})
+            )
+        end},
+        {"result in float", fun() ->
+            ?assertMatch(
+                {ok, <<"2.2">>},
+                render("var", #{var => 2.2})
+            )
+        end},
+        {"concat a number", fun() ->
+            ?assertMatch(
+                {ok, <<"2.2">>},
+                render("concat(strlen(var),'.2')", #{var => <<"xy">>})
+            )
+        end},
+        {"var is an array", fun() ->
+            ?assertMatch(
+                {ok, <<"y">>},
+                render("nth(2,var)", #{var => [<<"x">>, <<"y">>]})
+            )
+        end}
+    ].
+
+unknown_func_test_() ->
+    [
+        {"unknown function", fun() ->
+            ?assertMatch(
+                {error, #{reason := unknown_variform_function}},
+                render("nonexistingatom__(a)", #{})
+            )
+        end},
+        {"unknown module", fun() ->
+            ?assertMatch(
+                {error, #{reason := unknown_variform_module}},
+                render("nonexistingatom__.nonexistingatom__(a)", #{})
+            )
+        end},
+        {"unknown function in a known module", fun() ->
+            ?assertMatch(
+                {error, #{reason := unknown_variform_function}},
+                render("emqx_variform_bif.nonexistingatom__(a)", #{})
+            )
+        end},
+        {"invalid func reference", fun() ->
+            ?assertMatch(
+                {error, #{reason := invalid_function_reference, function := "a.b.c"}},
+                render("a.b.c(var)", #{})
+            )
+        end}
+    ].
+
+concat(L) -> iolist_to_binary(L).
+
+inject_allowed_module_test() ->
+    try
+        emqx_variform:inject_allowed_module(?MODULE),
+        ?assertEqual({ok, <<"ab">>}, render(atom_to_list(?MODULE) ++ ".concat(['a','b'])", #{})),
+        ?assertMatch(
+            {error, #{
+                reason := unknown_variform_function,
+                module := ?MODULE,
+                function := concat,
+                arity := 2
+            }},
+            render(atom_to_list(?MODULE) ++ ".concat('a','b')", #{})
+        ),
+        ?assertMatch(
+            {error, #{reason := unallowed_veriform_module, module := emqx}},
+            render("emqx.concat('a','b')", #{})
+        )
+    after
+        emqx_variform:erase_allowed_module(?MODULE)
+    end.
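Since the allowlist lives in `persistent_term`, it is node-global; a hypothetical plugin could whitelist its own BIF module at startup. Here `my_plugin_bif` and `my_plugin_sup` are made-up names, and this is only a sketch of the pattern the test above exercises:

```erlang
%% Hypothetical plugin application callback: expressions may then call,
%% for example, "my_plugin_bif.device_id(var)".
start(_Type, _Args) ->
    ok = emqx_variform:inject_allowed_modules([my_plugin_bif]),
    my_plugin_sup:start_link().
```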
+ +coalesce_test_() -> + [ + {"first", fun() -> + ?assertEqual({ok, <<"a">>}, render("coalesce(['a','b'])", #{})) + end}, + {"second", fun() -> + ?assertEqual({ok, <<"b">>}, render("coalesce(['', 'b'])", #{})) + end}, + {"first var", fun() -> + ?assertEqual({ok, <<"a">>}, render("coalesce([a,b])", #{a => <<"a">>, b => <<"b">>})) + end}, + {"second var", fun() -> + ?assertEqual({ok, <<"b">>}, render("coalesce([a,b])", #{b => <<"b">>})) + end}, + {"empty", fun() -> ?assertEqual({ok, <<>>}, render("coalesce([a,b])", #{})) end}, + {"arg from other func", fun() -> + ?assertEqual({ok, <<"b">>}, render("coalesce(tokens(a,','))", #{a => <<",,b,c">>})) + end}, + {"var unbound", fun() -> ?assertEqual({ok, <<>>}, render("coalesce(a)", #{})) end}, + {"var unbound in call", fun() -> + ?assertEqual({ok, <<>>}, render("coalesce(concat(a))", #{})) + end}, + {"var unbound in calls", fun() -> + ?assertEqual({ok, <<"c">>}, render("coalesce([any_to_str(a),any_to_str(b),'c'])", #{})) + end}, + {"badarg", fun() -> + ?assertMatch( + {error, #{reason := coalesce_badarg}}, render("coalesce(a,b)", #{a => 1, b => 2}) + ) + end}, + {"badarg from return", fun() -> + ?assertMatch( + {error, #{reason := coalesce_badarg}}, render("coalesce(any_to_str(a))", #{a => 1}) + ) + end} + ]. + +syntax_error_test_() -> + [ + {"empty expression", fun() -> ?assertMatch(?SYNTAX_ERROR, render("", #{})) end}, + {"const string single quote", fun() -> ?assertMatch(?SYNTAX_ERROR, render("'a'", #{})) end}, + {"const string double quote", fun() -> + ?assertMatch(?SYNTAX_ERROR, render(<<"\"a\"">>, #{})) + end}, + {"no arity", fun() -> ?assertMatch(?SYNTAX_ERROR, render("concat()", #{})) end} + ]. + +render(Expression, Bindings) -> + emqx_variform:render(Expression, Bindings). + +hash_pick_test() -> + lists:foreach( + fun(_) -> + {ok, Res} = render("nth(hash_to_range(rand_str(10),1,5),[1,2,3,4,5])", #{}), + ?assert(Res >= <<"1">> andalso Res =< <<"5">>) + end, + lists:seq(1, 100) + ). + +map_to_range_pick_test() -> + lists:foreach( + fun(_) -> + {ok, Res} = render("nth(map_to_range(rand_str(10),1,5),[1,2,3,4,5])", #{}), + ?assert(Res >= <<"1">> andalso Res =< <<"5">>) + end, + lists:seq(1, 100) + ). + +-define(ASSERT_BADARG(FUNC, ARGS), + ?_assertEqual( + {error, #{reason => badarg, function => FUNC}}, + render(atom_to_list(FUNC) ++ ARGS, #{}) + ) +). + +to_range_badarg_test_() -> + [ + ?ASSERT_BADARG(hash_to_range, "(1,1,2)"), + ?ASSERT_BADARG(hash_to_range, "('',1,2)"), + ?ASSERT_BADARG(hash_to_range, "('a','1',2)"), + ?ASSERT_BADARG(hash_to_range, "('a',2,1)"), + ?ASSERT_BADARG(map_to_range, "('',1,2)"), + ?ASSERT_BADARG(map_to_range, "('a','1',2)"), + ?ASSERT_BADARG(map_to_range, "('a',2,1)") + ]. diff --git a/build b/build index d63260d1d..73b83a5b6 100755 --- a/build +++ b/build @@ -493,7 +493,7 @@ make_docker() { if [ "${DOCKER_PUSH:-false}" = true ]; then DOCKER_BUILDX_ARGS+=(--push) fi - if [ "${DOCKER_LOAD:-false}" = true ]; then + if [ "${DOCKER_LOAD:-true}" = true ]; then DOCKER_BUILDX_ARGS+=(--load) fi if [ -d "${REBAR_GIT_CACHE_DIR:-}" ]; then diff --git a/changes/ce/feat-12781.en.md b/changes/ce/feat-12781.en.md new file mode 100644 index 000000000..c884ccbc4 --- /dev/null +++ b/changes/ce/feat-12781.en.md @@ -0,0 +1,29 @@ +Added metrics related to EMQX durable storage to Prometheus. 
+ +New metrics: + +- `emqx_ds_egress_batches` + +- `emqx_ds_egress_batches_retry` + +- `emqx_ds_egress_batches_failed` + +- `emqx_ds_egress_messages` + +- `emqx_ds_egress_bytes` + +- `emqx_ds_egress_flush_time` + +- `emqx_ds_store_batch_time` + +- `emqx_ds_builtin_next_time` + +- `emqx_ds_storage_bitfield_lts_counter_seek` + +- `emqx_ds_storage_bitfield_lts_counter_next` + +- `emqx_ds_storage_bitfield_lts_counter_collision` + +Note: these metrics are only visible when session persistence is enabled. + +Number of persisted messages has been also added to the dashboard. diff --git a/changes/ce/feat-12798.en.md b/changes/ce/feat-12798.en.md new file mode 100644 index 000000000..a3b46f5e6 --- /dev/null +++ b/changes/ce/feat-12798.en.md @@ -0,0 +1 @@ +Added new `GET /api/v5/clients_v2` API that uses cursors instead of page numbers for pagination. This should be more efficient than the old API endpoint, which currently traverses tables multiple times. diff --git a/changes/ce/feat-12827.en.md b/changes/ce/feat-12827.en.md new file mode 100644 index 000000000..633a33d6b --- /dev/null +++ b/changes/ce/feat-12827.en.md @@ -0,0 +1 @@ +It is now possible to trace rules with a new Rule ID trace filter as well as with the Client ID filter. For testing purposes it is now also possible to use a new HTTP API endpoint (rules/:id/test) to artificially apply a rule and optionally stop its actions after they have been rendered. diff --git a/changes/ce/feat-12863.en.md b/changes/ce/feat-12863.en.md new file mode 100644 index 000000000..45bebfbd6 --- /dev/null +++ b/changes/ce/feat-12863.en.md @@ -0,0 +1 @@ +You can now format trace log entries as JSON objects by setting the formatter parameter to "json" when creating the trace pattern. diff --git a/changes/ce/feat-12750.en.md b/changes/ce/feat-12872.en.md similarity index 93% rename from changes/ce/feat-12750.en.md rename to changes/ce/feat-12872.en.md index bd7375168..dfc799bb2 100644 --- a/changes/ce/feat-12750.en.md +++ b/changes/ce/feat-12872.en.md @@ -7,8 +7,8 @@ an MQTT connection. ### Initialization of `client_attrs` -- The `client_attrs` field can be initially populated based on the configuration from one of the - following sources: +- The `client_attrs` fields can be initially populated from one of the + following `clientinfo` fields: - `cn`: The common name from the TLS client's certificate. - `dn`: The distinguished name from the TLS client's certificate, that is, the certificate "Subject". - `clientid`: The MQTT client ID provided by the client. diff --git a/changes/ce/fix-12802.en.md b/changes/ce/fix-12802.en.md new file mode 100644 index 000000000..f63603a97 --- /dev/null +++ b/changes/ce/fix-12802.en.md @@ -0,0 +1,3 @@ +Improve cluster discovery behaviour when a node is manually removed from a cluster using 'emqx ctl cluster leave' command. +Previously, if the configured cluster 'discovery_strategy' was not 'manual', the left node might re-discover and re-join the same cluster shortly after it left (unless it was stopped). +After this change, 'cluster leave' command disables automatic cluster_discovery, so that the left node won't re-join the same cluster again. Cluster discovery can be re-enabled by running 'emqx ctl discovery enable` or by restarting the left node. diff --git a/changes/ce/fix-12812.en.md b/changes/ce/fix-12812.en.md new file mode 100644 index 000000000..f530c2060 --- /dev/null +++ b/changes/ce/fix-12812.en.md @@ -0,0 +1 @@ +Made resource health checks non-blocking operations. 
This means that operations such as updating or removing a resource won't be blocked by a long-running health check. diff --git a/changes/ce/fix-12814.en.md b/changes/ce/fix-12814.en.md new file mode 100644 index 000000000..f84025561 --- /dev/null +++ b/changes/ce/fix-12814.en.md @@ -0,0 +1,4 @@ +Handle several errors in the `/clients/{clientid}/mqueue_messages` and `/clients/{clientid}/inflight_messages` APIs: + +- Internal timeout, which means that EMQX failed to get the list of Inflight/Mqueue messages within the default timeout of 5 s. This error may occur when the system is under heavy load. The API will return a 500 `{"code":"INTERNAL_ERROR","message":"timeout"}` response and log additional details. +- Client shutdown. This error may occur if the client connection is shut down during the API call. The API will return a 404 `{"code": "CLIENT_SHUTDOWN", "message": "Client connection has been shutdown"}` response in this case. diff --git a/changes/ce/fix-12824.en.md b/changes/ce/fix-12824.en.md new file mode 100644 index 000000000..01c13146d --- /dev/null +++ b/changes/ce/fix-12824.en.md @@ -0,0 +1,2 @@ +Make sure the stats `'subscribers.count'` and `'subscribers.max'` contain shared subscribers. +Previously, they only counted non-shared subscribers. diff --git a/changes/ce/fix-12826.en.md b/changes/ce/fix-12826.en.md new file mode 100644 index 000000000..28829cf87 --- /dev/null +++ b/changes/ce/fix-12826.en.md @@ -0,0 +1,6 @@ +Fixed an issue that prevented importing source data integrations and retained messages. + +Before the fix: + +- source data integrations were ignored in the backup file +- importing the `mnesia` table for retained messages was not supported diff --git a/changes/ce/fix-12830.en.md b/changes/ce/fix-12830.en.md new file mode 100644 index 000000000..5800a9bd3 --- /dev/null +++ b/changes/ce/fix-12830.en.md @@ -0,0 +1 @@ +Made channel (action/source) health checks non-blocking operations. This means that operations such as updating or removing an action/source data integration won't be blocked by a long-running health check. diff --git a/changes/ce/fix-12843.en.md b/changes/ce/fix-12843.en.md new file mode 100644 index 000000000..000026c00 --- /dev/null +++ b/changes/ce/fix-12843.en.md @@ -0,0 +1,2 @@ +Fixed the `cluster_rpc_commit` transaction ID cleanup procedure after `cluster leave` on replicant nodes. +Previously, the transaction ID of the core node would be deleted prematurely, blocking configuration updates on the core node. diff --git a/changes/ce/fix-12874.en.md b/changes/ce/fix-12874.en.md new file mode 100644 index 000000000..1a5814b07 --- /dev/null +++ b/changes/ce/fix-12874.en.md @@ -0,0 +1,7 @@ +- Ensure consistency of durable message replay when subscriptions are modified before the session reconnects + +- Persistent sessions save inflight packet IDs for received QoS 2 messages + +- Make the behavior of persistent sessions consistent with non-persistent sessions with regard to overlapping subscriptions + +- List persistent subscriptions in the REST API diff --git a/changes/ce/fix-12887.en.md b/changes/ce/fix-12887.en.md new file mode 100644 index 000000000..c25d3a320 --- /dev/null +++ b/changes/ce/fix-12887.en.md @@ -0,0 +1,2 @@ +Fix MQTT enhanced authentication with SASL SCRAM. + diff --git a/changes/ce/fix-12902.md b/changes/ce/fix-12902.md new file mode 100644 index 000000000..83409ee6d --- /dev/null +++ b/changes/ce/fix-12902.md @@ -0,0 +1 @@ +Pass the Content-Type of the MQTT message through to the Stomp message.
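As context for the `emqx_variform` test additions at the top of this diff: the new cases pin down the observable contract of `emqx_variform:render/2`, which the `client_attrs_init` expression support later in this diff builds on. A minimal Erlang shell sketch of that contract, inferred only from the assertions in the tests (the error map may carry more keys than shown; the tests match only the `reason` key):

    %% coalesce/1 takes a list and returns the first non-empty value.
    1> emqx_variform:render("coalesce(['', 'b'])", #{}).
    {ok,<<"b">>}
    %% Unbound variables render as empty binaries, so coalesce skips them:
    2> emqx_variform:render("coalesce([a,b])", #{b => <<"b">>}).
    {ok,<<"b">>}
    %% Two scalar arguments instead of a single list argument is a badarg:
    3> emqx_variform:render("coalesce(a,b)", #{a => 1, b => 2}).
    {error,#{reason => coalesce_badarg}}  %% illustrative shape; only the reason key is asserted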
diff --git a/changes/e5.6.1.en.md b/changes/e5.6.1.en.md new file mode 100644 index 000000000..3deb7466b --- /dev/null +++ b/changes/e5.6.1.en.md @@ -0,0 +1,46 @@ +# e5.6.1 + +## Bug Fixes + +- [#12759](https://github.com/emqx/emqx/pull/12759) Do not save invalid uploaded backup files. + +- [#12766](https://github.com/emqx/emqx/pull/12766) Rename `message_queue_too_long` error reason to `mailbox_overflow` + + `mailbox_overflow` is consistent with the corresponding config parameter: `force_shutdown.max_mailbox_size`. + +- [#12773](https://github.com/emqx/emqx/pull/12773) Upgrade HTTP client libraries. + + The HTTP client library (`gun-1.3`) incorrectly appends a `:portnumber` suffix to the `Host` header for + standard ports (`http` on port 80, `https` on port 443). This could cause compatibility issues with servers or + gateways performing strict `Host` header checks (e.g., AWS Lambda, Alibaba Cloud HTTP gateways), leading to + errors such as `InvalidCustomDomain.NotFound` or "The specified CustomDomain does not exist." + +- [#12802](https://github.com/emqx/emqx/pull/12802) Improve cluster discovery behaviour when a node is manually removed from a cluster using the 'emqx ctl cluster leave' command. + Previously, if the configured cluster 'discovery_strategy' was not 'manual', the left node might re-discover and re-join the same cluster shortly after it left (unless it was stopped). + After this change, the 'cluster leave' command disables automatic cluster_discovery, so that the left node won't re-join the same cluster again. Cluster discovery can be re-enabled by running 'emqx ctl discovery enable' or by restarting the left node. + +- [#12814](https://github.com/emqx/emqx/pull/12814) Handle several errors in the `/clients/{clientid}/mqueue_messages` and `/clients/{clientid}/inflight_messages` APIs: + + - Internal timeout, which means that EMQX failed to get the list of Inflight/Mqueue messages within the default timeout of 5 s. This error may occur when the system is under heavy load. The API will return a 500 `{"code":"INTERNAL_ERROR","message":"timeout"}` response and log additional details. + - Client shutdown. This error may occur if the client connection is shut down during the API call. The API will return a 404 `{"code": "CLIENT_SHUTDOWN", "message": "Client connection has been shutdown"}` response in this case. + +- [#12824](https://github.com/emqx/emqx/pull/12824) Make sure the stats `'subscribers.count'` and `'subscribers.max'` contain shared subscribers. + Previously, they only counted non-shared subscribers. + +- [#12826](https://github.com/emqx/emqx/pull/12826) Fixed an issue that prevented importing source data integrations and retained messages. + + Before the fix: + + - source data integrations were ignored in the backup file + - importing the `mnesia` table for retained messages was not supported + +- [#12843](https://github.com/emqx/emqx/pull/12843) Fixed the `cluster_rpc_commit` transaction ID cleanup procedure after `cluster leave` on replicant nodes. + Previously, the transaction ID of the core node would be deleted prematurely, blocking configuration updates on the core node. + +- [#12882](https://github.com/emqx/emqx/pull/12882) The RocketMQ action has been fixed so that the topic configuration works correctly. If more than one action used a single connector before this fix, all actions' messages were delivered to the topic that was used first.
+ +- [#12885](https://github.com/emqx/emqx/pull/12885) Fixed an issue where users were not able to see "Retained Messages" under the "Monitoring" menu in the admin dashboard. + +The "Retained messages" backend API uses `qlc`, and `qlc` uses `file_sorter`, which puts temporary files in the working directory by default; that directory is not writable by the emqx user since 58d0f04. + +This patch fixes the issue by making the `/opt/emqx` directory owned by `emqx:emqx`. diff --git a/changes/ee/feat-12898.en.md b/changes/ee/feat-12898.en.md new file mode 100644 index 000000000..67e5ea965 --- /dev/null +++ b/changes/ee/feat-12898.en.md @@ -0,0 +1 @@ +Added IoTDB bridge support for IoTDB 1.3.0 and batch insert (`batch_size`/`batch_time`) options. diff --git a/changes/ee/feat-12899.en.md b/changes/ee/feat-12899.en.md new file mode 100644 index 000000000..8d5b62bcc --- /dev/null +++ b/changes/ee/feat-12899.en.md @@ -0,0 +1 @@ +Added support for namespace and key dispatch strategy options. diff --git a/changes/ee/fix-12871.en.md b/changes/ee/fix-12871.en.md new file mode 100644 index 000000000..5b7520645 --- /dev/null +++ b/changes/ee/fix-12871.en.md @@ -0,0 +1 @@ +Fix the startup process of an evacuated node. Previously, if a node was evacuated and stopped without stopping the evacuation, it would fail to start again. diff --git a/changes/ee/fix-12882.en.md b/changes/ee/fix-12882.en.md new file mode 100644 index 000000000..804665fef --- /dev/null +++ b/changes/ee/fix-12882.en.md @@ -0,0 +1 @@ +The RocketMQ action has been fixed so that the topic configuration works correctly. If more than one action used a single connector before this fix, all actions' messages were delivered to the topic that was used first. diff --git a/changes/ee/fix-12888.en.md b/changes/ee/fix-12888.en.md new file mode 100644 index 000000000..98b42065c --- /dev/null +++ b/changes/ee/fix-12888.en.md @@ -0,0 +1 @@ +Fix license-related configuration loss after importing backup data. diff --git a/changes/ee/fix-12892.md b/changes/ee/fix-12892.md new file mode 100644 index 000000000..45fd5c825 --- /dev/null +++ b/changes/ee/fix-12892.md @@ -0,0 +1,3 @@ +Fix an error in the OCPP gateway's handling of downstream BootNotification. + +Fix the `gateways/ocpp/listeners` endpoint to return the correct number of current connections. diff --git a/changes/ee/fix-12895.en.md b/changes/ee/fix-12895.en.md new file mode 100644 index 000000000..dbfd52e2b --- /dev/null +++ b/changes/ee/fix-12895.en.md @@ -0,0 +1,6 @@ +Added some necessary but previously missing keys for the DynamoDB connector and action. + +## Breaking changes +* The old configuration no longer works, although it actually didn't work properly until this fix. +* For the DynamoDB connector, a new key `region` is required. +* `hash_key` and `range_key` are now supported in the DynamoDB action, and `hash_key` is required. diff --git a/changes/ee/fix-12909.en.md b/changes/ee/fix-12909.en.md new file mode 100644 index 000000000..c400abcff --- /dev/null +++ b/changes/ee/fix-12909.en.md @@ -0,0 +1 @@ +Fixed UDP listener process handling on errors or closure. The fix ensures the UDP listener is cleanly stopped and restarted as needed when these error conditions occur. diff --git a/changes/feat-12833.en.md b/changes/feat-12833.en.md new file mode 100644 index 000000000..ef1d2fb30 --- /dev/null +++ b/changes/feat-12833.en.md @@ -0,0 +1,16 @@ +Added REST API endpoints and CLI commands for durable storage management.
+ +New REST endpoints: + +- `/ds/sites` +- `/ds/sites/:site` +- `/ds/storages` +- `/ds/storages/:ds` +- `/ds/storages/:ds/replicas` +- `/ds/storages/:ds/replicas/:site` + +New CLI commands: + +- `ds set_replicas` +- `ds join` +- `ds leave` diff --git a/changes/fix-12844.en.md b/changes/fix-12844.en.md new file mode 100644 index 000000000..851c877ac --- /dev/null +++ b/changes/fix-12844.en.md @@ -0,0 +1 @@ +CPU usage/idle statistics values are now retained with only 2 decimal places of precision. This affects Prometheus statistical metrics and OpenTelemetry governance metrics. diff --git a/changes/fix-12855.en.md b/changes/fix-12855.en.md new file mode 100644 index 000000000..422008243 --- /dev/null +++ b/changes/fix-12855.en.md @@ -0,0 +1,2 @@ +Fixed an issue where, when a client subscribed or unsubscribed to a shared topic, the system topic messages for the client subscribed/unsubscribed notifications could not be serialized correctly. +Fixed the `$queue` shared topic format error in the `/topics` endpoint. diff --git a/changes/fix-12880.en.md b/changes/fix-12880.en.md new file mode 100644 index 000000000..7d7a53777 --- /dev/null +++ b/changes/fix-12880.en.md @@ -0,0 +1,3 @@ +Fixed the issue where serialization failed when the value in the tag set used a literal value (int or float) in the InfluxDB action configuration. + +A Tag Set value's type is always `String`. See also: [Line Protocol - Tag Set](https://docs.influxdata.com/influxdb/v2/reference/syntax/line-protocol/#tag-set) diff --git a/changes/v5.6.1.en.md b/changes/v5.6.1.en.md new file mode 100644 index 000000000..e33af057e --- /dev/null +++ b/changes/v5.6.1.en.md @@ -0,0 +1,44 @@ +# v5.6.1 + +## Bug Fixes + +- [#12759](https://github.com/emqx/emqx/pull/12759) Do not save invalid uploaded backup files. + +- [#12766](https://github.com/emqx/emqx/pull/12766) Rename `message_queue_too_long` error reason to `mailbox_overflow` + + `mailbox_overflow` is consistent with the corresponding config parameter: `force_shutdown.max_mailbox_size`. + +- [#12773](https://github.com/emqx/emqx/pull/12773) Upgrade HTTP client libraries. + + The HTTP client library (`gun-1.3`) incorrectly appends a `:portnumber` suffix to the `Host` header for + standard ports (`http` on port 80, `https` on port 443). This could cause compatibility issues with servers or + gateways performing strict `Host` header checks (e.g., AWS Lambda, Alibaba Cloud HTTP gateways), leading to + errors such as `InvalidCustomDomain.NotFound` or "The specified CustomDomain does not exist." + +- [#12802](https://github.com/emqx/emqx/pull/12802) Improve cluster discovery behaviour when a node is manually removed from a cluster using the 'emqx ctl cluster leave' command. + Previously, if the configured cluster 'discovery_strategy' was not 'manual', the left node might re-discover and re-join the same cluster shortly after it left (unless it was stopped). + After this change, the 'cluster leave' command disables automatic cluster_discovery, so that the left node won't re-join the same cluster again. Cluster discovery can be re-enabled by running 'emqx ctl discovery enable' or by restarting the left node. + +- [#12814](https://github.com/emqx/emqx/pull/12814) Handle several errors in the `/clients/{clientid}/mqueue_messages` and `/clients/{clientid}/inflight_messages` APIs: + + - Internal timeout, which means that EMQX failed to get the list of Inflight/Mqueue messages within the default timeout of 5 s. This error may occur when the system is under heavy load.
The API will return a 500 `{"code":"INTERNAL_ERROR","message":"timeout"}` response and log additional details. + - Client shutdown. This error may occur if the client connection is shut down during the API call. The API will return a 404 `{"code": "CLIENT_SHUTDOWN", "message": "Client connection has been shutdown"}` response in this case. + +- [#12824](https://github.com/emqx/emqx/pull/12824) Make sure the stats `'subscribers.count'` and `'subscribers.max'` contain shared subscribers. + Previously, they only counted non-shared subscribers. + +- [#12826](https://github.com/emqx/emqx/pull/12826) Fixed an issue that prevented importing source data integrations and retained messages. + + Before the fix: + + - source data integrations were ignored in the backup file + - importing the `mnesia` table for retained messages was not supported + +- [#12843](https://github.com/emqx/emqx/pull/12843) Fixed the `cluster_rpc_commit` transaction ID cleanup procedure after `cluster leave` on replicant nodes. + Previously, the transaction ID of the core node would be deleted prematurely, blocking configuration updates on the core node. + +- [#12885](https://github.com/emqx/emqx/pull/12885) Fixed an issue where users were not able to see "Retained Messages" under the "Monitoring" menu in the admin dashboard. + +The "Retained messages" backend API uses `qlc`, and `qlc` uses `file_sorter`, which puts temporary files in the working directory by default; that directory is not writable by the emqx user since 58d0f04. + +This patch fixes the issue by making the `/opt/emqx` directory owned by `emqx:emqx`. diff --git a/deploy/charts/emqx/Chart.yaml b/deploy/charts/emqx/Chart.yaml index 60498d5d7..e4c15c7f7 100644 --- a/deploy/charts/emqx/Chart.yaml +++ b/deploy/charts/emqx/Chart.yaml @@ -14,8 +14,8 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. -version: 5.7.0-alpha.1 +version: 5.6.1 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application.
-appVersion: 5.7.0-alpha.1 +appVersion: 5.6.1 diff --git a/deploy/docker/Dockerfile b/deploy/docker/Dockerfile index d43b4a19f..ea7bb27cc 100644 --- a/deploy/docker/Dockerfile +++ b/deploy/docker/Dockerfile @@ -47,18 +47,19 @@ ENV LC_ALL=C.UTF-8 ENV LANG=C.UTF-8 COPY deploy/docker/docker-entrypoint.sh /usr/bin/ -COPY --from=builder /emqx-rel /opt/ RUN set -eu; \ apt-get update; \ apt-get install -y --no-install-recommends ca-certificates procps $(echo "${EXTRA_DEPS}" | tr ',' ' '); \ rm -rf /var/lib/apt/lists/*; \ - find /opt/emqx -name 'swagger*.js.map' -exec rm {} +; \ - ln -s /opt/emqx/bin/* /usr/local/bin/; \ groupadd -r -g 1000 emqx; \ - useradd -r -m -u 1000 -g emqx emqx; \ - mkdir -p /opt/emqx/log /opt/emqx/data /opt/emqx/plugins; \ - chown -R emqx:emqx /opt/emqx/log /opt/emqx/data /opt/emqx/plugins + useradd -r -m -u 1000 -g emqx emqx; + +COPY --from=builder --chown=emqx:emqx /emqx-rel /opt/ + +RUN set -eu; \ + find /opt/emqx -name 'swagger*.js.map' -exec rm {} +; \ + ln -s /opt/emqx/bin/* /usr/local/bin/; WORKDIR /opt/emqx diff --git a/mix.exs b/mix.exs index 486484f72..bbb11cc52 100644 --- a/mix.exs +++ b/mix.exs @@ -53,14 +53,14 @@ defmodule EMQXUmbrella.MixProject do {:gproc, github: "emqx/gproc", tag: "0.9.0.1", override: true}, {:jiffy, github: "emqx/jiffy", tag: "1.0.6", override: true}, {:cowboy, github: "emqx/cowboy", tag: "2.9.2", override: true}, - {:esockd, github: "emqx/esockd", tag: "5.11.1", override: true}, + {:esockd, github: "emqx/esockd", tag: "5.11.2", override: true}, {:rocksdb, github: "emqx/erlang-rocksdb", tag: "1.8.0-emqx-2", override: true}, - {:ekka, github: "emqx/ekka", tag: "0.19.1", override: true}, + {:ekka, github: "emqx/ekka", tag: "0.19.3", override: true}, {:gen_rpc, github: "emqx/gen_rpc", tag: "3.3.1", override: true}, {:grpc, github: "emqx/grpc-erl", tag: "0.6.12", override: true}, {:minirest, github: "emqx/minirest", tag: "1.4.0", override: true}, {:ecpool, github: "emqx/ecpool", tag: "0.5.7", override: true}, - {:replayq, github: "emqx/replayq", tag: "0.3.7", override: true}, + {:replayq, github: "emqx/replayq", tag: "0.3.8", override: true}, {:pbkdf2, github: "emqx/erlang-pbkdf2", tag: "2.0.4", override: true}, # maybe forbid to fetch quicer {:emqtt, @@ -74,7 +74,7 @@ defmodule EMQXUmbrella.MixProject do {:snabbkaffe, github: "kafka4beam/snabbkaffe", tag: "1.0.8", override: true}, {:hocon, github: "emqx/hocon", tag: "0.42.1", override: true}, {:emqx_http_lib, github: "emqx/emqx_http_lib", tag: "0.5.3", override: true}, - {:esasl, github: "emqx/esasl", tag: "0.2.0"}, + {:esasl, github: "emqx/esasl", tag: "0.2.1"}, {:jose, github: "potatosalad/erlang-jose", tag: "1.11.2"}, # in conflict by ehttpc and emqtt {:gun, github: "emqx/gun", tag: "1.3.11", override: true}, @@ -213,7 +213,8 @@ defmodule EMQXUmbrella.MixProject do {:crc32cer, "0.1.8", override: true}, {:supervisor3, "1.1.12", override: true}, {:opentsdb, github: "emqx/opentsdb-client-erl", tag: "v0.5.1", override: true}, - {:greptimedb, github: "GreptimeTeam/greptimedb-client-erl", tag: "v0.1.7", override: true}, + {:greptimedb, + github: "GreptimeTeam/greptimedb-ingester-erl", tag: "v0.1.8", override: true}, # The following two are dependencies of rabbit_common. 
They are needed here to # make mix not complain about conflicting versions {:thoas, github: "emqx/thoas", tag: "v1.0.0", override: true}, @@ -331,7 +332,9 @@ defmodule EMQXUmbrella.MixProject do :emqx_s3, :emqx_opentelemetry, :emqx_durable_storage, - :rabbit_common + :rabbit_common, + :emqx_eviction_agent, + :emqx_node_rebalance ], steps: steps, strip_beams: false diff --git a/rebar.config b/rebar.config index 8bbe178aa..e0f88893c 100644 --- a/rebar.config +++ b/rebar.config @@ -81,14 +81,14 @@ {gproc, {git, "https://github.com/emqx/gproc", {tag, "0.9.0.1"}}}, {jiffy, {git, "https://github.com/emqx/jiffy", {tag, "1.0.6"}}}, {cowboy, {git, "https://github.com/emqx/cowboy", {tag, "2.9.2"}}}, - {esockd, {git, "https://github.com/emqx/esockd", {tag, "5.11.1"}}}, + {esockd, {git, "https://github.com/emqx/esockd", {tag, "5.11.2"}}}, {rocksdb, {git, "https://github.com/emqx/erlang-rocksdb", {tag, "1.8.0-emqx-2"}}}, - {ekka, {git, "https://github.com/emqx/ekka", {tag, "0.19.1"}}}, + {ekka, {git, "https://github.com/emqx/ekka", {tag, "0.19.3"}}}, {gen_rpc, {git, "https://github.com/emqx/gen_rpc", {tag, "3.3.1"}}}, {grpc, {git, "https://github.com/emqx/grpc-erl", {tag, "0.6.12"}}}, {minirest, {git, "https://github.com/emqx/minirest", {tag, "1.4.0"}}}, {ecpool, {git, "https://github.com/emqx/ecpool", {tag, "0.5.7"}}}, - {replayq, {git, "https://github.com/emqx/replayq.git", {tag, "0.3.7"}}}, + {replayq, {git, "https://github.com/emqx/replayq.git", {tag, "0.3.8"}}}, {pbkdf2, {git, "https://github.com/emqx/erlang-pbkdf2.git", {tag, "2.0.4"}}}, {emqtt, {git, "https://github.com/emqx/emqtt", {tag, "1.10.1"}}}, {rulesql, {git, "https://github.com/emqx/rulesql", {tag, "0.2.0"}}}, @@ -99,7 +99,7 @@ {snabbkaffe, {git, "https://github.com/kafka4beam/snabbkaffe.git", {tag, "1.0.8"}}}, {hocon, {git, "https://github.com/emqx/hocon.git", {tag, "0.42.1"}}}, {emqx_http_lib, {git, "https://github.com/emqx/emqx_http_lib.git", {tag, "0.5.3"}}}, - {esasl, {git, "https://github.com/emqx/esasl", {tag, "0.2.0"}}}, + {esasl, {git, "https://github.com/emqx/esasl", {tag, "0.2.1"}}}, {jose, {git, "https://github.com/potatosalad/erlang-jose", {tag, "1.11.2"}}}, {telemetry, "1.1.0"}, {hackney, {git, "https://github.com/emqx/hackney.git", {tag, "1.18.1-1"}}}, diff --git a/rebar.config.erl b/rebar.config.erl index a81d162a9..35f76b187 100644 --- a/rebar.config.erl +++ b/rebar.config.erl @@ -116,6 +116,8 @@ is_community_umbrella_app("apps/emqx_gateway_ocpp") -> false; is_community_umbrella_app("apps/emqx_gateway_jt808") -> false; is_community_umbrella_app("apps/emqx_bridge_syskeeper") -> false; is_community_umbrella_app("apps/emqx_message_validation") -> false; +is_community_umbrella_app("apps/emqx_eviction_agent") -> false; +is_community_umbrella_app("apps/emqx_node_rebalance") -> false; is_community_umbrella_app(_) -> true. 
%% BUILD_WITHOUT_JQ diff --git a/rel/i18n/emqx_bridge_dynamo.hocon b/rel/i18n/emqx_bridge_dynamo.hocon index 0d3bcd3f9..31771832a 100644 --- a/rel/i18n/emqx_bridge_dynamo.hocon +++ b/rel/i18n/emqx_bridge_dynamo.hocon @@ -60,4 +60,9 @@ config_connector.desc: config_connector.label: """DynamoDB Connector Configuration""" +hash_key.desc: +"""DynamoDB Hash Key""" + +range_key.desc: +"""DynamoDB Range Key""" } diff --git a/rel/i18n/emqx_bridge_dynamo_connector.hocon b/rel/i18n/emqx_bridge_dynamo_connector.hocon index 7c37676b5..18c3670aa 100644 --- a/rel/i18n/emqx_bridge_dynamo_connector.hocon +++ b/rel/i18n/emqx_bridge_dynamo_connector.hocon @@ -18,6 +18,9 @@ table.desc: table.label: """Table """ +region.desc: +"""The AWS region of the DynamoDB service.""" + url.desc: """The url of DynamoDB endpoint.""" diff --git a/rel/i18n/emqx_bridge_pulsar.hocon b/rel/i18n/emqx_bridge_pulsar.hocon index 913ab8d2a..56db961c3 100644 --- a/rel/i18n/emqx_bridge_pulsar.hocon +++ b/rel/i18n/emqx_bridge_pulsar.hocon @@ -13,15 +13,15 @@ connector_resource_opts.label: auth_basic.desc: """Parameters for basic authentication.""" auth_basic.label: -"""Basic auth params""" +"""Basic auth parameters""" auth_basic_password.desc: -"""Basic authentication password.""" +"""Basic authentication password. The `password` part of the `username:password` authentication string.""" auth_basic_password.label: """Password""" auth_basic_username.desc: -"""Basic authentication username.""" +"""Basic authentication username. The `username` part of the `username:password` authentication string.""" auth_basic_username.label: """Username""" diff --git a/rel/i18n/emqx_bridge_rocketmq.hocon b/rel/i18n/emqx_bridge_rocketmq.hocon index b6bb3aad6..fe6ca8c8d 100644 --- a/rel/i18n/emqx_bridge_rocketmq.hocon +++ b/rel/i18n/emqx_bridge_rocketmq.hocon @@ -59,4 +59,7 @@ config_connector.desc: config_connector.label: """RocketMQ Client Configuration""" +strategy.desc: +"""Producer key dispatch strategy. The default is `roundrobin`; placeholders such as `clientid`, `messageid`, and `username` are also supported.""" + } diff --git a/rel/i18n/emqx_bridge_rocketmq_connector.hocon b/rel/i18n/emqx_bridge_rocketmq_connector.hocon index b13e015c2..b65ce5405 100644 --- a/rel/i18n/emqx_bridge_rocketmq_connector.hocon +++ b/rel/i18n/emqx_bridge_rocketmq_connector.hocon @@ -50,4 +50,10 @@ topic.desc: topic.label: """RocketMQ Topic""" +namespace.desc: +"""The namespace field MUST be set if you are using the RocketMQ service in +Aliyun Cloud with namespaces enabled, +or if you have configured a namespace in your RocketMQ server. +For RocketMQ in Aliyun Cloud, the namespace is the instance ID.""" + } diff --git a/rel/i18n/emqx_conf_schema_types.hocon b/rel/i18n/emqx_conf_schema_types.hocon index 6b9dac9ea..f9eefbe1d 100644 --- a/rel/i18n/emqx_conf_schema_types.hocon +++ b/rel/i18n/emqx_conf_schema_types.hocon @@ -9,4 +9,7 @@ emqx_conf_schema_types { secret.desc: """A string holding some sensitive information, such as a password. When secret starts with file://, the rest of the string is interpreted as a path to a file containing the secret itself: the whole content of the file except any trailing whitespace characters is considered a secret value.
Note: when clustered, all EMQX nodes should have the same file present before using file:// secrets.""" + template.desc: """~ + A string for `${.path.to.var}` style value interpolation, + where the leading dot is optional, and `${.}` represents all values as an object.""" } diff --git a/rel/i18n/emqx_license_schema.hocon b/rel/i18n/emqx_license_schema.hocon index 72f31266b..e3d418029 100644 --- a/rel/i18n/emqx_license_schema.hocon +++ b/rel/i18n/emqx_license_schema.hocon @@ -12,17 +12,12 @@ connection_low_watermark_field.desc: connection_low_watermark_field.label: """Connection low watermark""" -connection_high_watermark_field_deprecated.desc: -"""deprecated use /license/setting instead""" - -connection_high_watermark_field_deprecated.label: -"""deprecated use /license/setting instead""" - -connection_low_watermark_field_deprecated.desc: -"""deprecated use /license/setting instead""" - -connection_low_watermark_field_deprecated.label: -"""deprecated use /license/setting instead""" +dynamic_max_connections { + label: "Dynamic Connections Limit" + desc: """~ + Only applicable to the "Business Critical" license type. This config sets the license allocation for the current cluster. + This value cannot exceed the connections limit assigned in the license key.""" +} key_field.desc: """This configuration parameter is designated for the license key and supports below input formats: @@ -43,7 +38,7 @@ license_root.desc: """Defines the EMQX Enterprise license. EMQX Enterprise is initially provided with a default trial license. -This license, issued in December 2023, is valid for a period of 5 years. +This license, issued in April 2024, is valid for a period of 5 years. It supports up to 25 concurrent connections, catering to early-stage development and testing needs. For deploying EMQX Enterprise in a production environment, a different license is required. You can apply for a production license by visiting https://www.emqx.com/apply-licenses/emqx?version=5""" diff --git a/rel/i18n/emqx_mgmt_api_trace.hocon b/rel/i18n/emqx_mgmt_api_trace.hocon index 67462ab43..13d814c21 100644 --- a/rel/i18n/emqx_mgmt_api_trace.hocon +++ b/rel/i18n/emqx_mgmt_api_trace.hocon @@ -80,6 +80,11 @@ client_ip_addess.desc: client_ip_addess.label: """Client IP Address""" +ruleid.desc: +"""Specify the Rule ID if the trace type is 'ruleid'.""" +ruleid.label: +"""Rule ID""" + trace_status.desc: """trace status""" trace_status.label: @@ -110,4 +115,9 @@ current_trace_offset.desc: current_trace_offset.label: """Offset from the current trace position.""" +trace_log_formatter.desc: +"""The formatter that will be used to format the trace log entries. Set this to `text` to format the log entries as plain text (the default).
Set it to `json` to format each log entry as a JSON object.""" +trace_log_formatter.label: +"""Trace Log Entry Formatter""" + } diff --git a/rel/i18n/emqx_rule_api_schema.hocon b/rel/i18n/emqx_rule_api_schema.hocon index 0289f53ab..68c6a560d 100644 --- a/rel/i18n/emqx_rule_api_schema.hocon +++ b/rel/i18n/emqx_rule_api_schema.hocon @@ -66,6 +66,12 @@ test_context.desc: test_context.label: """Event Context""" +stop_action_after_template_render.desc: +"""Set this to `true` if the action should be stopped after its template has been rendered (default is `true`).""" + +stop_action_after_template_render.label: +"""Stop Action After Template Rendering""" + node_node.desc: """The node name""" diff --git a/rel/i18n/emqx_rule_engine_api.hocon b/rel/i18n/emqx_rule_engine_api.hocon index 385b71ddc..0745a108d 100644 --- a/rel/i18n/emqx_rule_engine_api.hocon +++ b/rel/i18n/emqx_rule_engine_api.hocon @@ -90,4 +90,10 @@ api9.desc: api9.label: """Get configuration""" +api11.desc: +"""Apply a rule with the given message and environment""" + +api11.label: +"""Apply Rule""" + } diff --git a/rel/i18n/emqx_schema.hocon b/rel/i18n/emqx_schema.hocon index 0bd8c74d5..cb504694c 100644 --- a/rel/i18n/emqx_schema.hocon +++ b/rel/i18n/emqx_schema.hocon @@ -1575,47 +1575,37 @@ client_attrs_init { label: "Client Attributes Initialization" desc: """~ Specify how to initialize client attributes. - One initial client attribute can be initialized as `client_attrs.NAME`, - where `NAME` is the name of the attribute specified in the config `extract_as`. + Each client attribute can be initialized as `client_attrs.{NAME}`, + where `{NAME}` is the name of the attribute specified in the config field `set_as_attr`. The initialized client attribute will be stored in the `client_attrs` property with the specified name, and can be used as a placeholder in a template for authentication and authorization. - For example, use `${client_attrs.alias}` to render an HTTP POST body when `extract_as = alias`, + For example, use `${client_attrs.alias}` to render an HTTP POST body when `set_as_attr = alias`, or render listener config `mountpoint = devices/${client_attrs.alias}/` to initialize a per-client topic namespace.""" } -client_attrs_init_extract_from { - label: "Client Property to Extract Attribute" - desc: """~ - Specify from which client property the client attribute should be extracted. - - Supported values: - - `clientid`: Extract from the client ID. - - `username`: Extract from the username. - - `cn`: Extract from the Common Name (CN) field of the client certificate. - - `dn`: Extract from the Distinguished Name (DN) field of the client certificate. - - `user_property`: Extract from the user property sent in the MQTT v5 `CONNECT` packet. - In this case, `extract_regexp` is not applicable, and `extract_as` should be the user property key. - - NOTE: this extraction happens **after** `clientid` or `username` is initialized - from `peer_cert_as_clientid` or `peer_cert_as_username` config.""" -} - -client_attrs_init_extract_regexp { +client_attrs_init_expression { label: "Client Attribute Extraction Expression" desc: """~ - The regular expression to extract a client attribute from the client property specified by `client_attrs_init.extract_from` config. - The expression should match the entire client property value, and capturing groups are concatenated to make the client attribute. - For example if the client attribute is the first part of the client ID delimited by a dash, the regular expression would be `^(.+?)-.*$`.
- Note that failure to match the regular expression will result in the client attribute being absent but not an empty string. - Note also that currently only printable ASCII characters are allowed as input for the regular expression extraction.""" + A one-line expression that evaluates a set of predefined string functions (like in the rule engine SQL statements). + The expression can be a function call with nested calls as its arguments, or a direct variable reference. + So far, it does not provide user-defined variable binding (like `var a=1`) or user-defined functions. + As an example, to extract the prefix of the client ID delimited by a dot: `nth(1, tokens(clientid, '.'))`. + + The pre-bound variables are: + - `cn`: Client's TLS certificate common name. + - `dn`: Client's TLS certificate distinguished name (the subject). + - `clientid`: MQTT Client ID. + - `username`: MQTT Client's username. + - `user_property.{NAME}`: User properties in the CONNECT packet. + + You can read more about variform expressions in the EMQX docs.""" } -client_attrs_init_extract_as { +client_attrs_init_set_as_attr { label: "Name The Extracted Attribute" desc: """~ - The name of the client attribute extracted from the client property specified by `client_attrs_init.extract_from` config. - The extracted attribute will be stored in the `client_attrs` property with this name. - In case `extract_from = user_property`, this should be the key of the user property.""" + The name of the client attribute extracted from the client data. + The extracted attribute will be stored in the `client_attrs` property with this name.""" } } diff --git a/scripts/find-apps.sh b/scripts/find-apps.sh index 565c60449..42c2d7b69 100755 --- a/scripts/find-apps.sh +++ b/scripts/find-apps.sh @@ -105,6 +105,10 @@ matrix() { entries+=("$(format_app_entry "$app" 1 emqx "$runner")") entries+=("$(format_app_entry "$app" 1 emqx-enterprise "$runner")") ;; + apps/emqx_management) + entries+=("$(format_app_entry "$app" 1 emqx "$runner")") + entries+=("$(format_app_entry "$app" 1 emqx-enterprise "$runner")") + ;; apps/*) if [[ -f "${app}/BSL.txt" ]]; then profile='emqx-enterprise' diff --git a/scripts/spellcheck/dicts/emqx.txt b/scripts/spellcheck/dicts/emqx.txt index 7e8fed96f..d68c85716 100644 --- a/scripts/spellcheck/dicts/emqx.txt +++ b/scripts/spellcheck/dicts/emqx.txt @@ -259,6 +259,7 @@ uplink url utc util +variform ver vm vsn @@ -305,3 +306,4 @@ elasticsearch ElasticSearch doc_as_upsert upsert +aliyun diff --git a/scripts/test/emqx-smoke-test.sh b/scripts/test/emqx-smoke-test.sh index 4430a313a..8177d7b85 100755 --- a/scripts/test/emqx-smoke-test.sh +++ b/scripts/test/emqx-smoke-test.sh @@ -82,8 +82,10 @@ main() { ## The json status feature was added after hotconf and bridges schema API if [ "$JSON_STATUS" != 'NOT_JSON' ]; then check_swagger_json - check_schema_json hotconf "EMQX Hot Conf API Schema" - check_schema_json bridges "EMQX Data Bridge API Schema" + check_schema_json hotconf "Hot Conf Schema" + check_schema_json bridges "Data Bridge Schema" + check_schema_json actions "Actions and Sources Schema" + check_schema_json connectors "Connectors Schema" fi } diff --git a/scripts/ui-tests/dashboard_test.py b/scripts/ui-tests/dashboard_test.py index 91a7264ec..7003802ab 100644 --- a/scripts/ui-tests/dashboard_test.py +++ b/scripts/ui-tests/dashboard_test.py @@ -1,3 +1,4 @@ +import os import time import unittest import pytest @@ -8,6 +9,7 @@ from selenium.webdriver.common.keys import Keys from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.common import utils +from selenium.common.exceptions import NoSuchElementException @pytest.fixture def driver(): @@ -30,11 +32,13 @@ def dashboard_url(dashboard_host, dashboard_port): @pytest.fixture def login(driver, dashboard_url): + # 'admin' is set in CI jobs, hence used as the default value + password = os.getenv("EMQX_DASHBOARD__DEFAULT_PASSWORD", "admin") driver.get(dashboard_url) assert "EMQX Dashboard" == driver.title assert f"{dashboard_url}/#/login?to=/dashboard/overview" == driver.current_url driver.find_element(By.XPATH, "//div[@class='login']//form[1]//input[@type='text']").send_keys("admin") - driver.find_element(By.XPATH, "//div[@class='login']//form[1]//input[@type='password']").send_keys("admin") + driver.find_element(By.XPATH, "//div[@class='login']//form[1]//input[@type='password']").send_keys(password) driver.find_element(By.XPATH, "//div[@class='login']//form[1]//button[1]").click() dest_url = urljoin(dashboard_url, "/#/dashboard/overview") driver.get(dest_url) @@ -48,20 +52,21 @@ def ensure_current_url(driver, url): count += 1 time.sleep(1) -def wait_title(driver): - return WebDriverWait(driver, 10).until(lambda x: x.find_element("xpath", "//div[@id='app']//h1[@class='header-title']")) +def title(driver): + return driver.find_element("xpath", "//div[@id='app']//h1[@class='header-title']") + +def wait_title_text(driver, text): + return WebDriverWait(driver, 10).until(lambda x: title(x).text == text) def test_basic(driver, login, dashboard_url): driver.get(dashboard_url) - title = wait_title(driver) - assert "Cluster Overview" == title.text + wait_title_text(driver, "Cluster Overview") def test_log(driver, login, dashboard_url): dest_url = urljoin(dashboard_url, "/#/log") driver.get(dest_url) ensure_current_url(driver, dest_url) - title = wait_title(driver) - assert "Logging" == title.text + wait_title_text(driver, "Logging") label = driver.find_element(By.XPATH, "//div[@id='app']//form//label[contains(., 'Enable Log Handler')]") assert driver.find_elements(By.ID, label.get_attribute("for")) @@ -72,3 +77,29 @@ def test_log(driver, login, dashboard_url): label = driver.find_element(By.XPATH, "//div[@id='app']//form//label[contains(., 'Time Offset')]") assert driver.find_elements(By.ID, label.get_attribute("for")) +def test_docs_link(driver, login, dashboard_url): + dest_url = urljoin(dashboard_url, "/#/dashboard/overview") + driver.get(dest_url) + ensure_current_url(driver, dest_url) + xpath_link_help = "//div[@id='app']//div[@class='nav-header']//a[contains(@class, 'link-help')]" + link_help = driver.find_element(By.XPATH, xpath_link_help) + driver.execute_script("arguments[0].click();", link_help) + + emqx_name = os.getenv("EMQX_NAME") + emqx_community_version = os.getenv("EMQX_COMMUNITY_VERSION") + emqx_enterprise_version = os.getenv("EMQX_ENTERPRISE_VERSION") + if emqx_name == 'emqx-enterprise': + emqx_version = f"v{emqx_enterprise_version}" + docs_base_url = "https://docs.emqx.com/en/enterprise" + else: + emqx_version = f"v{emqx_community_version}" + docs_base_url = "https://www.emqx.io/docs/en" + + emqx_version = ".".join(emqx_version.split(".")[:2]) + docs_url = f"{docs_base_url}/{emqx_version}" + xpath = f"//div[@id='app']//div[@class='nav-header']//a[@href[starts-with(.,'{docs_url}')]]" + + try: + driver.find_element(By.XPATH, xpath) + except NoSuchElementException: + raise AssertionError(f"Cannot find the doc URL for {emqx_name} version {emqx_version}, please make sure the dashboard package
is up to date.") diff --git a/scripts/ui-tests/docker-compose.yaml b/scripts/ui-tests/docker-compose.yaml index f5a66ab33..c4a92e51f 100644 --- a/scripts/ui-tests/docker-compose.yaml +++ b/scripts/ui-tests/docker-compose.yaml @@ -9,6 +9,10 @@ services: selenium: shm_size: '2gb' image: ghcr.io/emqx/selenium-chrome:latest + environment: + EMQX_NAME: ${EMQX_NAME} + EMQX_COMMUNITY_VERSION: ${EMQX_VERSION} + EMQX_ENTERPRISE_VERSION: ${EMQX_ENTERPRISE_VERSION} volumes: - ./:/app depends_on:
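Tying the pieces of this diff together: the `client_attrs_init` schema change replaces the old regexp extraction with a variform expression, and the expression language is the same one exercised by the `emqx_variform` tests at the top of the diff. A minimal sketch of evaluating the example expression from the schema description (assuming `emqx_variform:render/2` is the evaluation entry point, as in the tests; the actual client-attribute code path may wrap it differently):

    %% Extract the prefix of the client ID delimited by a dot,
    %% per the example in client_attrs_init_expression.desc:
    1> emqx_variform:render("nth(1, tokens(clientid, '.'))",
                            #{clientid => <<"site1.device42">>}).
    {ok,<<"site1">>}
    %% Assumed: a client ID without a dot tokenizes to a single token,
    %% so the whole ID becomes the attribute value:
    2> emqx_variform:render("nth(1, tokens(clientid, '.'))",
                            #{clientid => <<"device42">>}).
    {ok,<<"device42">>}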