From 9b194baf69a5c1b3aafd2676071bad0cd1c5633f Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Sat, 5 Nov 2022 17:11:12 +0100 Subject: [PATCH 1/9] fix(emqx_rule_monitor): sleep before retry but not after --- .../emqx_rule_engine/src/emqx_rule_engine.erl | 5 +-- .../src/emqx_rule_monitor.erl | 40 +++++++++++++++---- .../test/emqx_rule_monitor_SUITE.erl | 5 ++- 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine.erl b/apps/emqx_rule_engine/src/emqx_rule_engine.erl index e642932c2..b458469da 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_engine.erl +++ b/apps/emqx_rule_engine/src/emqx_rule_engine.erl @@ -79,8 +79,6 @@ , action_instance_params/0 ]). --define(T_RETRY, 60000). - %% redefine this macro to confine the appup scope -undef(RAISE). -define(RAISE(_EXP_, _ERROR_CONTEXT_), @@ -684,8 +682,7 @@ init_resource_with_retrier(Module, OnCreate, ResId, Config) -> status = #{is_alive => true}}, emqx_rule_registry:add_resource_params(ResParams) catch Class:Reason:ST -> - Interval = persistent_term:get({emqx_rule_engine, resource_restart_interval}, ?T_RETRY), - emqx_rule_monitor:ensure_resource_retrier(ResId, Interval), + emqx_rule_monitor:ensure_resource_retrier(ResId), erlang:raise(Class, {init_resource, Reason}, ST) end. diff --git a/apps/emqx_rule_engine/src/emqx_rule_monitor.erl b/apps/emqx_rule_engine/src/emqx_rule_monitor.erl index 8af8aa7ff..82a93d0be 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_monitor.erl +++ b/apps/emqx_rule_engine/src/emqx_rule_monitor.erl @@ -32,10 +32,18 @@ -export([ start_link/0 , stop/0 , async_refresh_resources_rules/0 - , ensure_resource_retrier/2 + , ensure_resource_retrier/1 , retry_loop/3 ]). +%% fot test +-export([ put_retry_interval/1 + , get_retry_interval/0 + , erase_retry_interval/0 + ]). + +-define(T_RETRY, 60000). + start_link() -> gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). @@ -46,10 +54,22 @@ init([]) -> _ = erlang:process_flag(trap_exit, true), {ok, #{retryers => #{}}}. +put_retry_interval(I) when is_integer(I) andalso I >= 10 -> + _ = persistent_term:put({?MODULE, resource_restart_interval}, I), + ok. + +erase_retry_interval() -> + _ = persistent_term:erase({?MODULE, resource_restart_interval}), + ok. + +get_retry_interval() -> + persistent_term:get({?MODULE, resource_restart_interval}, ?T_RETRY). + async_refresh_resources_rules() -> gen_server:cast(?MODULE, async_refresh). -ensure_resource_retrier(ResId, Interval) -> +ensure_resource_retrier(ResId) -> + Interval = get_retry_interval(), gen_server:cast(?MODULE, {create_restart_handler, resource, ResId, Interval}). handle_call(_Msg, _From, State) -> @@ -111,11 +131,12 @@ update_object(Tag, Obj, Retryer, State) -> }. create_restart_handler(Tag, Obj, Interval) -> - ?LOG(info, "keep restarting ~p ~p, interval: ~p", [Tag, Obj, Interval]), + ?LOG(info, "starting_a_retry_loop for ~p ~p, with delay interval: ~p", [Tag, Obj, Interval]), %% spawn a dedicated process to handle the restarting asynchronously spawn_link(?MODULE, retry_loop, [Tag, Obj, Interval]). retry_loop(resource, ResId, Interval) -> + timer:sleep(Interval), case emqx_rule_registry:find_resource(ResId) of {ok, #resource{type = Type, config = Config}} -> try @@ -124,10 +145,15 @@ retry_loop(resource, ResId, Interval) -> ok = emqx_rule_engine:init_resource(M, F, ResId, Config), refresh_and_enable_rules_of_resource(ResId) catch - Err:Reason:ST -> - ?LOG(warning, "init_resource failed: ~p, ~0p", - [{Err, Reason}, ST]), - timer:sleep(Interval), + Err:Reason:Stacktrace -> + %% do not log stacktrace if it's a throw + LogContext = + case Err of + throw -> Reason; + _ -> {Reason, Stacktrace} + end, + ?LOG_SENSITIVE(warning, "init_resource_retry_failed ~p, ~0p", [ResId, LogContext]), + %% keep looping ?MODULE:retry_loop(resource, ResId, Interval) end; not_found -> diff --git a/apps/emqx_rule_engine/test/emqx_rule_monitor_SUITE.erl b/apps/emqx_rule_engine/test/emqx_rule_monitor_SUITE.erl index 6389c1a2e..fac76d235 100644 --- a/apps/emqx_rule_engine/test/emqx_rule_monitor_SUITE.erl +++ b/apps/emqx_rule_engine/test/emqx_rule_monitor_SUITE.erl @@ -48,7 +48,7 @@ end_per_suite(_Config) -> ok. init_per_testcase(t_restart_resource, Config) -> - persistent_term:put({emqx_rule_engine, resource_restart_interval}, 100), + emqx_rule_monitor:put_retry_interval(100), Opts = [public, named_table, set, {read_concurrency, true}], _ = ets:new(?RES_PARAMS_TAB, [{keypos, #resource_params.id}|Opts]), ets:new(t_restart_resource, [named_table, public]), @@ -77,7 +77,6 @@ init_per_testcase(_, Config) -> Config. end_per_testcase(t_restart_resource, Config) -> - persistent_term:put({emqx_rule_engine, resource_restart_interval}, 60000), ets:delete(t_restart_resource), common_end_per_testcases(), Config; @@ -91,7 +90,9 @@ end_per_testcase(_, Config) -> common_init_per_testcase() -> {ok, _} = emqx_rule_monitor:start_link(). + common_end_per_testcases() -> + emqx_rule_monitor:erase_retry_interval(), emqx_rule_monitor:stop(). t_restart_resource(_) -> From dff02ba0e5044cf5594144ed13c2689760790dab Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Sat, 5 Nov 2022 17:48:46 +0100 Subject: [PATCH 2/9] docs: add change log v4.3.22 v4.4.11 --- changes/v4.3.22-en.md | 4 +++- changes/v4.3.22-zh.md | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/changes/v4.3.22-en.md b/changes/v4.3.22-en.md index 386b9b0ff..91c7a281c 100644 --- a/changes/v4.3.22-en.md +++ b/changes/v4.3.22-en.md @@ -36,13 +36,15 @@ For example: `acl_order = jwt,http`, this will make sure `jwt` is always checked before `http`, meaning if JWT is not found (or no `acl` cliam) for a client, then the ACL check will fallback to use the HTTP backend. - - Added configurations to enable more `client.disconnected` events (and counter bumps) [#9267](https://github.com/emqx/emqx/pull/9267). Prior to this change, the `client.disconnected` event (and counter bump) is triggered when a client performs a 'normal' disconnect, or is 'kicked' by system admin, but NOT triggered when a stale connection had to be 'discarded' (for clean session) or 'takenover' (for non-clean session). Now it is possible to set configs `broker.client_disconnect_discarded` and `broker.client_disconnect_takenover` to `on` to enable the event in these scenarios. +- For Rule-Engine resource creation failure, delay before the first retry [#9313](https://github.com/emqx/emqx/pull/9313). + Prior to this change, the retry delay was added *after* the retry failure. + ## Bug fixes - Fix that after uploading a backup file with an non-ASCII filename, HTTP API `GET /data/export` fails with status code 500 [#9224](https://github.com/emqx/emqx/pull/9224). diff --git a/changes/v4.3.22-zh.md b/changes/v4.3.22-zh.md index 9304d5c2e..4f516a107 100644 --- a/changes/v4.3.22-zh.md +++ b/changes/v4.3.22-zh.md @@ -37,6 +37,9 @@ 但不会在旧 session 被废弃 (clean_session = true) 或旧 session 被接管 (clean_session = false) 时被触发。 可将 `broker.client_disconnect_discarded` 和 `broker.client_disconnect_takovered` 选项设置为 `on` 来启用此场景下的客户端断连事件。 +- 规则引擎资源创建失败后,第一次重试前增加一个延迟 [#9313](https://github.com/emqx/emqx/pull/9313)。 + 在此之前,重试的延迟发生在重试失败之后。 + ## 修复 - 修复若上传的备份文件名中包含非 ASCII 字符,`GET /data/export` HTTP 接口返回 500 错误 [#9224](https://github.com/emqx/emqx/pull/9224)。 From 42158f79e0b6014002666f9c596aecae29812442 Mon Sep 17 00:00:00 2001 From: Traphalet Date: Mon, 7 Nov 2022 20:51:12 +0200 Subject: [PATCH 3/9] fix: env variable reference in upload --- .github/workflows/build_packages.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_packages.yaml b/.github/workflows/build_packages.yaml index cbd8ee5dd..38c1ea472 100644 --- a/.github/workflows/build_packages.yaml +++ b/.github/workflows/build_packages.yaml @@ -144,8 +144,8 @@ jobs: apple_developer_id_bundle_password: ${{ secrets.APPLE_DEVELOPER_ID_BUNDLE_PASSWORD }} - uses: actions/upload-artifact@v3 with: - name: ${EMQX_NAME}-${{ matrix.otp }} - path: _packages/${EMQX_NAME}/ + name: ${{ env.EMQX_NAME }}-${{ matrix.otp }} + path: _packages/${{ env.EMQX_NAME }}/ linux: runs-on: ubuntu-20.04 From 3ed10eae0743245fd729298edc7a52781105c2d5 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Mon, 7 Nov 2022 21:29:53 +0100 Subject: [PATCH 4/9] ci: set CT_READABLE default to true --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 64f278098..59b775ba7 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ REL_PROFILES := emqx emqx-edge PKG_PROFILES := emqx-pkg emqx-edge-pkg PROFILES := $(REL_PROFILES) $(PKG_PROFILES) default -CT_READABLE ?= false +CT_READABLE ?= true export REBAR_GIT_CLONE_OPTIONS += --depth=1 From 31a582241bc66eb5b63b2eeab7989ebeae1a4434 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Mon, 7 Nov 2022 21:57:04 +0100 Subject: [PATCH 5/9] chore: bump version to 4.4.11-alpha.1 --- deploy/charts/emqx/Chart.yaml | 4 ++-- include/emqx_release.hrl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/deploy/charts/emqx/Chart.yaml b/deploy/charts/emqx/Chart.yaml index fe880a4fa..1bd3f411d 100644 --- a/deploy/charts/emqx/Chart.yaml +++ b/deploy/charts/emqx/Chart.yaml @@ -13,8 +13,8 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. -version: 4.4.10 +version: 4.4.11 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. -appVersion: 4.4.10 +appVersion: 4.4.11 diff --git a/include/emqx_release.hrl b/include/emqx_release.hrl index ca6b9ba01..242f4156f 100644 --- a/include/emqx_release.hrl +++ b/include/emqx_release.hrl @@ -29,7 +29,7 @@ -ifndef(EMQX_ENTERPRISE). --define(EMQX_RELEASE, {opensource, "4.4.10"}). +-define(EMQX_RELEASE, {opensource, "4.4.11-alpha.1"}). -else. From 491b402be3660611e8ac24f54a75e32c2d335d41 Mon Sep 17 00:00:00 2001 From: JianBo He Date: Tue, 8 Nov 2022 09:46:01 +0800 Subject: [PATCH 6/9] test(stomp): fix flaky tests --- apps/emqx_stomp/test/emqx_stomp_SUITE.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/emqx_stomp/test/emqx_stomp_SUITE.erl b/apps/emqx_stomp/test/emqx_stomp_SUITE.erl index 0c02e9f29..8aca570ff 100644 --- a/apps/emqx_stomp/test/emqx_stomp_SUITE.erl +++ b/apps/emqx_stomp/test/emqx_stomp_SUITE.erl @@ -349,7 +349,7 @@ t_1000_msg_send(_) -> receive {deliver, Topic, _Msg}-> ok - after 100 -> + after 5000 -> ?assert(false, "waiting message timeout") end end, From c319e917587ba9994e2a95249ca2032b2feeec66 Mon Sep 17 00:00:00 2001 From: Traphalet Date: Mon, 7 Nov 2022 16:02:37 +0200 Subject: [PATCH 7/9] fix: remove outdated cert store from packages --- .github/workflows/build_packages.yaml | 5 +++++ build | 15 +++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_packages.yaml b/.github/workflows/build_packages.yaml index 38c1ea472..195e3c23c 100644 --- a/.github/workflows/build_packages.yaml +++ b/.github/workflows/build_packages.yaml @@ -94,6 +94,11 @@ jobs: } make ensure-rebar3 make ${{ matrix.profile }} + ## Delete certifi cert store + $Cert = Get-ChildItem "_build/${{ matrix.profile }}/rel/emqx/lib/certifi*/priv/cacerts.pem" + if (Test-Path $Cert) { + Remove-Item $Cert + } mkdir -p _packages/${{ matrix.profile }} Compress-Archive -Path _build/${{ matrix.profile }}/rel/emqx -DestinationPath _build/${{ matrix.profile }}/rel/$pkg_name mv _build/${{ matrix.profile }}/rel/$pkg_name _packages/${{ matrix.profile }} diff --git a/build b/build index 0ffb810eb..1cb6bd713 100755 --- a/build +++ b/build @@ -61,9 +61,20 @@ log() { echo "===< $msg" } +delete_unwanted_file() { + if [ -e "${1}" ]; then + log "Deleting file: ${1}" + rm -f "${1}" + else + log "Cannot delete file: ${1} -- file not found" + fi +} + make_rel() { - # shellcheck disable=SC1010 - ./rebar3 as "$PROFILE" do release,tar + ./rebar3 as "$PROFILE" release + # delete outdated cert store + delete_unwanted_file _build/"${PROFILE}"/rel/emqx/lib/certifi*/priv/cacerts.pem + ./rebar3 as "$PROFILE" tar } ## unzip previous version .zip files to _build/$PROFILE/rel/emqx/releases before making relup From 407a197c8a3e90326c8e7228f18d66165348406d Mon Sep 17 00:00:00 2001 From: Shawn <506895667@qq.com> Date: Wed, 9 Nov 2022 12:06:58 +0800 Subject: [PATCH 8/9] fix: typos in the RAISE macro of rule_engine --- apps/emqx_rule_engine/src/emqx_rule_engine.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine.erl b/apps/emqx_rule_engine/src/emqx_rule_engine.erl index b458469da..8ba75101e 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_engine.erl +++ b/apps/emqx_rule_engine/src/emqx_rule_engine.erl @@ -88,7 +88,7 @@ throw : Reason -> throw({_ERROR_CONTEXT_, Reason}); _EXCLASS_:_EXCPTION_:_ST_ -> - throw({_ERROR_CONTEXT_, {_EXCPTION_, _EXCPTION_, _ST_}}) + throw({_ERROR_CONTEXT_, {_EXCLASS_, _EXCPTION_, _ST_}}) end end()). From 6725a36dbf200e9142c087108d558c120fc27eb0 Mon Sep 17 00:00:00 2001 From: Shawn <506895667@qq.com> Date: Wed, 9 Nov 2022 22:18:28 +0800 Subject: [PATCH 9/9] fix: make init_resource_with_retrier/4 only 'throw' when failed --- .../emqx_rule_engine/src/emqx_rule_engine.erl | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine.erl b/apps/emqx_rule_engine/src/emqx_rule_engine.erl index 8ba75101e..0f62a2ac7 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_engine.erl +++ b/apps/emqx_rule_engine/src/emqx_rule_engine.erl @@ -82,12 +82,15 @@ %% redefine this macro to confine the appup scope -undef(RAISE). -define(RAISE(_EXP_, _ERROR_CONTEXT_), + ?RAISE(_EXP_, do_nothing, _ERROR_CONTEXT_)). +-define(RAISE(_EXP_, _EXP_ON_FAIL_, _ERROR_CONTEXT_), fun() -> try (_EXP_) catch throw : Reason -> throw({_ERROR_CONTEXT_, Reason}); _EXCLASS_:_EXCPTION_:_ST_ -> + _EXP_ON_FAIL_, throw({_ERROR_CONTEXT_, {_EXCLASS_, _EXCPTION_, _ST_}}) end end()). @@ -496,7 +499,12 @@ refresh_resource(Type) when is_atom(Type) -> refresh_resource(#resource{id = ResId, type = Type, config = Config}) -> {ok, #resource_type{on_create = {M, F}}} = emqx_rule_registry:find_resource_type(Type), - ok = emqx_rule_engine:init_resource_with_retrier(M, F, ResId, Config). + try + init_resource_with_retrier(M, F, ResId, Config) + catch + throw:Reason -> + ?LOG_SENSITIVE(warning, "refresh_resource failed: ~0p", [Reason]) + end. -spec(refresh_rules_when_boot() -> ok). refresh_rules_when_boot() -> @@ -675,16 +683,12 @@ init_resource(Module, OnCreate, ResId, Config) -> emqx_rule_registry:add_resource_params(ResParams). init_resource_with_retrier(Module, OnCreate, ResId, Config) -> - try - Params = Module:OnCreate(ResId, Config), - ResParams = #resource_params{id = ResId, - params = Params, - status = #{is_alive => true}}, - emqx_rule_registry:add_resource_params(ResParams) - catch Class:Reason:ST -> - emqx_rule_monitor:ensure_resource_retrier(ResId), - erlang:raise(Class, {init_resource, Reason}, ST) - end. + Params = ?RAISE(Module:OnCreate(ResId, Config), + emqx_rule_monitor:ensure_resource_retrier(ResId), {Module, OnCreate}), + ResParams = #resource_params{id = ResId, + params = Params, + status = #{is_alive => true}}, + emqx_rule_registry:add_resource_params(ResParams). init_action(Module, OnCreate, ActionInstId, Params) -> ok = emqx_rule_metrics:create_metrics(ActionInstId),