diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine_api.erl b/apps/emqx_rule_engine/src/emqx_rule_engine_api.erl index 48711d4cc..2583a5d03 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_engine_api.erl +++ b/apps/emqx_rule_engine/src/emqx_rule_engine_api.erl @@ -529,47 +529,69 @@ printable_function_name(Mod, Func) -> list_to_binary(lists:concat([Mod, ":", Func])). get_rule_metrics(Id) -> - Format = fun( - Node, - #{ - counters := - #{ - 'matched' := Matched, - 'passed' := Passed, - 'failed' := Failed, - 'failed.exception' := FailedEx, - 'failed.no_result' := FailedNoRes, - 'actions.total' := OTotal, - 'actions.failed' := OFailed, - 'actions.failed.out_of_service' := OFailedOOS, - 'actions.failed.unknown' := OFailedUnknown, - 'actions.success' := OFailedSucc - }, - rate := - #{ - 'matched' := - #{current := Current, max := Max, last5m := Last5M} - } - } - ) -> - #{ - metrics => ?METRICS( - Matched, - Passed, - Failed, - FailedEx, - FailedNoRes, - OTotal, - OFailed, - OFailedOOS, - OFailedUnknown, - OFailedSucc, - Current, - Max, - Last5M - ), - node => Node - } + Format = fun + ( + Node, + #{ + counters := + #{ + 'matched' := Matched, + 'passed' := Passed, + 'failed' := Failed, + 'failed.exception' := FailedEx, + 'failed.no_result' := FailedNoRes, + 'actions.total' := OTotal, + 'actions.failed' := OFailed, + 'actions.failed.out_of_service' := OFailedOOS, + 'actions.failed.unknown' := OFailedUnknown, + 'actions.success' := OFailedSucc + }, + rate := + #{ + 'matched' := + #{current := Current, max := Max, last5m := Last5M} + } + } + ) -> + #{ + metrics => ?METRICS( + Matched, + Passed, + Failed, + FailedEx, + FailedNoRes, + OTotal, + OFailed, + OFailedOOS, + OFailedUnknown, + OFailedSucc, + Current, + Max, + Last5M + ), + node => Node + }; + (Node, _Metrics) -> + %% Empty metrics: can happen when a node joins another and a bridge is not yet + %% replicated to it, so the counters map is empty. + #{ + metrics => ?METRICS( + _Matched = 0, + _Passed = 0, + _Failed = 0, + _FailedEx = 0, + _FailedNoRes = 0, + _OTotal = 0, + _OFailed = 0, + _OFailedOOS = 0, + _OFailedUnknown = 0, + _OFailedSucc = 0, + _Current = 0, + _Max = 0, + _Last5M = 0 + ), + node => Node + } end, [ Format(Node, emqx_plugin_libs_proto_v1:get_metrics(Node, rule_metrics, Id)) diff --git a/apps/emqx_rule_engine/test/emqx_rule_engine_api_SUITE.erl b/apps/emqx_rule_engine/test/emqx_rule_engine_api_SUITE.erl index ccee05604..e55eea977 100644 --- a/apps/emqx_rule_engine/test/emqx_rule_engine_api_SUITE.erl +++ b/apps/emqx_rule_engine/test/emqx_rule_engine_api_SUITE.erl @@ -94,6 +94,26 @@ t_crud_rule_api(_Config) -> ct:pal("RMetrics : ~p", [Metrics]), ?assertMatch(#{id := RuleId, metrics := _, node_metrics := _}, Metrics), + %% simulating a node joining a cluster and lagging the configuration replication; in + %% such cases, when fetching metrics, a rule may exist in the cluster but not on the + %% new node. We just check that it doesn't provoke a crash. + emqx_common_test_helpers:with_mock( + emqx_metrics_worker, + get_metrics, + fun(HandlerName, MetricId) -> + %% change the metric id to some unknown id. + meck:passthrough([HandlerName, <<"unknown-", MetricId/binary>>]) + end, + fun() -> + {200, Metrics1} = emqx_rule_engine_api:'/rules/:id/metrics'(get, #{ + bindings => #{id => RuleId} + }), + ct:pal("RMetrics : ~p", [Metrics1]), + ?assertMatch(#{id := RuleId, metrics := _, node_metrics := _}, Metrics1), + ok + end + ), + {200, Rule2} = emqx_rule_engine_api:'/rules/:id'(put, #{ bindings => #{id => RuleId}, body => ?SIMPLE_RULE(RuleId)#{<<"sql">> => <<"select * from \"t/b\"">>} diff --git a/changes/ce/fix-10884.en.md b/changes/ce/fix-10884.en.md new file mode 100644 index 000000000..d0848e099 --- /dev/null +++ b/changes/ce/fix-10884.en.md @@ -0,0 +1 @@ +Fixes an issue where trying to get rule info or metrics could result in a crash when a node is joining a cluster.