From 1e25ebb64cd1da147ac052e01157c1654a963e5a Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Tue, 30 May 2023 17:20:32 -0300 Subject: [PATCH 1/7] test(gcp_pubsub): attempt to fix flakiness https://github.com/emqx/emqx/actions/runs/5125118728/jobs/9218520994?pr=10887#step:8:309 ``` =CRITICAL REPORT==== 30-May-2023::19:19:34.887082 === "check stage" failed: error {assertMatch,[{module,emqx_bridge_gcp_pubsub_SUITE}, {line,1066}, {expression,"? of_kind ( gcp_pubsub_request_failed , Trace )"}, {pattern,"[ # { reason := Error , connector := ResourceId } | _ ]"}, {value,[#{connector => <<"bridge:gcp_pubsub:emqx_bridge_gcp_pubsub_SUITE0005FCEE15534E9CD4CD02004CF10000">>, msg => gcp_pubsub_request_failed,query_mode => async, reason => {closed,"The connection was lost."}, recoverable_error => true, '~meta' => #{gl => <0.17903.2>, location => #Fun, node => 'test@127.0.0.1',pid => <0.19724.2>, time => -576460610660164}}]}]} Stacktrace: [{emqx_bridge_gcp_pubsub_SUITE, '-do_econnrefused_or_timeout_test/2-fun-2-',3, [{file, "/__w/emqx/emqx/source/apps/emqx_bridge_gcp_pubsub/test/emqx_bridge_gcp_pubsub_SUITE.erl"}, {line,1066}]}, {emqx_bridge_gcp_pubsub_SUITE,do_econnrefused_or_timeout_test,2, [{file, "/__w/emqx/emqx/source/apps/emqx_bridge_gcp_pubsub/test/emqx_bridge_gcp_pubsub_SUITE.erl"}, {line,1022}]}] ``` --- .../test/emqx_bridge_gcp_pubsub_SUITE.erl | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/apps/emqx_bridge_gcp_pubsub/test/emqx_bridge_gcp_pubsub_SUITE.erl b/apps/emqx_bridge_gcp_pubsub/test/emqx_bridge_gcp_pubsub_SUITE.erl index 814051733..65b88d45b 100644 --- a/apps/emqx_bridge_gcp_pubsub/test/emqx_bridge_gcp_pubsub_SUITE.erl +++ b/apps/emqx_bridge_gcp_pubsub/test/emqx_bridge_gcp_pubsub_SUITE.erl @@ -1062,10 +1062,15 @@ do_econnrefused_or_timeout_test(Config, Error) -> fun(Trace) -> case Error of econnrefused -> - ?assertMatch( - [#{reason := Error, connector := ResourceId} | _], - ?of_kind(gcp_pubsub_request_failed, Trace) - ); + case ?of_kind(gcp_pubsub_request_failed, Trace) of + [#{reason := Error, connector := ResourceId} | _] -> + ok; + [#{reason := {closed, _Msg}, connector := ResourceId} | _] -> + %% _Msg = "The connection was lost." + ok; + Trace0 -> + error({unexpected_trace, Trace0}) + end; timeout -> ?assertMatch( [_, _ | _], From 0d539e91d16c59c84523fe0ce4590a7cc1d13ffb Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Tue, 30 May 2023 17:32:05 -0300 Subject: [PATCH 2/7] test(pulsar_producer): attempt to stabilize flaky test https://github.com/emqx/emqx/actions/runs/5125166433/jobs/9218613872?pr=10886#step:7:679 ``` =CRITICAL REPORT==== 30-May-2023::19:38:58.003170 === Run stage failed: error:{badmatch, {timeout, [#{msg => pulsar_producer_bridge_started, '~meta' => #{gl => <97891.472.0>, location => #Fun, node => 'autocluster_node1@127.0.0.1', pid => <97891.787.0>, time => -576460611692219}}]}} Stacktrace: [{emqx_bridge_pulsar_impl_producer_SUITE,'-t_cluster/1-fun-10-', 6, [{file, "/emqx/apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_impl_producer_SUITE.erl"}, {line,1073}]}, {emqx_bridge_pulsar_impl_producer_SUITE,t_cluster,1, [{file, "/emqx/apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_impl_producer_SUITE.erl"}, {line,1064}]}] ``` --- apps/emqx/test/emqx_common_test_helpers.erl | 2 +- apps/emqx_bridge/src/emqx_bridge_app.erl | 3 +++ .../test/emqx_bridge_pulsar_impl_producer_SUITE.erl | 10 +++++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/apps/emqx/test/emqx_common_test_helpers.erl b/apps/emqx/test/emqx_common_test_helpers.erl index f85ed2599..7aaf93c99 100644 --- a/apps/emqx/test/emqx_common_test_helpers.erl +++ b/apps/emqx/test/emqx_common_test_helpers.erl @@ -756,8 +756,8 @@ start_slave(Name, Opts) when is_map(Opts) -> throw(Other) end, pong = net_adm:ping(Node), - setup_node(Node, Opts), ok = snabbkaffe:forward_trace(Node), + setup_node(Node, Opts), Node. %% Node stopping diff --git a/apps/emqx_bridge/src/emqx_bridge_app.erl b/apps/emqx_bridge/src/emqx_bridge_app.erl index daae15a17..e59259c3e 100644 --- a/apps/emqx_bridge/src/emqx_bridge_app.erl +++ b/apps/emqx_bridge/src/emqx_bridge_app.erl @@ -17,6 +17,8 @@ -behaviour(application). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + -export([start/2, stop/1]). -export([ @@ -34,6 +36,7 @@ start(_StartType, _StartArgs) -> ok = emqx_bridge:load_hook(), ok = emqx_config_handler:add_handler(?LEAF_NODE_HDLR_PATH, ?MODULE), ok = emqx_config_handler:add_handler(?TOP_LELVE_HDLR_PATH, emqx_bridge), + ?tp(emqx_bridge_app_started, #{}), {ok, Sup}. stop(_State) -> diff --git a/apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_impl_producer_SUITE.erl b/apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_impl_producer_SUITE.erl index a5c04160c..9dc2f05d6 100644 --- a/apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_impl_producer_SUITE.erl +++ b/apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_impl_producer_SUITE.erl @@ -1064,9 +1064,17 @@ t_cluster(Config) -> ?check_trace( begin Nodes = [N1, N2 | _] = start_cluster(Cluster), + %% wait until bridge app supervisor is up; by that point, + %% `emqx_config_handler:add_handler' has been called and the node should be + %% ready to create bridges. + NumNodes = length(Nodes), + {ok, _} = snabbkaffe:block_until( + ?match_n_events(NumNodes, #{?snk_kind := emqx_bridge_app_started}), + 15_000 + ), {ok, SRef0} = snabbkaffe:subscribe( ?match_event(#{?snk_kind := pulsar_producer_bridge_started}), - length(Nodes), + NumNodes, 15_000 ), {ok, _} = erpc:call(N1, fun() -> create_bridge(Config) end), From e7b35011e60f50113d3c582477edf9eab6802f70 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Tue, 30 May 2023 18:00:35 -0300 Subject: [PATCH 3/7] test(oracle): attempt to fix flaky test https://github.com/emqx/emqx/actions/runs/5126014459/jobs/9220155908#step:7:754 ``` =CRITICAL REPORT==== 30-May-2023::20:50:26.267103 === Run stage failed: error:{assertMatch, [{module,emqx_bridge_oracle_SUITE}, {line,394}, {expression, "emqx_resource : simple_sync_query ( ResourceId , { query , \"SELECT COUNT(*) FROM mqtt_test\" } )"}, {pattern, "{ ok , [ { result_set , _ , _ , [ [ { 3 } ] ] } ] }"}, {value, {ok,[{result_set,[<<"COUNT(*)">>],[],[[{0}]]}]}}]} Stacktrace: [{emqx_bridge_oracle_SUITE,'-t_batch_sync_query/1-fun-5-',1, [{file, "/emqx/apps/emqx_bridge_oracle/test/emqx_bridge_oracle_SUITE.erl"}, {line,394}]}, {emqx_bridge_oracle_SUITE,'-t_batch_sync_query/1-fun-6-',5, [{file, "/emqx/apps/emqx_bridge_oracle/test/emqx_bridge_oracle_SUITE.erl"}, {line,390}]}, {emqx_bridge_oracle_SUITE,t_batch_sync_query,1, [{file, "/emqx/apps/emqx_bridge_oracle/test/emqx_bridge_oracle_SUITE.erl"}, {line,364}]}] ``` --- .../test/emqx_bridge_oracle_SUITE.erl | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/apps/emqx_bridge_oracle/test/emqx_bridge_oracle_SUITE.erl b/apps/emqx_bridge_oracle/test/emqx_bridge_oracle_SUITE.erl index f280c51d6..721beab6e 100644 --- a/apps/emqx_bridge_oracle/test/emqx_bridge_oracle_SUITE.erl +++ b/apps/emqx_bridge_oracle/test/emqx_bridge_oracle_SUITE.erl @@ -203,8 +203,9 @@ oracle_config(TestCase, _ConnectionType, Config) -> " pool_size = 1\n" " sql = \"~s\"\n" " resource_opts = {\n" - " auto_restart_interval = 5000\n" - " request_timeout = 30000\n" + " auto_restart_interval = \"5s\"\n" + " health_check_interval = \"5s\"\n" + " request_timeout = \"30s\"\n" " query_mode = \"async\"\n" " enable_batch = true\n" " batch_size = 3\n" @@ -233,6 +234,11 @@ resource_id(Config) -> Name = ?config(oracle_name, Config), emqx_bridge_resource:resource_id(Type, Name). +bridge_id(Config) -> + Type = ?BRIDGE_TYPE_BIN, + Name = ?config(oracle_name, Config), + emqx_bridge_resource:bridge_id(Type, Name). + create_bridge(Config) -> create_bridge(Config, _Overrides = #{}). @@ -361,6 +367,7 @@ t_batch_sync_query(Config) -> ProxyHost = ?config(proxy_host, Config), ProxyName = ?config(proxy_name, Config), ResourceId = resource_id(Config), + BridgeId = bridge_id(Config), ?check_trace( begin ?assertMatch({ok, _}, create_bridge_api(Config)), @@ -380,12 +387,12 @@ t_batch_sync_query(Config) -> % Send 3 async messages while resource is down. When it comes back, these messages % will be delivered in sync way. If we try to send sync messages directly, it will % be sent async as callback_mode is set to async_if_possible. - Message = {send_message, Params}, emqx_common_test_helpers:with_failure(down, ProxyName, ProxyHost, ProxyPort, fun() -> ct:sleep(1000), - emqx_resource:query(ResourceId, Message), - emqx_resource:query(ResourceId, Message), - emqx_resource:query(ResourceId, Message) + emqx_bridge:send_message(BridgeId, Params), + emqx_bridge:send_message(BridgeId, Params), + emqx_bridge:send_message(BridgeId, Params), + ok end), ?retry( _Sleep = 1_000, From 83a7c800cc231a3d899290182bdc2ce7d8e71281 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Wed, 31 May 2023 09:48:08 -0300 Subject: [PATCH 4/7] test(kafka_consumer): reduce flakiness The crash reason can also be `{error, not_found}` depending on timing... ``` =ERROR REPORT==== 31-May-2023::12:17:31.461490 === exception: error key_path: [bridges,kafka_consumer, 't_resource_manager_crash_before_subscriber_started-576460752303419630'] module: emqx_conf_schema msg: change_config_crashed reason: {badmatch,{error,not_found}} stacktrace: [{emqx_resource_manager,create_and_return_data,5, [{file, "/emqx/apps/emqx_resource/src/emqx_resource_manager.erl"}, {line,119}]}, {emqx_bridge_resource,create,4, [{file, "/emqx/apps/emqx_bridge/src/emqx_bridge_resource.erl"}, {line,171}]}, {emqx_bridge,'-perform_bridge_changes/2-fun-1-',5, [{file,"/emqx/apps/emqx_bridge/src/emqx_bridge.erl"}, {line,367}]}, {maps,fold_1,3,[{file,"maps.erl"},{line,411}]}, {emqx_bridge,perform_bridge_changes,2, [{file,"/emqx/apps/emqx_bridge/src/emqx_bridge.erl"}, {line,354}]}, {emqx_bridge,post_config_update,5, [{file,"/emqx/apps/emqx_bridge/src/emqx_bridge.erl"}, {line,229}]}, {emqx_config_handler,call_post_config_update,7, [{file,"/emqx/apps/emqx/src/emqx_config_handler.erl"}, {line,398}]}, {emqx_config_handler,do_post_config_update,8, [{file,"/emqx/apps/emqx/src/emqx_config_handler.erl"}, {line,336}]}, {emqx_config_handler,check_and_save_configs,7, [{file,"/emqx/apps/emqx/src/emqx_config_handler.erl"}, {line,281}]}, {emqx_config_handler,handle_update_request,4, [{file,"/emqx/apps/emqx/src/emqx_config_handler.erl"}, {line,195}]}, {emqx_config_handler,handle_call,3, [{file,"/emqx/apps/emqx/src/emqx_config_handler.erl"}, {line,124}]}, {gen_server,try_handle_call,4, [{file,"gen_server.erl"},{line,1149}]}, {gen_server,handle_msg,6, [{file,"gen_server.erl"},{line,1178}]}, {proc_lib,init_p_do_apply,3, [{file,"proc_lib.erl"},{line,240}]}] update_req: {{update,#{<<"authentication">> => <<"none">>, <<"bootstrap_hosts">> => <<"toxiproxy.emqx.net:9292">>, <<"connect_timeout">> => <<"5s">>, <<"enable">> => true, <<"kafka">> => #{<<"max_batch_bytes">> => <<"896KB">>, <<"max_rejoin_attempts">> => 5, <<"offset_commit_interval_seconds">> => 3, <<"offset_reset_policy">> => <<"latest">>}, <<"key_encoding_mode">> => <<"none">>, <<"metadata_request_timeout">> => <<"5s">>, <<"min_metadata_refresh_interval">> => <<"3s">>, <<"ssl">> => #{<<"enable">> => false, <<"server_name_indication">> => <<"auto">>, <<"verify">> => <<"verify_none">>}, <<"topic_mapping">> => [#{<<"kafka_topic">> => <<"t_resource_manager_crash_before_subscriber_started-576460752303419662">>, <<"mqtt_topic">> => <<"mqtt/topic/-576460752303422911">>, <<"payload_template">> => <<"${.}">>, <<"qos">> => 0}], <<"value_encoding_mode">> => <<"none">>}}, #{override_to => cluster}} ``` --- .../test/emqx_bridge_kafka_impl_consumer_SUITE.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/emqx_bridge_kafka/test/emqx_bridge_kafka_impl_consumer_SUITE.erl b/apps/emqx_bridge_kafka/test/emqx_bridge_kafka_impl_consumer_SUITE.erl index 7e7acbcd5..33c207c39 100644 --- a/apps/emqx_bridge_kafka/test/emqx_bridge_kafka_impl_consumer_SUITE.erl +++ b/apps/emqx_bridge_kafka/test/emqx_bridge_kafka_impl_consumer_SUITE.erl @@ -2078,7 +2078,7 @@ t_resource_manager_crash_after_subscriber_started(Config) -> 10_000 ), case Res of - {error, {config_update_crashed, {killed, _}}} -> + {error, {config_update_crashed, _}} -> ok; {ok, _} -> %% the new manager may have had time to startup @@ -2135,7 +2135,7 @@ t_resource_manager_crash_before_subscriber_started(Config) -> 10_000 ), case Res of - {error, {config_update_crashed, {killed, _}}} -> + {error, {config_update_crashed, _}} -> ok; {ok, _} -> %% the new manager may have had time to startup From dc4ae1a424e72a9fa15e44051a5768f15a613f7e Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Wed, 31 May 2023 10:56:06 -0300 Subject: [PATCH 5/7] ci: make emqx standalone test output more readable --- .github/workflows/run_emqx_app_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_emqx_app_tests.yaml b/.github/workflows/run_emqx_app_tests.yaml index ffed8d0c8..24c3d2b42 100644 --- a/.github/workflows/run_emqx_app_tests.yaml +++ b/.github/workflows/run_emqx_app_tests.yaml @@ -71,7 +71,7 @@ jobs: ./rebar3 xref ./rebar3 dialyzer ./rebar3 eunit -v - ./rebar3 ct -v + ./rebar3 ct -v --readable=true ./rebar3 proper -d test/props - uses: actions/upload-artifact@v3 if: failure() From 77c9eda036e32d351e9f266f3b449ba41bbd71b2 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Wed, 31 May 2023 11:07:44 -0300 Subject: [PATCH 6/7] test: rm stray `clear_screen` calls --- apps/emqx/test/emqx_broker_SUITE.erl | 1 - lib-ee/emqx_ee_bridge/test/emqx_ee_bridge_mysql_SUITE.erl | 1 - 2 files changed, 2 deletions(-) diff --git a/apps/emqx/test/emqx_broker_SUITE.erl b/apps/emqx/test/emqx_broker_SUITE.erl index d0c26ceb5..6e03971a5 100644 --- a/apps/emqx/test/emqx_broker_SUITE.erl +++ b/apps/emqx/test/emqx_broker_SUITE.erl @@ -93,7 +93,6 @@ end_per_group(_Group, _Config) -> emqx_common_test_helpers:stop_apps([]). init_per_suite(Config) -> - emqx_common_test_helpers:clear_screen(), Config. end_per_suite(_Config) -> diff --git a/lib-ee/emqx_ee_bridge/test/emqx_ee_bridge_mysql_SUITE.erl b/lib-ee/emqx_ee_bridge/test/emqx_ee_bridge_mysql_SUITE.erl index e497e0a47..707aa47ea 100644 --- a/lib-ee/emqx_ee_bridge/test/emqx_ee_bridge_mysql_SUITE.erl +++ b/lib-ee/emqx_ee_bridge/test/emqx_ee_bridge_mysql_SUITE.erl @@ -102,7 +102,6 @@ end_per_group(_Group, _Config) -> ok. init_per_suite(Config) -> - emqx_common_test_helpers:clear_screen(), Config. end_per_suite(_Config) -> From 189e2c87be8f8f5da2d13c6212a50f9523de3cdf Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Wed, 31 May 2023 11:29:48 -0300 Subject: [PATCH 7/7] test(channel): fix flaky test ``` %%% emqx_channel_SUITE ==> t_handle_kicked_publish_will_msg: FAILED %%% emqx_channel_SUITE ==> will_message_not_published ``` The problem was that the whole generated message was compared for equality when receiving the will message, but the timestamp may be different at publishing time... --- apps/emqx/test/emqx_channel_SUITE.erl | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/apps/emqx/test/emqx_channel_SUITE.erl b/apps/emqx/test/emqx_channel_SUITE.erl index e77bef90c..ce02f16c3 100644 --- a/apps/emqx/test/emqx_channel_SUITE.erl +++ b/apps/emqx/test/emqx_channel_SUITE.erl @@ -885,14 +885,24 @@ t_handle_kicked_publish_will_msg(_) -> Self = self(), ok = meck:expect(emqx_broker, publish, fun(M) -> Self ! {pub, M} end), - Msg = emqx_message:make(test, <<"will_topic">>, <<"will_payload">>), + ClientId = test, + WillTopic = <<"will_topic">>, + WillPayload = <<"will_payload">>, + Msg = emqx_message:make(ClientId, WillTopic, WillPayload), {shutdown, kicked, ok, ?DISCONNECT_PACKET(?RC_ADMINISTRATIVE_ACTION), _} = emqx_channel:handle_call( kick, channel(#{will_msg => Msg}) ), receive - {pub, Msg} -> ok - after 10_000 -> exit(will_message_not_published) + {pub, RecMsg} -> + ?assertEqual(ClientId, RecMsg#message.from, #{msg => Msg}), + ?assertEqual(WillTopic, RecMsg#message.topic, #{msg => Msg}), + ?assertEqual(WillPayload, RecMsg#message.payload, #{msg => Msg}), + ok + after 5_000 -> + ct:pal("expected message: ~p", [Msg]), + ct:pal("~p mailbox: ~p", [?LINE, process_info(self(), messages)]), + exit(will_message_not_published) end. t_handle_call_discard(_) ->