Merge pull request #13070 from zmstone/0518-improve-kafka-connection-error-logs

0518 improve kafka connection error logs
This commit is contained in:
zmstone 2024-05-29 16:24:01 +02:00 committed by GitHub
commit c54d25de98
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 51 additions and 25 deletions

View File

@ -572,33 +572,54 @@ check_client_connectivity(ClientId) ->
{error, {find_client, Reason}} {error, {find_client, Reason}}
end. end.
%% @doc Return `true' only when the argument is a pid whose process is
%% currently alive. Any non-pid term yields `false'.
is_alive(Pid) when is_pid(Pid) ->
    erlang:is_process_alive(Pid);
is_alive(_NotAPid) ->
    false.
%% @doc Add a compact summary of `Errors' to the context map `Map'.
%%
%% - Empty list: `Map' is returned unchanged (nothing to summarise). Without
%%   this clause an empty list would crash with `function_clause'.
%% - Exactly one error: it is stored under the `error' key.
%% - Several errors: only the first one is kept, under `first_error', together
%%   with the total number of errors under `total_errors', to keep the
%%   resulting term small.
error_summary(Map, []) ->
    Map;
error_summary(Map, [Error]) ->
    Map#{error => Error};
error_summary(Map, [Error | More]) ->
    Map#{first_error => Error, total_errors => length(More) + 1}.
%% @doc Check the partition-leader connections of `KafkaTopic' held by the
%% wolff client process `ClientPid'.
%%
%% Returns `ok' when at least one partition leader connection is alive.
%% When only some of the leaders are alive, a warning is logged with a summary
%% of the failed ones and `ok' is still returned.
%%
%% Throws a map describing the failure when:
%% - none of the returned leader connections is alive, or
%% - the leader connections could not be fetched at all, in which case the
%%   reason returned by `wolff_client' is passed on under the `cause' key.
check_if_healthy_leaders(ClientId, ClientPid, KafkaTopic, MaxPartitions) when is_pid(ClientPid) ->
    case wolff_client:get_leader_connections(ClientPid, KafkaTopic, MaxPartitions) of
        {ok, Leaders} ->
            %% Kafka is considered healthy as long as any of the partition leader is reachable.
            case lists:partition(fun({_Partition, Pid}) -> is_alive(Pid) end, Leaders) of
                {[], Errors} ->
                    %% Not a single live leader connection: report as unhealthy.
                    throw(
                        error_summary(
                            #{
                                cause => "no_connected_partition_leader",
                                kafka_client => ClientId,
                                kafka_topic => KafkaTopic
                            },
                            Errors
                        )
                    );
                {_, []} ->
                    %% Every leader connection is alive.
                    ok;
                {_, Errors} ->
                    %% Partially connected: still healthy, but leave a trace in the log.
                    ?SLOG(
                        warning,
                        "not_all_kafka_partitions_connected",
                        error_summary(
                            #{
                                kafka_client => ClientId,
                                kafka_topic => KafkaTopic
                            },
                            Errors
                        )
                    ),
                    ok
            end;
        {error, Reason} ->
            %% If failed to fetch metadata, wolff_client logs a warning level message
            %% which includes the reason for each seed host
            throw(#{
                cause => Reason,
                kafka_client => ClientId,
                kafka_topic => KafkaTopic
            })
    end.
check_topic_status(ClientId, WolffClientPid, KafkaTopic) -> check_topic_status(ClientId, WolffClientPid, KafkaTopic) ->

View File

@ -245,7 +245,7 @@ t_license_setting_bc(_Config) ->
?assertMatch(#{<<"max_connections">> := 25}, request_dump()), ?assertMatch(#{<<"max_connections">> := 25}, request_dump()),
%% get %% get
GetRes = request(get, uri(["license", "setting"]), []), GetRes = request(get, uri(["license", "setting"]), []),
%% aslo check that the settings return correctly %% also check that the settings return correctly
validate_setting(GetRes, <<"75%">>, <<"80%">>, 25), validate_setting(GetRes, <<"75%">>, <<"80%">>, 25),
%% update %% update
Low = <<"50%">>, Low = <<"50%">>,

View File

@ -0,0 +1,5 @@
Improve Kafka connector error logs.
Previously, specific error details, such as unreachable advertised listeners, were not logged.
Now, error details are captured in the logs to provide more diagnostic information.
To manage log verbosity, only the first error is logged, accompanied by the total number of errors.