feat(rule engine SQL): add an `unescape` function
The added `unescape` function unescapes escape sequences, transforming them back into the characters they represent. The following escape sequences are supported:

- Standard C escape sequences:
  - `\n` for newline (LF)
  - `\t` for horizontal tab (HT)
  - `\r` for carriage return (CR)
  - `\b` for backspace (BS)
  - `\f` for formfeed (FF)
  - `\v` for vertical tab (VT)
  - `\'` for single quote (')
  - `\"` for double quote (")
  - `\\` for backslash (\)
  - `\?` for question mark (?)
  - `\a` for alert (bell, BEL)
- Hexadecimal escape codes:
  - `\xH...` where `H...` is one or more hexadecimal digits (0-9, A-F, a-f), allowing for the encoding of arbitrary Unicode (UTF-32) code points.

If an escape sequence is not recognized, or if the hexadecimal escape does not form a valid Unicode character, the function raises an exception.

Fixes:
https://github.com/emqx/emqx/issues/12460
https://emqx.atlassian.net/browse/EMQX-11847
This commit is contained in:
parent
060e02c4c4
commit
5a6f96212d
|
@ -161,7 +161,8 @@
|
|||
join_to_string/2,
|
||||
join_to_sql_values_string/1,
|
||||
jq/2,
|
||||
jq/3
|
||||
jq/3,
|
||||
unescape/1
|
||||
]).
|
||||
|
||||
%% Map Funcs
|
||||
|
@ -937,6 +938,138 @@ jq(FilterProgram, JSONBin) ->
|
|||
])
|
||||
).
|
||||
|
||||
%% @doc Replace the escape sequences in a UTF-8 encoded binary with the
%% characters they denote and return the result as a UTF-8 encoded binary.
%%
%% Throws `{invalid_unicode_character, Details}' when the input is not valid
%% UTF-8, or when the unescaped code points cannot be encoded as UTF-8 (for
%% example a hexadecimal escape outside the Unicode range). `Details' is the
%% error/incomplete tuple returned by the `unicode' module. Exceptions thrown
%% by `unescape_string/1' for unrecognized escape sequences propagate as-is.
unescape(Bin) when is_binary(Bin) ->
    case unicode:characters_to_list(Bin, utf8) of
        UnicodeList when is_list(UnicodeList) ->
            UnescapedUnicodeList = unescape_string(UnicodeList),
            case unicode:characters_to_binary(UnescapedUnicodeList, utf32, utf8) of
                Out when is_binary(Out) ->
                    Out;
                Error ->
                    throw({invalid_unicode_character, Error})
            end;
        DecodeError ->
            %% characters_to_list/2 returns an error or incomplete tuple
            %% instead of a list when Bin is not valid UTF-8. Report it here
            %% rather than letting the tuple crash unescape_string/1 with a
            %% function_clause error.
            throw({invalid_unicode_character, DecodeError})
    end.
|
||||
|
||||
%% Unescape a string given as a list of Unicode code points.
%%
%% Returns a list of code points in which every supported escape sequence has
%% been replaced by the character it denotes. Throws
%% `{unrecognized_escape_sequence, Bin}' when a backslash is followed by a
%% character that does not start a supported sequence; `Bin' is the UTF-8
%% encoding of the backslash and that character.
unescape_string(Input) -> unescape_string(Input, []).

%% Acc collects the output code points in reverse order.
unescape_string([], Acc) ->
    lists:reverse(Acc);
unescape_string([$\\, $\\ | Rest], Acc) ->
    unescape_string(Rest, [$\\ | Acc]);
unescape_string([$\\, $n | Rest], Acc) ->
    unescape_string(Rest, [$\n | Acc]);
unescape_string([$\\, $t | Rest], Acc) ->
    unescape_string(Rest, [$\t | Acc]);
unescape_string([$\\, $r | Rest], Acc) ->
    unescape_string(Rest, [$\r | Acc]);
unescape_string([$\\, $b | Rest], Acc) ->
    unescape_string(Rest, [$\b | Acc]);
unescape_string([$\\, $f | Rest], Acc) ->
    unescape_string(Rest, [$\f | Acc]);
unescape_string([$\\, $v | Rest], Acc) ->
    unescape_string(Rest, [$\v | Acc]);
unescape_string([$\\, $' | Rest], Acc) ->
    unescape_string(Rest, [$\' | Acc]);
unescape_string([$\\, $" | Rest], Acc) ->
    unescape_string(Rest, [$\" | Acc]);
unescape_string([$\\, $? | Rest], Acc) ->
    unescape_string(Rest, [$\? | Acc]);
unescape_string([$\\, $a | Rest], Acc) ->
    %% Alert (BEL). Erlang has no `\a' escape: the literal `$\a' is simply the
    %% letter `a', so the code point is spelled out as an integer.
    unescape_string(Rest, [7 | Acc]);
%% Start of HEX escape code: `\x' followed by at least one hexadecimal digit
unescape_string([$\\, $x | [H | _] = HexStringStart], Acc) when
    is_integer(H),
    ((H >= $0 andalso H =< $9) orelse
        (H >= $A andalso H =< $F) orelse
        (H >= $a andalso H =< $f))
->
    unescape_handle_hex_string(HexStringStart, Acc);
%% We treat all other escape sequences as not valid input to leave room for
%% extending the function to support more escape codes
unescape_string([$\\, X | _Rest], _Acc) ->
    %% X can be any code point, so the pair is encoded as UTF-8;
    %% list_to_binary/1 would raise badarg for code points above 255.
    erlang:throw({unrecognized_escape_sequence, unicode:characters_to_binary([$\\, X])});
%% Every other character is copied unchanged. A single backslash that is the
%% very last character of the input also ends up here and is kept as-is.
unescape_string([First | Rest], Acc) ->
    unescape_string(Rest, [First | Acc]).
|
||||
|
||||
%% Consume the hexadecimal digits at the head of HexDigitsAndTail, push the
%% number they encode onto the reversed output Output, and continue
%% unescaping whatever follows the digits.
unescape_handle_hex_string(HexDigitsAndTail, Output) ->
    {Tail, CodePoint} = parse_hex_string(HexDigitsAndTail),
    unescape_string(Tail, [CodePoint | Output]).
|
||||
|
||||
%% Read the leading run of hexadecimal digits of a code point list.
%%
%% Returns `{Remaining, Value}' where `Value' is the integer written by the
%% digits and `Remaining' is the input from the first non-digit onwards
%% (`[]' when the digits run to the end of the input).
parse_hex_string(SeqStartingWithHexDigit) ->
    parse_hex_string(SeqStartingWithHexDigit, []).

%% SeenReversed holds digits that were already consumed, most recent first;
%% they are put back in reading order in front of the digits found here.
parse_hex_string(Chars, SeenReversed) ->
    {Digits, Remaining} = lists:splitwith(fun is_hex_digit/1, Chars),
    {Remaining, list_to_integer(lists:reverse(SeenReversed, Digits), 16)}.
|
||||
|
||||
%% True when the argument is the character code of a hexadecimal digit
%% (0-9, A-F or a-f); false for any other term.
is_hex_digit(Char) when is_integer(Char), Char >= $0, Char =< $9 -> true;
is_hex_digit(Char) when is_integer(Char), Char >= $A, Char =< $F -> true;
is_hex_digit(Char) when is_integer(Char), Char >= $a, Char =< $f -> true;
is_hex_digit(_) -> false.
|
||||
|
||||
%%------------------------------------------------------------------------------
|
||||
%% Array Funcs
|
||||
%%------------------------------------------------------------------------------
|
||||
|
|
|
@ -736,6 +736,60 @@ t_regex_replace(_) ->
|
|||
?assertEqual(<<"aebed">>, apply_func(regex_replace, [<<"accbcd">>, <<"c+">>, <<"e">>])),
|
||||
?assertEqual(<<"a[cc]b[c]d">>, apply_func(regex_replace, [<<"accbcd">>, <<"c+">>, <<"[&]">>])).
|
||||
|
||||
%% Checks emqx_rule_funcs:unescape/1 against the single-character escapes,
%% plain text, an unrecognized escape and short hexadecimal escapes.
t_unescape(_) ->
    ?assertEqual(<<"\n">>, emqx_rule_funcs:unescape(<<"\\n">>)),
    ?assertEqual(<<"\t">>, emqx_rule_funcs:unescape(<<"\\t">>)),
    ?assertEqual(<<"\r">>, emqx_rule_funcs:unescape(<<"\\r">>)),
    ?assertEqual(<<"\b">>, emqx_rule_funcs:unescape(<<"\\b">>)),
    ?assertEqual(<<"\f">>, emqx_rule_funcs:unescape(<<"\\f">>)),
    ?assertEqual(<<"\v">>, emqx_rule_funcs:unescape(<<"\\v">>)),
    ?assertEqual(<<"'">>, emqx_rule_funcs:unescape(<<"\\'">>)),
    ?assertEqual(<<"\"">>, emqx_rule_funcs:unescape(<<"\\\"">>)),
    ?assertEqual(<<"?">>, emqx_rule_funcs:unescape(<<"\\?">>)),
    % Alert (BEL) is code point 7. Erlang has no `\a' escape, so the literal
    % <<"\a">> is just <<"a">> and cannot be used as the expected value.
    ?assertEqual(<<7>>, emqx_rule_funcs:unescape(<<"\\a">>)),
    % Test escaping backslash itself
    ?assertEqual(<<"\\">>, emqx_rule_funcs:unescape(<<"\\\\">>)),
    % Test a string without any escape sequences
    ?assertEqual(<<"Hello, World!">>, emqx_rule_funcs:unescape(<<"Hello, World!">>)),
    % Test a string with escape sequences
    ?assertEqual(<<"Hello,\t World\n!">>, emqx_rule_funcs:unescape(<<"Hello,\\t World\\n!">>)),
    % Test unrecognized escape sequence (should throw an error)
    ?assertException(
        throw, {unrecognized_escape_sequence, <<$\\, $L>>}, emqx_rule_funcs:unescape(<<"\\L">>)
    ),
    % Test hexadecimal escape sequences

    % Newline as the whole input
    ?assertEqual(<<"\n">>, emqx_rule_funcs:unescape(<<"\\x0A">>)),
    % Newline at the end of the input
    ?assertEqual(<<"hej\n">>, emqx_rule_funcs:unescape(<<"hej\\x0A">>)),
    % Newline at the start; `h' is not a hex digit and ends the escape
    ?assertEqual(<<"\nhej">>, emqx_rule_funcs:unescape(<<"\\x0Ahej">>)),
    % Newline in the middle of the input
    ?assertEqual(<<"hej\nhej">>, emqx_rule_funcs:unescape(<<"hej\\x0Ahej">>)),
    % "ABC"
    ?assertEqual(<<"ABC">>, emqx_rule_funcs:unescape(<<"\\x41\\x42\\x43">>)),
    % "\xFF" = 255 in decimal, expected as the UTF-8 encoding of code point 255
    ?assertEqual(<<"\xFF"/utf8>>, emqx_rule_funcs:unescape(<<"\\xFF">>)),
    % "W" = \x57
    ?assertEqual(<<"Hello, World!">>, emqx_rule_funcs:unescape(<<"Hello, \\x57orld!">>)).
|
||||
|
||||
%% Checks hexadecimal escapes of different lengths in
%% emqx_rule_funcs:unescape/1, including the two failure modes.
t_unescape_hex(_) ->
    % {Expected, EscapedInput} pairs that must unescape successfully
    Accepted = [
        {<<"A"/utf8>>, <<"\\x41">>},
        {<<"Hello"/utf8>>, <<"\\x48\\x65\\x6c\\x6c\\x6f">>},
        {<<"A"/utf8>>, <<"\\x0041">>},
        {<<"€"/utf8>>, <<"\\x20AC">>},
        {<<"❤"/utf8>>, <<"\\x2764">>},
        {<<"Hello, 世界"/utf8>>, <<"Hello, \\x00004E16\\x0000754C">>}
    ],
    lists:foreach(
        fun({Expected, Escaped}) ->
            ?assertEqual(Expected, emqx_rule_funcs:unescape(Escaped))
        end,
        Accepted
    ),
    % `G' is not a hexadecimal digit, so `\x' is reported as unrecognized
    ?assertException(
        throw, {unrecognized_escape_sequence, <<"\\x">>}, emqx_rule_funcs:unescape(<<"\\xG1">>)
    ),
    % 16#11000000 lies above the Unicode code point range
    ?assertException(
        throw, {invalid_unicode_character, _}, emqx_rule_funcs:unescape(<<"\\x11000000">>)
    ).
|
||||
|
||||
%% Convert JSONString (a character list) to a binary, run it through the
%% `json_decode' rule function via apply_func/2 and wrap the decoded term in a
%% one-element list.
%% NOTE(review): presumably the expected shape of a jq result that yields a
%% single element - confirm against the jq test cases.
jq_1_elm_res(JSONString) ->
    Decoded = apply_func(json_decode, [list_to_binary(JSONString)]),
    [Decoded].
|
||||
|
|
Loading…
Reference in New Issue