feat(rule engine SQL): add an `unescape` function

The added `unescape` function unescapes escape sequences, transforming
them back to their represented characters. The following escape
sequences are supported:

- Standard C escape sequences:
  - `\n` for newline (LF)
  - `\t` for horizontal tab (HT)
  - `\r` for carriage return (CR)
  - `\b` for backspace (BS)
  - `\f` for formfeed (FF)
  - `\v` for vertical tab (VT)
  - `\'` for single quote (')
  - `\"` for double quote (")
  - `\\` for backslash (\)
  - `\?` for question mark (?)
  - `\a` for alert (bell, BEL)

- Hexadecimal escape codes:
  - `\xH...` where `H...` is one or more hexadecimal digits (0-9, A-F,
    a-f), allowing for the encoding of arbitrary Unicode code points.

If an escape sequence is not recognized, or if the hexadecimal escape
does not form a valid Unicode character, the function generates an
exception.

Fixes:
https://github.com/emqx/emqx/issues/12460
https://emqx.atlassian.net/browse/EMQX-11847
This commit is contained in:
Kjell Winblad 2024-03-08 11:52:34 +01:00
parent 060e02c4c4
commit 5a6f96212d
2 changed files with 188 additions and 1 deletions

View File

@ -161,7 +161,8 @@
join_to_string/2,
join_to_sql_values_string/1,
jq/2,
jq/3
jq/3,
unescape/1
]).
%% Map Funcs
@ -937,6 +938,138 @@ jq(FilterProgram, JSONBin) ->
])
).
%% Unescape the escape sequences in a UTF-8 binary and return the result as a
%% UTF-8 binary.
%%
%% Throws `{invalid_unicode_character, Details}` when the input is not valid
%% UTF-8, or when the unescaped code points cannot be encoded as UTF-8 (for
%% example a hex escape that is not a valid Unicode code point). Exceptions
%% thrown by `unescape_string/1` propagate unchanged.
-spec unescape(binary()) -> binary().
unescape(Bin) when is_binary(Bin) ->
    case unicode:characters_to_list(Bin, utf8) of
        UnicodeList when is_list(UnicodeList) ->
            encode_unescaped(unescape_string(UnicodeList));
        InvalidInput ->
            %% `{error, _, _}` or `{incomplete, _, _}`: the input binary is not
            %% well-formed UTF-8, so there is no code point list to unescape.
            throw({invalid_unicode_character, InvalidInput})
    end.

%% Encode the unescaped code points as UTF-8. The list holds only integers, so
%% the input encoding argument has no binaries to apply to.
encode_unescaped(UnescapedUnicodeList) ->
    case unicode:characters_to_binary(UnescapedUnicodeList, utf32, utf8) of
        Out when is_binary(Out) ->
            Out;
        Error ->
            throw({invalid_unicode_character, Error})
    end.
%% Replace every supported escape sequence in `Input`, a list of Unicode code
%% points, with the code point it denotes, and return the resulting list.
%%
%% Throws `{unrecognized_escape_sequence, Bin}` when a backslash is followed by
%% a character that does not start a supported escape sequence; `Bin` is the
%% UTF-8 encoding of the backslash and that character.
unescape_string(Input) -> unescape_string(Input, []).

%% `Acc` collects the output code points in reverse order.
unescape_string([], Acc) ->
    lists:reverse(Acc);
unescape_string([$\\, $\\ | Rest], Acc) ->
    unescape_string(Rest, [$\\ | Acc]);
unescape_string([$\\, $n | Rest], Acc) ->
    unescape_string(Rest, [$\n | Acc]);
unescape_string([$\\, $t | Rest], Acc) ->
    unescape_string(Rest, [$\t | Acc]);
unescape_string([$\\, $r | Rest], Acc) ->
    unescape_string(Rest, [$\r | Acc]);
unescape_string([$\\, $b | Rest], Acc) ->
    unescape_string(Rest, [$\b | Acc]);
unescape_string([$\\, $f | Rest], Acc) ->
    unescape_string(Rest, [$\f | Acc]);
unescape_string([$\\, $v | Rest], Acc) ->
    unescape_string(Rest, [$\v | Acc]);
unescape_string([$\\, $' | Rest], Acc) ->
    unescape_string(Rest, [$' | Acc]);
unescape_string([$\\, $" | Rest], Acc) ->
    unescape_string(Rest, [$" | Acc]);
unescape_string([$\\, $? | Rest], Acc) ->
    unescape_string(Rest, [$? | Acc]);
unescape_string([$\\, $a | Rest], Acc) ->
    %% Erlang has no `\a` escape: `$\a` is simply the letter `a`. The alert
    %% (BEL) character therefore has to be given as its code point.
    unescape_string(Rest, [7 | Acc]);
%% Start of HEX escape code: `\x` followed by at least one hex digit
unescape_string([$\\, $x | [Digit | _] = HexStringStart], Acc) when
    is_integer(Digit),
    ((Digit >= $0 andalso Digit =< $9) orelse
        (Digit >= $A andalso Digit =< $F) orelse
        (Digit >= $a andalso Digit =< $f))
->
    unescape_handle_hex_string(HexStringStart, Acc);
%% We treat all other escape sequences as not valid input to leave room for
%% extending the function to support more escape codes
unescape_string([$\\, X | _Rest], _Acc) ->
    %% X may be any code point, so encode with the unicode module;
    %% list_to_binary/1 would raise badarg for X > 255.
    erlang:throw({unrecognized_escape_sequence, unicode:characters_to_binary([$\\, X])});
%% Any other code point is copied as is. This includes a lone backslash at the
%% very end of the input, which matches none of the clauses above.
unescape_string([First | Rest], Acc) ->
    unescape_string(Rest, [First | Acc]).
%% Consume the hex digits at the head of `HexStringStart`, push the number they
%% encode onto `Acc` as a single code point, and continue unescaping whatever
%% follows the digits.
unescape_handle_hex_string(HexStringStart, Acc) ->
    {AfterDigits, CodePoint} = parse_hex_string(HexStringStart),
    unescape_string(AfterDigits, [CodePoint | Acc]).
%% Split the longest run of leading hex digits off the input and return
%% `{Remaining, Value}`, where `Value` is the integer those digits encode in
%% base 16 and `Remaining` is the input after the digits.
%%
%% The input is expected to start with a hex digit; with no leading digit
%% list_to_integer/2 is called on an empty list.
parse_hex_string(SeqStartingWithHexDigit) ->
    {HexDigits, Remaining} = lists:splitwith(fun is_hex_digit/1, SeqStartingWithHexDigit),
    {Remaining, list_to_integer(HexDigits, 16)}.
%% Return true when the argument is the character code of a hexadecimal digit
%% (0-9, A-F or a-f) and false for anything else.
%% The is_integer/1 test keeps floats such as 48.0 from passing the range
%% comparisons.
is_hex_digit(Char) when is_integer(Char), Char >= $0, Char =< $9 -> true;
is_hex_digit(Char) when is_integer(Char), Char >= $A, Char =< $F -> true;
is_hex_digit(Char) when is_integer(Char), Char >= $a, Char =< $f -> true;
is_hex_digit(_) -> false.
%%------------------------------------------------------------------------------
%% Array Funcs
%%------------------------------------------------------------------------------

View File

@ -736,6 +736,60 @@ t_regex_replace(_) ->
?assertEqual(<<"aebed">>, apply_func(regex_replace, [<<"accbcd">>, <<"c+">>, <<"e">>])),
?assertEqual(<<"a[cc]b[c]d">>, apply_func(regex_replace, [<<"accbcd">>, <<"c+">>, <<"[&]">>])).
%% Checks the single-character escape sequences, pass-through of text without
%% escapes, rejection of an unknown escape, and basic hexadecimal escapes.
t_unescape(_) ->
    ?assertEqual(<<"\n">>, emqx_rule_funcs:unescape(<<"\\n">>)),
    ?assertEqual(<<"\t">>, emqx_rule_funcs:unescape(<<"\\t">>)),
    ?assertEqual(<<"\r">>, emqx_rule_funcs:unescape(<<"\\r">>)),
    ?assertEqual(<<"\b">>, emqx_rule_funcs:unescape(<<"\\b">>)),
    ?assertEqual(<<"\f">>, emqx_rule_funcs:unescape(<<"\\f">>)),
    ?assertEqual(<<"\v">>, emqx_rule_funcs:unescape(<<"\\v">>)),
    ?assertEqual(<<"'">>, emqx_rule_funcs:unescape(<<"\\'">>)),
    ?assertEqual(<<"\"">>, emqx_rule_funcs:unescape(<<"\\\"">>)),
    ?assertEqual(<<"?">>, emqx_rule_funcs:unescape(<<"\\?">>)),
    % Alert (BEL) is code point 7. It is written as <<7>> because Erlang has
    % no `\a` escape: the literal <<"\a">> is just <<"a">>.
    ?assertEqual(<<7>>, emqx_rule_funcs:unescape(<<"\\a">>)),
    % Test escaping backslash itself
    ?assertEqual(<<"\\">>, emqx_rule_funcs:unescape(<<"\\\\">>)),
    % Test a string without any escape sequences
    ?assertEqual(<<"Hello, World!">>, emqx_rule_funcs:unescape(<<"Hello, World!">>)),
    % Test a string with escape sequences
    ?assertEqual(<<"Hello,\t World\n!">>, emqx_rule_funcs:unescape(<<"Hello,\\t World\\n!">>)),
    % Test unrecognized escape sequence (should throw an error)
    ?assertException(
        throw, {unrecognized_escape_sequence, <<$\\, $L>>}, emqx_rule_funcs:unescape(<<"\\L">>)
    ),
    % Test hexadecimal escape sequences
    % Newline on its own
    ?assertEqual(<<"\n">>, emqx_rule_funcs:unescape(<<"\\x0A">>)),
    % Newline at the end of the string
    ?assertEqual(<<"hej\n">>, emqx_rule_funcs:unescape(<<"hej\\x0A">>)),
    % Newline at the start of the string
    ?assertEqual(<<"\nhej">>, emqx_rule_funcs:unescape(<<"\\x0Ahej">>)),
    % Newline in the middle of the string
    ?assertEqual(<<"hej\nhej">>, emqx_rule_funcs:unescape(<<"hej\\x0Ahej">>)),
    % "ABC"
    ?assertEqual(<<"ABC">>, emqx_rule_funcs:unescape(<<"\\x41\\x42\\x43">>)),
    % "\xFF" = 255 in decimal
    ?assertEqual(<<"\xFF"/utf8>>, emqx_rule_funcs:unescape(<<"\\xFF">>)),
    % "W" = \x57
    ?assertEqual(<<"Hello, World!">>, emqx_rule_funcs:unescape(<<"Hello, \\x57orld!">>)).
%% Checks hexadecimal escapes: leading zeros, code points outside ASCII,
%% `\x` without a hex digit, and a value that is not a valid code point.
t_unescape_hex(_) ->
    ?assertEqual(<<"A"/utf8>>, emqx_rule_funcs:unescape(<<"\\x41">>)),
    ?assertEqual(<<"Hello"/utf8>>, emqx_rule_funcs:unescape(<<"\\x48\\x65\\x6c\\x6c\\x6f">>)),
    ?assertEqual(<<"A"/utf8>>, emqx_rule_funcs:unescape(<<"\\x0041">>)),
    % U+20AC (euro sign), written as an escape so the expected value does not
    % depend on the encoding of this source file
    ?assertEqual(<<"\x{20AC}"/utf8>>, emqx_rule_funcs:unescape(<<"\\x20AC">>)),
    % U+2764 (heavy black heart)
    ?assertEqual(<<"\x{2764}"/utf8>>, emqx_rule_funcs:unescape(<<"\\x2764">>)),
    % `\x` must be followed by at least one hex digit
    ?assertException(
        throw, {unrecognized_escape_sequence, <<"\\x">>}, emqx_rule_funcs:unescape(<<"\\xG1">>)
    ),
    % 16#11000000 is above the highest Unicode code point (16#10FFFF)
    ?assertException(
        throw, {invalid_unicode_character, _}, emqx_rule_funcs:unescape(<<"\\x11000000">>)
    ),
    ?assertEqual(
        <<"Hello, 世界"/utf8>>, emqx_rule_funcs:unescape(<<"Hello, \\x00004E16\\x0000754C">>)
    ).
%% Convert `JSONString` to a binary, run it through the `json_decode` rule
%% function via apply_func/2, and return the result wrapped in a
%% single-element list.
jq_1_elm_res(JSONString) ->
    Decoded = apply_func(json_decode, [list_to_binary(JSONString)]),
    [Decoded].