From 5a6f96212dc35ad2bcba4aad0e37813c002f24e0 Mon Sep 17 00:00:00 2001
From: Kjell Winblad
Date: Fri, 8 Mar 2024 11:52:34 +0100
Subject: [PATCH 1/2] feat(rule engine SQL): add an `unescape` function

The added `unescape` function unescapes escape sequences, transforming
them back to their represented characters. The following escape
sequences are supported:

- Standard C escape sequences:
  - `\n` for newline (LF)
  - `\t` for horizontal tab (HT)
  - `\r` for carriage return (CR)
  - `\b` for backspace (BS)
  - `\f` for formfeed (FF)
  - `\v` for vertical tab (VT)
  - `\'` for single quote (')
  - `\"` for double quote (")
  - `\\` for backslash (\)
  - `\?` for question mark (?)
  - `\a` for alert (bell, BEL)
- Hexadecimal escape codes:
  - `\xH...` where `H...` is one or more hexadecimal digits (0-9, A-F,
    a-f), allowing for the encoding of arbitrary Unicode code points.

If the input is not valid UTF-8, if an escape sequence is not
recognized, or if a hexadecimal escape does not form a valid Unicode
character, the function throws an exception.

Fixes:
https://github.com/emqx/emqx/issues/12460
https://emqx.atlassian.net/browse/EMQX-11847
---
 apps/emqx_rule_engine/src/emqx_rule_funcs.erl | 75 ++++++++++++++++++-
 .../test/emqx_rule_funcs_SUITE.erl            | 65 ++++++++++++++++
 2 files changed, 139 insertions(+), 1 deletion(-)

diff --git a/apps/emqx_rule_engine/src/emqx_rule_funcs.erl b/apps/emqx_rule_engine/src/emqx_rule_funcs.erl
index 3cebdd67a..c5a083ef4 100644
--- a/apps/emqx_rule_engine/src/emqx_rule_funcs.erl
+++ b/apps/emqx_rule_engine/src/emqx_rule_funcs.erl
@@ -161,7 +161,8 @@
     join_to_string/2,
     join_to_sql_values_string/1,
     jq/2,
-    jq/3
+    jq/3,
+    unescape/1
 ]).
 
 %% Map Funcs
@@ -937,6 +938,78 @@ jq(FilterProgram, JSONBin) ->
         ])
     ).
 
+%% Replaces the escape sequences in a UTF-8 binary with the characters they
+%% denote and returns the result as a UTF-8 binary.
+%%
+%% Supported sequences are the C-style escapes `\n', `\t', `\r', `\b', `\f',
+%% `\v', `\a', `\'', `\"', `\?' and `\\', and `\xH...' where `H...' is one or
+%% more hexadecimal digits giving a Unicode code point. As in C, a hexadecimal
+%% escape extends over every hexadecimal digit that follows it. A backslash
+%% that ends the input is copied unchanged.
+%%
+%% Throws `{unrecognized_escape_sequence, Seq}' when a backslash is followed by
+%% any other character, and `{invalid_unicode_character, Reason}' when the
+%% input is not valid UTF-8 or a hexadecimal escape is not a Unicode character.
+unescape(Bin) when is_binary(Bin) ->
+    CodePoints = unescape_ensure_unicode(unicode:characters_to_list(Bin, utf8)),
+    Unescaped = unescape_string(CodePoints),
+    unescape_ensure_unicode(unicode:characters_to_binary(Unescaped, utf32, utf8)).
+
+%% The `unicode' conversion functions report bad data by returning an
+%% `{error, _, _}' or `{incomplete, _, _}' tuple; turn that into a throw.
+unescape_ensure_unicode(Converted) when is_list(Converted); is_binary(Converted) ->
+    Converted;
+unescape_ensure_unicode(Error) ->
+    throw({invalid_unicode_character, Error}).
+
+unescape_string(Input) -> unescape_string(Input, []).
+
+%% Walks the code points, collecting the unescaped result in reverse order.
+unescape_string([], Acc) ->
+    lists:reverse(Acc);
+unescape_string([$\\, $x | Rest], Acc) ->
+    {Remaining, CodePoint} = parse_hex_string(Rest),
+    unescape_string(Remaining, [CodePoint | Acc]);
+unescape_string([$\\, Char | Rest], Acc) ->
+    unescape_string(Rest, [unescape_char(Char) | Acc]);
+unescape_string([Char | Rest], Acc) ->
+    unescape_string(Rest, [Char | Acc]).
+
+%% Maps the character after a backslash to the character it stands for.
+unescape_char($\\) -> $\\;
+unescape_char($n) -> $\n;
+unescape_char($t) -> $\t;
+unescape_char($r) -> $\r;
+unescape_char($b) -> $\b;
+unescape_char($f) -> $\f;
+unescape_char($v) -> $\v;
+unescape_char($') -> $\';
+unescape_char($") -> $\";
+unescape_char($?) -> $?;
+%% Erlang has no `\a' escape (`$\a' is the letter `a'), so BEL is written as 7.
+unescape_char($a) -> 7;
+%% We treat all other escape sequences as not valid input to leave room for
+%% extending the function to support more escape codes. The offending sequence
+%% is encoded with `unicode' because the character may lie outside Latin-1.
+unescape_char(Other) ->
+    throw({unrecognized_escape_sequence, unicode:characters_to_binary([$\\, Other])}).
+
+%% Reads the hexadecimal digits at the start of the input and returns the rest
+%% of the input together with the number they spell. `\x' must be followed by
+%% at least one digit.
+parse_hex_string(Input) ->
+    case lists:splitwith(fun is_hex_digit/1, Input) of
+        {[], _} ->
+            throw({unrecognized_escape_sequence, <<"\\x">>});
+        {HexDigits, Remaining} ->
+            {Remaining, list_to_integer(HexDigits, 16)}
+    end.
+
+is_hex_digit(Char) when Char >= $0, Char =< $9 -> true;
+is_hex_digit(Char) when Char >= $A, Char =< $F -> true;
+is_hex_digit(Char) when Char >= $a, Char =< $f -> true;
+is_hex_digit(_) -> false.
+
 %%------------------------------------------------------------------------------
 %% Array Funcs
 %%------------------------------------------------------------------------------
diff --git a/apps/emqx_rule_engine/test/emqx_rule_funcs_SUITE.erl b/apps/emqx_rule_engine/test/emqx_rule_funcs_SUITE.erl
index 073ac5bf6..b31889ab4 100644
--- a/apps/emqx_rule_engine/test/emqx_rule_funcs_SUITE.erl
+++ b/apps/emqx_rule_engine/test/emqx_rule_funcs_SUITE.erl
@@ -736,6 +736,71 @@ t_regex_replace(_) ->
     ?assertEqual(<<"aebed">>, apply_func(regex_replace, [<<"accbcd">>, <<"c+">>, <<"e">>])),
     ?assertEqual(<<"a[cc]b[c]d">>, apply_func(regex_replace, [<<"accbcd">>, <<"c+">>, <<"[&]">>])).
 
+t_unescape(_) ->
+    ?assertEqual(<<"\n">>, emqx_rule_funcs:unescape(<<"\\n">>)),
+    ?assertEqual(<<"\t">>, emqx_rule_funcs:unescape(<<"\\t">>)),
+    ?assertEqual(<<"\r">>, emqx_rule_funcs:unescape(<<"\\r">>)),
+    ?assertEqual(<<"\b">>, emqx_rule_funcs:unescape(<<"\\b">>)),
+    ?assertEqual(<<"\f">>, emqx_rule_funcs:unescape(<<"\\f">>)),
+    ?assertEqual(<<"\v">>, emqx_rule_funcs:unescape(<<"\\v">>)),
+    ?assertEqual(<<"'">>, emqx_rule_funcs:unescape(<<"\\'">>)),
+    ?assertEqual(<<"\"">>, emqx_rule_funcs:unescape(<<"\\\"">>)),
+    ?assertEqual(<<"?">>, emqx_rule_funcs:unescape(<<"\\?">>)),
+    % Erlang string literals have no "\a" escape, so BEL is written as 7
+    ?assertEqual(<<7>>, emqx_rule_funcs:unescape(<<"\\a">>)),
+    % Test escaping backslash itself
+    ?assertEqual(<<"\\">>, emqx_rule_funcs:unescape(<<"\\\\">>)),
+    % Test a string without any escape sequences
+    ?assertEqual(<<"Hello, World!">>, emqx_rule_funcs:unescape(<<"Hello, World!">>)),
+    % Test a string with escape sequences
+    ?assertEqual(<<"Hello,\t World\n!">>, emqx_rule_funcs:unescape(<<"Hello,\\t World\\n!">>)),
+    % Test unrecognized escape sequence (should throw an error)
+    ?assertException(
+        throw, {unrecognized_escape_sequence, <<$\\, $L>>}, emqx_rule_funcs:unescape(<<"\\L">>)
+    ),
+    % The reported sequence is UTF-8 encoded, also for characters outside Latin-1
+    ?assertException(
+        throw,
+        {unrecognized_escape_sequence, <<"\\世"/utf8>>},
+        emqx_rule_funcs:unescape(<<"\\世"/utf8>>)
+    ),
+    % Test input that is not valid UTF-8 (should throw an error)
+    ?assertException(
+        throw, {invalid_unicode_character, _}, emqx_rule_funcs:unescape(<<16#FF>>)
+    ),
+    % Test hexadecimal escape sequences
+
+    % Newline on its own
+    ?assertEqual(<<"\n">>, emqx_rule_funcs:unescape(<<"\\x0A">>)),
+    % Newline at the end
+    ?assertEqual(<<"hej\n">>, emqx_rule_funcs:unescape(<<"hej\\x0A">>)),
+    % Newline at the start
+    ?assertEqual(<<"\nhej">>, emqx_rule_funcs:unescape(<<"\\x0Ahej">>)),
+    % Newline in the middle
+    ?assertEqual(<<"hej\nhej">>, emqx_rule_funcs:unescape(<<"hej\\x0Ahej">>)),
+    % "ABC"
+    ?assertEqual(<<"ABC">>, emqx_rule_funcs:unescape(<<"\\x41\\x42\\x43">>)),
+    % "\xFF" = 255 in decimal
+    ?assertEqual(<<"\xFF"/utf8>>, emqx_rule_funcs:unescape(<<"\\xFF">>)),
+    % "W" = \x57
+    ?assertEqual(<<"Hello, World!">>, emqx_rule_funcs:unescape(<<"Hello, \\x57orld!">>)).
+
+t_unescape_hex(_) ->
+    ?assertEqual(<<"A"/utf8>>, emqx_rule_funcs:unescape(<<"\\x41">>)),
+    ?assertEqual(<<"Hello"/utf8>>, emqx_rule_funcs:unescape(<<"\\x48\\x65\\x6c\\x6c\\x6f">>)),
+    ?assertEqual(<<"A"/utf8>>, emqx_rule_funcs:unescape(<<"\\x0041">>)),
+    ?assertEqual(<<"€"/utf8>>, emqx_rule_funcs:unescape(<<"\\x20AC">>)),
+    ?assertEqual(<<"❤"/utf8>>, emqx_rule_funcs:unescape(<<"\\x2764">>)),
+    ?assertException(
+        throw, {unrecognized_escape_sequence, <<"\\x">>}, emqx_rule_funcs:unescape(<<"\\xG1">>)
+    ),
+    ?assertException(
+        throw, {invalid_unicode_character, _}, emqx_rule_funcs:unescape(<<"\\x11000000">>)
+    ),
+    ?assertEqual(
+        <<"Hello, 世界"/utf8>>, emqx_rule_funcs:unescape(<<"Hello, \\x00004E16\\x0000754C">>)
+    ).
+
 jq_1_elm_res(JSONString) ->
     Bin = list_to_binary(JSONString),
     [apply_func(json_decode, [Bin])].
From 69ddd51af1382610550248cd9d0ca56aeb6c9763 Mon Sep 17 00:00:00 2001
From: Kjell Winblad
Date: Fri, 8 Mar 2024 12:35:39 +0100
Subject: [PATCH 2/2] docs: add change log entry

---
 changes/ce/feat-12671.en.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changes/ce/feat-12671.en.md

diff --git a/changes/ce/feat-12671.en.md b/changes/ce/feat-12671.en.md
new file mode 100644
index 000000000..d49f11269
--- /dev/null
+++ b/changes/ce/feat-12671.en.md
@@ -0,0 +1 @@
+An `unescape` function has been added to the rule engine SQL language to handle expansion of escape sequences in strings. This addition has been done because string literals in the SQL language don't support any escape codes (e.g., `\n` and `\t`). This enhancement allows for more flexible string manipulation within SQL expressions.