[njs] Improved processing of invalid surrogate pairs in strings.
Alexander Borisov
alexander.borisov at nginx.com
Wed May 29 14:55:30 UTC 2019
details: https://hg.nginx.org/njs/rev/96dc9de9f92c
branches:
changeset: 991:96dc9de9f92c
user: Alexander Borisov <alexander.borisov at nginx.com>
date: Tue May 28 20:49:58 2019 +0300
description:
Improved processing of invalid surrogate pairs in strings.
Previously, an exception was thrown on invalid surrogate pairs.
Now, all such pairs are converted to the replacement character (U+FFFD).
This closes issue #170 on GitHub.
diffstat:
njs/njs_parser_terminal.c | 51 +++++++++++++++++++++++++++++++---------------
njs/test/njs_unit_test.c | 20 +++++++++++++----
nxt/nxt_utf8.h | 2 +
3 files changed, 51 insertions(+), 22 deletions(-)
diffs (129 lines):
diff -r 8e7e7ba29c71 -r 96dc9de9f92c njs/njs_parser_terminal.c
--- a/njs/njs_parser_terminal.c Thu May 23 18:03:46 2019 +0300
+++ b/njs/njs_parser_terminal.c Tue May 28 20:49:58 2019 +0300
@@ -1049,12 +1049,27 @@ njs_parser_escape_string_create(njs_vm_t
}
if (cp_pair != 0) {
- cp = njs_string_surrogate_pair(cp_pair, cp);
+ if (nxt_fast_path(cp >= 0xdc00 && cp <= 0xdfff)) {
+ cp = njs_string_surrogate_pair(cp_pair, cp);
+
+ } else if (nxt_slow_path(cp >= 0xd800 && cp <= 0xdbff)) {
+ cp = NXT_UTF8_REPLACEMENT;
+
+ dst = nxt_utf8_encode(dst, (uint32_t) cp);
+
+ } else {
+ dst = nxt_utf8_encode(dst, NXT_UTF8_REPLACEMENT);
+ }
+
cp_pair = 0;
} else if (cp >= 0xd800 && cp <= 0xdfff) {
- cp_pair = cp;
- continue;
+ if (cp <= 0xdbff && src[0] == '\\' && src[1] == 'u') {
+ cp_pair = cp;
+ continue;
+ }
+
+ cp = NXT_UTF8_REPLACEMENT;
}
dst = nxt_utf8_encode(dst, (uint32_t) cp);
@@ -1183,20 +1198,29 @@ njs_parser_escape_string_calc_length(njs
}
if (cp_pair != 0) {
- if (nxt_slow_path(cp < 0xdc00 || cp > 0xdfff)) {
- goto invalid_pair;
+ if (nxt_fast_path(cp >= 0xdc00 && cp <= 0xdfff)) {
+ cp = njs_string_surrogate_pair(cp_pair, cp);
+
+ } else if (nxt_slow_path(cp >= 0xd800 && cp <= 0xdbff)) {
+ cp = NXT_UTF8_REPLACEMENT;
+
+ size += nxt_utf8_size(cp);
+ length++;
+
+ } else {
+ size += nxt_utf8_size(NXT_UTF8_REPLACEMENT);
+ length++;
}
- cp = njs_string_surrogate_pair(cp_pair, cp);
cp_pair = 0;
} else if (cp >= 0xd800 && cp <= 0xdfff) {
- if (nxt_slow_path(cp > 0xdbff || src[0] != '\\' || src[1] != 'u')) {
- goto invalid_pair;
+ if (cp <= 0xdbff && src[0] == '\\' && src[1] == 'u') {
+ cp_pair = cp;
+ continue;
}
- cp_pair = cp;
- continue;
+ cp = NXT_UTF8_REPLACEMENT;
}
size += nxt_utf8_size(cp);
@@ -1214,11 +1238,4 @@ invalid:
njs_parser_text(parser));
return NJS_ERROR;
-
-invalid_pair:
-
- njs_parser_syntax_error(vm, parser, "Invalid surrogate pair \"%V\"",
- njs_parser_text(parser));
-
- return NJS_ERROR;
}
diff -r 8e7e7ba29c71 -r 96dc9de9f92c njs/test/njs_unit_test.c
--- a/njs/test/njs_unit_test.c Thu May 23 18:03:46 2019 +0300
+++ b/njs/test/njs_unit_test.c Tue May 28 20:49:58 2019 +0300
@@ -4448,15 +4448,25 @@ static njs_unit_test_t njs_test[] =
nxt_string("1") },
{ nxt_string("'\\ud83d abc \\udc4d'"),
- nxt_string("SyntaxError: Invalid surrogate pair "
- "\"\\ud83d abc \\udc4d\" in 1") },
+ nxt_string("� abc �") },
{ nxt_string("'\\ud83d'"),
- nxt_string("SyntaxError: Invalid surrogate pair \"\\ud83d\" in 1") },
+ nxt_string("�") },
{ nxt_string("'\\ud83d\\uabcd'"),
- nxt_string("SyntaxError: Invalid surrogate pair "
- "\"\\ud83d\\uabcd\" in 1") },
+ nxt_string("�ꯍ") },
+
+ { nxt_string("'\\u{d800}\\u{dB00}'"),
+ nxt_string("��") },
+
+ { nxt_string("'\\u{d800}\\u{d7ff}'"),
+ nxt_string("�") },
+
+ { nxt_string("'\\u{d800}['"),
+ nxt_string("�[") },
+
+ { nxt_string("'\\u{D800}\\u{'"),
+ nxt_string("SyntaxError: Invalid Unicode code point \"\\u{D800}\\u{\" in 1") },
{ nxt_string("''.hasOwnProperty('length')"),
nxt_string("true") },
diff -r 8e7e7ba29c71 -r 96dc9de9f92c nxt/nxt_utf8.h
--- a/nxt/nxt_utf8.h Thu May 23 18:03:46 2019 +0300
+++ b/nxt/nxt_utf8.h Tue May 28 20:49:58 2019 +0300
@@ -15,6 +15,8 @@
*/
#define NXT_UTF8_SORT_INVALID 0x0EEE0EEE
+#define NXT_UTF8_REPLACEMENT 0xFFFD
+
NXT_EXPORT u_char *nxt_utf8_encode(u_char *p, uint32_t u);
NXT_EXPORT uint32_t nxt_utf8_decode(const u_char **start, const u_char *end);
More information about the nginx-devel
mailing list