[njs] Improved processing of invalid surrogate pairs in strings.

Alexander Borisov alexander.borisov at nginx.com
Wed May 29 14:55:30 UTC 2019


details:   https://hg.nginx.org/njs/rev/96dc9de9f92c
branches:  
changeset: 991:96dc9de9f92c
user:      Alexander Borisov <alexander.borisov at nginx.com>
date:      Tue May 28 20:49:58 2019 +0300
description:
Improved processing of invalid surrogate pairs in strings.

Previously, an exception was thrown on invalid surrogate pairs.
Now, all such pairs are converted to the replacement character (U+FFFD).

This closes issue #170 on GitHub.

diffstat:

 njs/njs_parser_terminal.c |  51 +++++++++++++++++++++++++++++++---------------
 njs/test/njs_unit_test.c  |  20 +++++++++++++----
 nxt/nxt_utf8.h            |   2 +
 3 files changed, 51 insertions(+), 22 deletions(-)

diffs (129 lines):

diff -r 8e7e7ba29c71 -r 96dc9de9f92c njs/njs_parser_terminal.c
--- a/njs/njs_parser_terminal.c	Thu May 23 18:03:46 2019 +0300
+++ b/njs/njs_parser_terminal.c	Tue May 28 20:49:58 2019 +0300
@@ -1049,12 +1049,27 @@ njs_parser_escape_string_create(njs_vm_t
         }
 
         if (cp_pair != 0) {
-            cp = njs_string_surrogate_pair(cp_pair, cp);
+            if (nxt_fast_path(cp >= 0xdc00 && cp <= 0xdfff)) {
+                cp = njs_string_surrogate_pair(cp_pair, cp);
+
+            } else if (nxt_slow_path(cp >= 0xd800 && cp <= 0xdbff)) {
+                cp = NXT_UTF8_REPLACEMENT;
+
+                dst = nxt_utf8_encode(dst, (uint32_t) cp);
+
+            } else {
+                dst = nxt_utf8_encode(dst, NXT_UTF8_REPLACEMENT);
+            }
+
             cp_pair = 0;
 
         } else if (cp >= 0xd800 && cp <= 0xdfff) {
-            cp_pair = cp;
-            continue;
+            if (cp <= 0xdbff && src[0] == '\\' && src[1] == 'u') {
+                cp_pair = cp;
+                continue;
+            }
+
+            cp = NXT_UTF8_REPLACEMENT;
         }
 
         dst = nxt_utf8_encode(dst, (uint32_t) cp);
@@ -1183,20 +1198,29 @@ njs_parser_escape_string_calc_length(njs
         }
 
         if (cp_pair != 0) {
-            if (nxt_slow_path(cp < 0xdc00 || cp > 0xdfff)) {
-                goto invalid_pair;
+            if (nxt_fast_path(cp >= 0xdc00 && cp <= 0xdfff)) {
+                cp = njs_string_surrogate_pair(cp_pair, cp);
+
+            } else if (nxt_slow_path(cp >= 0xd800 && cp <= 0xdbff)) {
+                cp = NXT_UTF8_REPLACEMENT;
+
+                size += nxt_utf8_size(cp);
+                length++;
+
+            } else {
+                size += nxt_utf8_size(NXT_UTF8_REPLACEMENT);
+                length++;
             }
 
-            cp = njs_string_surrogate_pair(cp_pair, cp);
             cp_pair = 0;
 
         } else if (cp >= 0xd800 && cp <= 0xdfff) {
-            if (nxt_slow_path(cp > 0xdbff || src[0] != '\\' || src[1] != 'u')) {
-                goto invalid_pair;
+            if (cp <= 0xdbff && src[0] == '\\' && src[1] == 'u') {
+                cp_pair = cp;
+                continue;
             }
 
-            cp_pair = cp;
-            continue;
+            cp = NXT_UTF8_REPLACEMENT;
         }
 
         size += nxt_utf8_size(cp);
@@ -1214,11 +1238,4 @@ invalid:
                             njs_parser_text(parser));
 
     return NJS_ERROR;
-
-invalid_pair:
-
-    njs_parser_syntax_error(vm, parser, "Invalid surrogate pair \"%V\"",
-                            njs_parser_text(parser));
-
-    return NJS_ERROR;
 }
diff -r 8e7e7ba29c71 -r 96dc9de9f92c njs/test/njs_unit_test.c
--- a/njs/test/njs_unit_test.c	Thu May 23 18:03:46 2019 +0300
+++ b/njs/test/njs_unit_test.c	Tue May 28 20:49:58 2019 +0300
@@ -4448,15 +4448,25 @@ static njs_unit_test_t  njs_test[] =
       nxt_string("1") },
 
     { nxt_string("'\\ud83d abc \\udc4d'"),
-      nxt_string("SyntaxError: Invalid surrogate pair "
-                 "\"\\ud83d abc \\udc4d\" in 1") },
+      nxt_string("� abc �") },
 
     { nxt_string("'\\ud83d'"),
-      nxt_string("SyntaxError: Invalid surrogate pair \"\\ud83d\" in 1") },
+      nxt_string("�") },
 
     { nxt_string("'\\ud83d\\uabcd'"),
-      nxt_string("SyntaxError: Invalid surrogate pair "
-                 "\"\\ud83d\\uabcd\" in 1") },
+      nxt_string("�ꯍ") },
+
+    { nxt_string("'\\u{d800}\\u{dB00}'"),
+      nxt_string("��") },
+
+    { nxt_string("'\\u{d800}\\u{d7ff}'"),
+      nxt_string("�퟿") },
+
+    { nxt_string("'\\u{d800}['"),
+      nxt_string("�[") },
+
+    { nxt_string("'\\u{D800}\\u{'"),
+      nxt_string("SyntaxError: Invalid Unicode code point \"\\u{D800}\\u{\" in 1") },
 
     { nxt_string("''.hasOwnProperty('length')"),
       nxt_string("true") },
diff -r 8e7e7ba29c71 -r 96dc9de9f92c nxt/nxt_utf8.h
--- a/nxt/nxt_utf8.h	Thu May 23 18:03:46 2019 +0300
+++ b/nxt/nxt_utf8.h	Tue May 28 20:49:58 2019 +0300
@@ -15,6 +15,8 @@
  */
 #define NXT_UTF8_SORT_INVALID  0x0EEE0EEE
 
+#define NXT_UTF8_REPLACEMENT   0xFFFD
+
 
 NXT_EXPORT u_char *nxt_utf8_encode(u_char *p, uint32_t u);
 NXT_EXPORT uint32_t nxt_utf8_decode(const u_char **start, const u_char *end);


More information about the nginx-devel mailing list