[njs] Fixed parsing of surrogate pairs presented as UTF-16 escape sequences.

Alexander Borisov alexander.borisov at nginx.com
Tue Apr 23 14:47:26 UTC 2019


details:   https://hg.nginx.org/njs/rev/724c31e77d2a
branches:  
changeset: 921:724c31e77d2a
user:      Alexander Borisov <alexander.borisov at nginx.com>
date:      Mon Apr 22 16:23:50 2019 +0300
description:
Fixed parsing of surrogate pairs presented as UTF-16 escape sequences.

This closes issue #96 on GitHub.

diffstat:

 njs/njs_parser_terminal.c |  55 ++++++++++++++++++++++++++++++++++++++++------
 njs/test/njs_unit_test.c  |  17 ++++++++++++++
 2 files changed, 64 insertions(+), 8 deletions(-)

diffs (137 lines):

diff -r b3eb60707479 -r 724c31e77d2a njs/njs_parser_terminal.c
--- a/njs/njs_parser_terminal.c	Mon Apr 22 16:23:43 2019 +0300
+++ b/njs/njs_parser_terminal.c	Mon Apr 22 16:23:50 2019 +0300
@@ -926,7 +926,7 @@ njs_parser_escape_string_create(njs_vm_t
 {
     u_char        c, *start, *dst;
     size_t        size, length, hex_length;
-    uint64_t      cp;
+    uint64_t      cp, cp_pair;
     njs_ret_t     ret;
     nxt_str_t     *string;
     const u_char  *src, *end, *hex_end;
@@ -942,6 +942,7 @@ njs_parser_escape_string_create(njs_vm_t
     }
 
     dst = start;
+    cp_pair = 0;
 
     string = njs_parser_text(parser);
     src = string->start;
@@ -1041,6 +1042,23 @@ njs_parser_escape_string_create(njs_vm_t
     hex:
         cp = njs_number_hex_parse(&src, hex_end);
 
+        /* Skip '}' character. */
+
+        if (hex_length == 0) {
+            src++;
+        }
+
+        /* Surrogate pair. */
+
+        if (cp_pair != 0) {
+            cp = 0x10000 + ((cp_pair - 0xd800) << 10) + (cp - 0xdc00);
+            cp_pair = 0;
+
+        } else if (cp >= 0xd800 && cp <= 0xdfff) {
+            cp_pair = cp;
+            continue;
+        }
+
         dst = nxt_utf8_encode(dst, (uint32_t) cp);
         if (nxt_slow_path(dst == NULL)) {
             njs_parser_syntax_error(vm, parser,
@@ -1049,12 +1067,6 @@ njs_parser_escape_string_create(njs_vm_t
 
             return NJS_TOKEN_ILLEGAL;
         }
-
-        /* Skip '}' character */
-
-        if (hex_length == 0) {
-            src++;
-        }
     }
 
     if (length > NJS_STRING_MAP_STRIDE && length != size) {
@@ -1070,12 +1082,13 @@ njs_parser_escape_string_calc_length(njs
     size_t *out_size, size_t *out_length)
 {
     size_t        size, length, hex_length;
-    uint64_t      cp;
+    uint64_t      cp, cp_pair;
     nxt_str_t     *string;
     const u_char  *ptr, *src, *end, *hex_end;
 
     size = 0;
     length = 0;
+    cp_pair = 0;
 
     string = njs_parser_text(parser);
     src = string->start;
@@ -1171,6 +1184,25 @@ njs_parser_escape_string_calc_length(njs
             }
         }
 
+        /* Surrogate pair. */
+
+        if (cp_pair != 0) {
+            if (nxt_slow_path(cp < 0xdc00 || cp > 0xdfff)) {
+                goto invalid_pair;
+            }
+
+            cp = 0x10000 + ((cp_pair - 0xd800) << 10) + (cp - 0xdc00);
+            cp_pair = 0;
+
+        } else if (cp >= 0xd800 && cp <= 0xdfff) {
+            if (nxt_slow_path(cp > 0xdbff || src[0] != '\\' || src[1] != 'u')) {
+                goto invalid_pair;
+            }
+
+            cp_pair = cp;
+            continue;
+        }
+
         size += nxt_utf8_size(cp);
         length++;
     }
@@ -1186,4 +1218,11 @@ invalid:
                             njs_parser_text(parser));
 
     return NJS_ERROR;
+
+invalid_pair:
+
+    njs_parser_syntax_error(vm, parser, "Invalid surrogate pair \"%V\"",
+                            njs_parser_text(parser));
+
+    return NJS_ERROR;
 }
diff -r b3eb60707479 -r 724c31e77d2a njs/test/njs_unit_test.c
--- a/njs/test/njs_unit_test.c	Mon Apr 22 16:23:43 2019 +0300
+++ b/njs/test/njs_unit_test.c	Mon Apr 22 16:23:50 2019 +0300
@@ -4387,6 +4387,23 @@ static njs_unit_test_t  njs_test[] =
     { nxt_string("'привет\\n\\u{61}\\u{3B1}\\u{20AC}'.length"),
       nxt_string("10") },
 
+    { nxt_string("'\\ud83d\\udc4d'"),
+      nxt_string("\xf0\x9f\x91\x8d") },
+
+    { nxt_string("'\\ud83d\\udc4d'.length"),
+      nxt_string("1") },
+
+    { nxt_string("'\\ud83d abc \\udc4d'"),
+      nxt_string("SyntaxError: Invalid surrogate pair "
+                 "\"\\ud83d abc \\udc4d\" in 1") },
+
+    { nxt_string("'\\ud83d'"),
+      nxt_string("SyntaxError: Invalid surrogate pair \"\\ud83d\" in 1") },
+
+    { nxt_string("'\\ud83d\\uabcd'"),
+      nxt_string("SyntaxError: Invalid surrogate pair "
+                 "\"\\ud83d\\uabcd\" in 1") },
+
     { nxt_string("''.hasOwnProperty('length')"),
       nxt_string("true") },
 


More information about the nginx-devel mailing list