[njs] Fixed length calculation for UTF-8 string with escape characters.

Tue Apr 23 14:47:26 UTC 2019

details:   https://hg.nginx.org/njs/rev/b3eb60707479
branches:  
changeset: 920:b3eb60707479
user:      Alexander Borisov <alexander.borisov at nginx.com>
date:      Mon Apr 22 16:23:43 2019 +0300
description:
Fixed length calculation for UTF-8 string with escape characters.

This closes issue #133 on GitHub.

diffstat:

 njs/njs_parser_terminal.c |  329 +++++++++++++++++++++++++++++----------------
 njs/test/njs_unit_test.c  |    6 +
 2 files changed, 216 insertions(+), 119 deletions(-)

diffs (413 lines):

diff -r ad1be10fba80 -r b3eb60707479 njs/njs_parser_terminal.c

--- a/njs/njs_parser_terminal.c	Tue Apr 23 15:31:40 2019 +0300
+++ b/njs/njs_parser_terminal.c	Mon Apr 22 16:23:43 2019 +0300
@@ -28,6 +28,8 @@ static nxt_int_t njs_parser_template_exp
     njs_parser_t *parser);
 static nxt_int_t njs_parser_template_string(njs_vm_t *vm,
     njs_parser_t *parser);
+static njs_ret_t njs_parser_escape_string_calc_length(njs_vm_t *vm,
+    njs_parser_t *parser, size_t *out_size, size_t *out_length);
 static njs_token_t njs_parser_escape_string_create(njs_vm_t *vm,
     njs_parser_t *parser, njs_value_t *value);
 
@@ -923,176 +925,265 @@ njs_parser_escape_string_create(njs_vm_t
     njs_value_t *value)
 {
     u_char        c, *start, *dst;
-    size_t        size,length, hex_length;
-    uint64_t      u;
+    size_t        size, length, hex_length;
+    uint64_t      cp;
+    njs_ret_t     ret;
     nxt_str_t     *string;
-    const u_char  *p, *src, *end, *hex_end;
+    const u_char  *src, *end, *hex_end;
 
-    start = NULL;
-    dst = NULL;
+    ret = njs_parser_escape_string_calc_length(vm, parser, &size, &length);
+    if (nxt_slow_path(ret != NXT_OK)) {
+        return NJS_TOKEN_ILLEGAL;
+    }
+
+    start = njs_string_alloc(vm, value, size, length);
+    if (nxt_slow_path(start == NULL)) {
+        return NJS_TOKEN_ERROR;
+    }
 
-    for ( ;; ) {
-        /*
-         * The loop runs twice: at the first step string size and
-         * UTF-8 length are evaluated.  Then the string is allocated
-         * and at the second step string content is copied.
-         */
-        size = 0;
-        length = 0;
+    dst = start;
+
+    string = njs_parser_text(parser);
+    src = string->start;
+    end = src + string->length;
 
-        string = njs_parser_text(parser);
-        src = string->start;
-        end = src + string->length;
+    while (src < end) {
+        c = *src++;
 
-        while (src < end) {
+        if (c == '\\') {
+            /*
+             * Testing "src == end" is not required here
+             * since this has been already tested by lexer.
+             */
+
             c = *src++;
 
-            if (c == '\\') {
+            switch (c) {
+            case 'u':
                 /*
-                 * Testing "src == end" is not required here
-                 * since this has been already tested by lexer.
+                 * A character after "u" can be safely tested here
+                 * because there is always a closing quote at the
+                 * end of string: ...\u".
                  */
-                c = *src++;
+
+                if (*src != '{') {
+                    hex_length = 4;
+                    goto hex_length;
+                }
+
+                src++;
+                hex_length = 0;
+                hex_end = end;
 
-                switch (c) {
+                goto hex;
+
+            case 'x':
+                hex_length = 2;
+                goto hex_length;
+
+            case '0':
+                c = '\0';
+                break;
 
-                case 'u':
-                    hex_length = 4;
-                    /*
-                     * A character after "u" can be safely tested here
-                     * because there is always a closing quote at the
-                     * end of string: ...\u".
-                     */
-                    if (*src != '{') {
-                        goto hex_length_test;
-                    }
+            case 'b':
+                c = '\b';
+                break;
+
+            case 'f':
+                c = '\f';
+                break;
+
+            case 'n':
+                c = '\n';
+                break;
+
+            case 'r':
+                c = '\r';
+                break;
 
+            case 't':
+                c = '\t';
+                break;
+
+            case 'v':
+                c = '\v';
+                break;
+
+            case '\r':
+                /*
+                 * A character after "\r" can be safely tested here
+                 * because there is always a closing quote at the
+                 * end of string: ...\\r".
+                 */
+
+                if (*src == '\n') {
                     src++;
-                    hex_length = 0;
-                    hex_end = end;
+                }
 
-                    goto hex;
+                continue;
 
-                case 'x':
-                    hex_length = 2;
-                    goto hex_length_test;
+            case '\n':
+                continue;
 
-                case '0':
-                    c = '\0';
-                    break;
+            default:
+                break;
+            }
+        }
 
-                case 'b':
-                    c = '\b';
-                    break;
+        *dst++ = c;
 
-                case 'f':
-                    c = '\f';
-                    break;
+        continue;
+
+    hex_length:
 
-                case 'n':
-                    c = '\n';
-                    break;
+        hex_end = src + hex_length;
+
+    hex:
+        cp = njs_number_hex_parse(&src, hex_end);
 
-                case 'r':
-                    c = '\r';
-                    break;
+        dst = nxt_utf8_encode(dst, (uint32_t) cp);
+        if (nxt_slow_path(dst == NULL)) {
+            njs_parser_syntax_error(vm, parser,
+                                    "Invalid Unicode code point \"%V\"",
+                                    njs_parser_text(parser));
+
+            return NJS_TOKEN_ILLEGAL;
+        }
 
-                case 't':
-                    c = '\t';
-                    break;
+        /* Skip '}' character */
+
+        if (hex_length == 0) {
+            src++;
+        }
+    }
+
+    if (length > NJS_STRING_MAP_STRIDE && length != size) {
+        njs_string_offset_map_init(start, size);
+    }
 
-                case 'v':
-                    c = '\v';
-                    break;
+    return NJS_TOKEN_STRING;
+}
+
 
-                case '\r':
-                    /*
-                     * A character after "\r" can be safely tested here
-                     * because there is always a closing quote at the
-                     * end of string: ...\\r".
-                     */
-                    if (*src == '\n') {
-                        src++;
-                    }
+static njs_ret_t
+njs_parser_escape_string_calc_length(njs_vm_t *vm, njs_parser_t *parser,
+    size_t *out_size, size_t *out_length)
+{
+    size_t        size, length, hex_length;
+    uint64_t      cp;
+    nxt_str_t     *string;
+    const u_char  *ptr, *src, *end, *hex_end;
+
+    size = 0;
+    length = 0;
+
+    string = njs_parser_text(parser);
+    src = string->start;
+    end = src + string->length;
+
+    while (src < end) {
 
-                    continue;
+        if (*src == '\\') {
+            src++;
 
-                case '\n':
-                    continue;
+            switch (*src) {
+            case 'u':
+                src++;
 
-                default:
-                    break;
+                if (*src != '{') {
+                    hex_length = 4;
+                    goto hex_length;
                 }
-            }
+
+                src++;
+                hex_length = 0;
+                hex_end = end;
+
+                goto hex;
+
+            case 'x':
+                src++;
+                hex_length = 2;
+                goto hex_length;
 
-            size++;
-            length++;
+            case '\r':
+                src++;
 
-            if (dst != NULL) {
-                *dst++ = c;
-            }
+                if (*src == '\n') {
+                    src++;
+                }
+
+                continue;
 
-            continue;
-
-        hex_length_test:
+            case '\n':
+                src++;
+                continue;
 
-            hex_end = src + hex_length;
+            default:
+                break;
+            }
+        }
 
-            if (hex_end > end) {
+        if (*src >= 0x80) {
+            ptr = src;
+
+            if (nxt_slow_path(nxt_utf8_decode(&src, end) == 0xffffffff)) {
                 goto invalid;
             }
 
-        hex:
+            size += src - ptr;
+            length++;
+
+            continue;
+        }
 
-            p = src;
-            u = njs_number_hex_parse(&src, hex_end);
+        src++;
+        size++;
+        length++;
+
+        continue;
+
+    hex_length:
 
-            if (hex_length != 0) {
-                if (src != hex_end) {
-                    goto invalid;
-                }
+        hex_end = src + hex_length;
+
+        if (nxt_slow_path(hex_end > end)) {
+            goto invalid;
+        }
 
-            } else {
-                if (src == p || (src - p) > 6) {
-                    goto invalid;
-                }
+    hex:
 
-                if (src == end || *src++ != '}') {
-                    goto invalid;
-                }
+        ptr = src;
+        cp = njs_number_hex_parse(&src, hex_end);
+
+        if (hex_length != 0) {
+            if (src != hex_end) {
+                goto invalid;
             }
 
-            size += nxt_utf8_size(u);
-            length++;
+        } else {
+            if (src == ptr || (src - ptr) > 6) {
+                goto invalid;
+            }
 
-            if (dst != NULL) {
-                dst = nxt_utf8_encode(dst, (uint32_t) u);
-                if (dst == NULL) {
-                    goto invalid;
-                }
+            if (src == end || *src++ != '}') {
+                goto invalid;
             }
         }
 
-        if (start != NULL) {
-            if (length > NJS_STRING_MAP_STRIDE && length != size) {
-                njs_string_offset_map_init(start, size);
-            }
-
-            return NJS_TOKEN_STRING;
-        }
+        size += nxt_utf8_size(cp);
+        length++;
+    }
 
-        start = njs_string_alloc(vm, value, size, length);
-        if (nxt_slow_path(start == NULL)) {
-            return NJS_TOKEN_ERROR;
-        }
+    *out_size = size;
+    *out_length = length;
 
-        dst = start;
-    }
+    return NXT_OK;
 
 invalid:
 
     njs_parser_syntax_error(vm, parser, "Invalid Unicode code point \"%V\"",
                             njs_parser_text(parser));
 
-    return NJS_TOKEN_ILLEGAL;
+    return NJS_ERROR;
 }
diff -r ad1be10fba80 -r b3eb60707479 njs/test/njs_unit_test.c
--- a/njs/test/njs_unit_test.c	Tue Apr 23 15:31:40 2019 +0300
+++ b/njs/test/njs_unit_test.c	Mon Apr 22 16:23:43 2019 +0300
@@ -4381,6 +4381,12 @@ static njs_unit_test_t  njs_test[] =
     { nxt_string("'abc'.length"),
       nxt_string("3") },
 
+    { nxt_string("'привет\\n'.length"),
+      nxt_string("7") },
+
+    { nxt_string("'привет\\n\\u{61}\\u{3B1}\\u{20AC}'.length"),
+      nxt_string("10") },
+
     { nxt_string("''.hasOwnProperty('length')"),
       nxt_string("true") },