[njs] Introduced UTF-8 decoder according to WHATWG encoding spec.

Alexander Borisov alexander.borisov at nginx.com
Wed Jul 15 16:20:44 UTC 2020


details:   https://hg.nginx.org/njs/rev/855edd76bdb6
branches:  
changeset: 1472:855edd76bdb6
user:      Alexander Borisov <alexander.borisov at nginx.com>
date:      Wed Jul 15 19:19:19 2020 +0300
description:
Introduced UTF-8 decoder according to WHATWG encoding spec.

diffstat:

 src/njs_json.c               |    8 +-
 src/njs_parser.c             |   73 ++++++---
 src/njs_string.c             |  293 +++++++++++++++++++++----------------
 src/njs_unicode.h            |    4 +
 src/njs_utf8.c               |  333 ++++++++++++++++++++----------------------
 src/njs_utf8.h               |   40 ++--
 src/test/njs_unit_test.c     |   44 ++++-
 src/test/unicode_unit_test.c |   53 ++++--
 8 files changed, 466 insertions(+), 382 deletions(-)

diffs (truncated from 1394 to 1000 lines):

diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_json.c
--- a/src/njs_json.c	Wed Jul 15 19:19:18 2020 +0300
+++ b/src/njs_json.c	Wed Jul 15 19:19:19 2020 +0300
@@ -728,7 +728,7 @@ njs_json_parse_string(njs_json_parse_ctx
             if (njs_surrogate_any(utf)) {
 
                 if (utf > 0xdbff || p[0] != '\\' || p[1] != 'u') {
-                    s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT);
+                    s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT);
                     continue;
                 }
 
@@ -741,12 +741,12 @@ njs_json_parse_string(njs_json_parse_ctx
                     utf = njs_string_surrogate_pair(utf, utf_low);
 
                 } else if (njs_surrogate_leading(utf_low)) {
-                    utf = NJS_UTF8_REPLACEMENT;
-                    s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT);
+                    utf = NJS_UNICODE_REPLACEMENT;
+                    s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT);
 
                 } else {
                     utf = utf_low;
-                    s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT);
+                    s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT);
                 }
             }
 
diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_parser.c
--- a/src/njs_parser.c	Wed Jul 15 19:19:18 2020 +0300
+++ b/src/njs_parser.c	Wed Jul 15 19:19:19 2020 +0300
@@ -7896,11 +7896,12 @@ njs_int_t
 njs_parser_string_create(njs_vm_t *vm, njs_lexer_token_t *token,
     njs_value_t *value)
 {
-    u_char        *dst;
-    ssize_t       size, length;
-    uint32_t      cp;
-    njs_str_t     *src;
-    const u_char  *p, *end;
+    u_char                *dst;
+    ssize_t               size, length;
+    uint32_t              cp;
+    njs_str_t             *src;
+    const u_char          *p, *end;
+    njs_unicode_decode_t  ctx;
 
     src = &token->text;
 
@@ -7914,10 +7915,17 @@ njs_parser_string_create(njs_vm_t *vm, n
     p = src->start;
     end = src->start + src->length;
 
+    njs_utf8_decode_init(&ctx);
+
     while (p < end) {
-        cp = njs_utf8_safe_decode(&p, end);
-
-        dst = njs_utf8_encode(dst, cp);
+        cp = njs_utf8_decode(&ctx, &p, end);
+
+        if (cp <= NJS_UNICODE_MAX_CODEPOINT) {
+            dst = njs_utf8_encode(dst, cp);
+
+        } else {
+            dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
+        }
     }
 
     if (length > NJS_STRING_MAP_STRIDE && size != length) {
@@ -7932,12 +7940,13 @@ static njs_token_type_t
 njs_parser_escape_string_create(njs_parser_t *parser, njs_lexer_token_t *token,
     njs_value_t *value)
 {
-    u_char        c, *start, *dst;
-    size_t        size, length, hex_length;
-    uint64_t      cp, cp_pair;
-    njs_int_t     ret;
-    njs_str_t     *string;
-    const u_char  *src, *end, *hex_end;
+    u_char                c, *start, *dst;
+    size_t                size, length, hex_length;
+    uint64_t              cp, cp_pair;
+    njs_int_t             ret;
+    njs_str_t             *string;
+    const u_char          *src, *end, *hex_end;
+    njs_unicode_decode_t  ctx;
 
     ret = njs_parser_escape_string_calc_length(parser, token, &size, &length);
     if (njs_slow_path(ret != NJS_OK)) {
@@ -8053,7 +8062,13 @@ njs_parser_escape_string_create(njs_pars
 
         src--;
 
-        cp = njs_utf8_safe_decode2(&src, end);
+        njs_utf8_decode_init(&ctx);
+
+        cp = njs_utf8_decode(&ctx, &src, end);
+        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+            cp = NJS_UNICODE_REPLACEMENT;
+        }
+
         dst = njs_utf8_encode(dst, cp);
 
         continue;
@@ -8076,12 +8091,12 @@ njs_parser_escape_string_create(njs_pars
                 cp = njs_string_surrogate_pair(cp_pair, cp);
 
             } else if (njs_slow_path(njs_surrogate_leading(cp))) {
-                cp = NJS_UTF8_REPLACEMENT;
+                cp = NJS_UNICODE_REPLACEMENT;
 
                 dst = njs_utf8_encode(dst, (uint32_t) cp);
 
             } else {
-                dst = njs_utf8_encode(dst, NJS_UTF8_REPLACEMENT);
+                dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
             }
 
             cp_pair = 0;
@@ -8092,7 +8107,7 @@ njs_parser_escape_string_create(njs_pars
                 continue;
             }
 
-            cp = NJS_UTF8_REPLACEMENT;
+            cp = NJS_UNICODE_REPLACEMENT;
         }
 
         dst = njs_utf8_encode(dst, (uint32_t) cp);
@@ -8116,10 +8131,11 @@ static njs_int_t
 njs_parser_escape_string_calc_length(njs_parser_t *parser,
     njs_lexer_token_t *token, size_t *out_size, size_t *out_length)
 {
-    size_t        size, length, hex_length;
-    uint64_t      cp, cp_pair;
-    njs_str_t     *string;
-    const u_char  *ptr, *src, *end, *hex_end;
+    size_t                size, length, hex_length;
+    uint64_t              cp, cp_pair;
+    njs_str_t             *string;
+    const u_char          *ptr, *src, *end, *hex_end;
+    njs_unicode_decode_t  ctx;
 
     size = 0;
     length = 0;
@@ -8173,7 +8189,12 @@ njs_parser_escape_string_calc_length(njs
         }
 
         if (*src >= 0x80) {
-            cp = njs_utf8_safe_decode2(&src, end);
+            njs_utf8_decode_init(&ctx);
+
+            cp = njs_utf8_decode(&ctx, &src, end);
+            if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+                cp = NJS_UNICODE_REPLACEMENT;
+            }
 
             size += njs_utf8_size(cp);
             length++;
@@ -8220,13 +8241,13 @@ njs_parser_escape_string_calc_length(njs
                 cp = njs_string_surrogate_pair(cp_pair, cp);
 
             } else if (njs_slow_path(njs_surrogate_leading(cp))) {
-                cp = NJS_UTF8_REPLACEMENT;
+                cp = NJS_UNICODE_REPLACEMENT;
 
                 size += njs_utf8_size(cp);
                 length++;
 
             } else {
-                size += njs_utf8_size(NJS_UTF8_REPLACEMENT);
+                size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
                 length++;
             }
 
@@ -8238,7 +8259,7 @@ njs_parser_escape_string_calc_length(njs
                 continue;
             }
 
-            cp = NJS_UTF8_REPLACEMENT;
+            cp = NJS_UNICODE_REPLACEMENT;
         }
 
         size += njs_utf8_size(cp);
diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_string.c
--- a/src/njs_string.c	Wed Jul 15 19:19:18 2020 +0300
+++ b/src/njs_string.c	Wed Jul 15 19:19:19 2020 +0300
@@ -20,10 +20,8 @@ static njs_int_t njs_string_slice_prop(n
     njs_slice_prop_t *slice, njs_value_t *args, njs_uint_t nargs);
 static njs_int_t njs_string_slice_args(njs_vm_t *vm, njs_slice_prop_t *slice,
     njs_value_t *args, njs_uint_t nargs);
-static njs_int_t njs_string_from_char_code(njs_vm_t *vm,
-    njs_value_t *args, njs_uint_t nargs, njs_index_t unused);
-static njs_int_t njs_string_from_code_point(njs_vm_t *vm, njs_value_t *args,
-    njs_uint_t nargs, njs_index_t unused);
+static njs_int_t njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args,
+    njs_uint_t nargs, njs_index_t is_point);
 static njs_int_t njs_string_bytes_from(njs_vm_t *vm, njs_value_t *args,
     njs_uint_t nargs, njs_index_t unused);
 static njs_int_t njs_string_bytes_from_array_like(njs_vm_t *vm,
@@ -545,7 +543,7 @@ static const njs_object_prop_t  njs_stri
     {
         .type = NJS_PROPERTY,
         .name = njs_string("fromCharCode"),
-        .value = njs_native_function(njs_string_from_char_code, 1),
+        .value = njs_native_function2(njs_string_from_char_code, 1, 0),
         .writable = 1,
         .configurable = 1,
     },
@@ -553,7 +551,7 @@ static const njs_object_prop_t  njs_stri
     {
         .type = NJS_PROPERTY,
         .name = njs_string("fromCodePoint"),
-        .value = njs_native_function(njs_string_from_code_point, 1),
+        .value = njs_native_function2(njs_string_from_char_code, 1, 1),
         .writable = 1,
         .configurable = 1,
     },
@@ -1029,13 +1027,14 @@ static njs_int_t
 njs_string_prototype_to_bytes(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
     njs_index_t unused)
 {
-    u_char             *p;
-    size_t             length;
-    uint32_t           byte;
-    njs_int_t          ret;
-    const u_char       *s, *end;
-    njs_slice_prop_t   slice;
-    njs_string_prop_t  string;
+    u_char                *p;
+    size_t                length;
+    uint32_t              byte;
+    njs_int_t             ret;
+    const u_char          *s, *end;
+    njs_slice_prop_t      slice;
+    njs_string_prop_t     string;
+    njs_unicode_decode_t  ctx;
 
     ret = njs_string_object_validate(vm, njs_arg(args, nargs, 0));
     if (njs_slow_path(ret != NJS_OK)) {
@@ -1064,8 +1063,10 @@ njs_string_prototype_to_bytes(njs_vm_t *
 
             length = slice.length;
 
+            njs_utf8_decode_init(&ctx);
+
             while (length != 0 && s < end) {
-                byte = njs_utf8_decode(&s, end);
+                byte = njs_utf8_decode(&ctx, &s, end);
 
                 if (njs_slow_path(byte > 0xFF)) {
                     njs_release(vm, &vm->retval);
@@ -1463,13 +1464,14 @@ static njs_int_t
 njs_string_prototype_char_code_at(njs_vm_t *vm, njs_value_t *args,
     njs_uint_t nargs, njs_index_t unused)
 {
-    double             num;
-    size_t             length;
-    int64_t            index;
-    uint32_t           code;
-    njs_int_t          ret;
-    const u_char       *start, *end;
-    njs_string_prop_t  string;
+    double                num;
+    size_t                length;
+    int64_t               index;
+    uint32_t              code;
+    njs_int_t             ret;
+    const u_char          *start, *end;
+    njs_string_prop_t     string;
+    njs_unicode_decode_t  ctx;
 
     ret = njs_string_object_validate(vm, njs_arg(args, nargs, 0));
     if (njs_slow_path(ret != NJS_OK)) {
@@ -1493,10 +1495,12 @@ njs_string_prototype_char_code_at(njs_vm
         code = string.start[index];
 
     } else {
+        njs_utf8_decode_init(&ctx);
+
         /* UTF-8 string. */
         end = string.start + string.size;
         start = njs_string_offset(string.start, end, index);
-        code = njs_utf8_decode(&start, end);
+        code = njs_utf8_decode(&ctx, &start, end);
     }
 
     num = code;
@@ -1829,14 +1833,27 @@ njs_decode_base64_core(njs_vm_t *vm, njs
 
 
 static njs_int_t
-njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args,
-    njs_uint_t nargs, njs_index_t unused)
+njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
+    njs_index_t is_point)
 {
-    u_char      *p;
-    size_t      size;
-    uint16_t    code;
-    njs_int_t   ret;
-    njs_uint_t  i;
+    double                num;
+    u_char                *p, *start, *end;
+    ssize_t               len;
+    int32_t               code;
+    uint32_t              cp;
+    uint64_t              length, size;
+    njs_int_t             ret;
+    njs_uint_t            i;
+    njs_unicode_decode_t  ctx;
+    u_char                buf[4];
+
+    size = 0;
+    length = 0;
+
+    cp = 0x00;
+    end = buf + sizeof(buf);
+
+    njs_utf16_decode_init(&ctx);
 
     for (i = 1; i < nargs; i++) {
         if (!njs_is_numeric(&args[i])) {
@@ -1845,73 +1862,76 @@ njs_string_from_char_code(njs_vm_t *vm, 
                 return ret;
             }
         }
+
+        if (is_point) {
+            num = njs_number(&args[i]);
+            if (isnan(num)) {
+                goto range_error;
+            }
+
+            code = num;
+
+            if (code != num || code < 0 || code > 0x10FFFF) {
+                goto range_error;
+            }
+
+        } else {
+            code = njs_number_to_uint16(njs_number(&args[i]));
+        }
+
+        start = buf;
+        len = njs_utf16_encode(code, &start, end);
+
+        start = buf;
+        cp = njs_utf16_decode(&ctx, (const u_char **) &start, start + len);
+
+        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+            if (cp == NJS_UNICODE_CONTINUE) {
+                continue;
+            }
+
+            cp = NJS_UNICODE_REPLACEMENT;
+        }
+
+        size += njs_utf8_size(cp);
+        length++;
     }
 
-    size = 0;
-
-    for (i = 1; i < nargs; i++) {
-        code = njs_number_to_uint16(njs_number(&args[i]));
-        size += njs_utf8_size_uint16(code);
+    if (cp == NJS_UNICODE_CONTINUE) {
+        size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
+        length++;
     }
 
-    p = njs_string_alloc(vm, &vm->retval, size, nargs - 1);
+    p = njs_string_alloc(vm, &vm->retval, size, length);
     if (njs_slow_path(p == NULL)) {
         return NJS_ERROR;
     }
 
-    for (i = 1; i < nargs; i++) {
-        code = njs_number_to_uint16(njs_number(&args[i]));
-        p = njs_utf8_encode(p, code);
-    }
-
-    return NJS_OK;
-}
-
-
-static njs_int_t
-njs_string_from_code_point(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
-    njs_index_t unused)
-{
-    u_char      *p;
-    double      num;
-    size_t      size;
-    int32_t     code;
-    njs_int_t   ret;
-    njs_uint_t  i;
+    njs_utf16_decode_init(&ctx);
 
     for (i = 1; i < nargs; i++) {
-        if (!njs_is_numeric(&args[i])) {
-            ret = njs_value_to_numeric(vm, &args[i], &args[i]);
-            if (ret != NJS_OK) {
-                return ret;
-            }
-        }
-    }
-
-    size = 0;
-
-    for (i = 1; i < nargs; i++) {
-        num = njs_number(&args[i]);
-        if (isnan(num)) {
-            goto range_error;
+        if (is_point) {
+            code = njs_number(&args[i]);
+
+        } else {
+            code = njs_number_to_uint16(njs_number(&args[i]));
         }
 
-        code = num;
-
-        if (code != num || code < 0 || code >= 0x110000) {
-            goto range_error;
+        start = buf;
+        len = njs_utf16_encode(code, &start, end);
+
+        start = buf;
+        cp = njs_utf16_decode(&ctx, (const u_char **) &start, start + len);
+
+        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+            if (cp == NJS_UNICODE_CONTINUE && i + 1 != nargs) {
+                continue;
+            }
+
+            cp = NJS_UNICODE_REPLACEMENT;
         }
 
-        size += njs_utf8_size(code);
-    }
-
-    p = njs_string_alloc(vm, &vm->retval, size, nargs - 1);
-    if (njs_slow_path(p == NULL)) {
-        return NJS_ERROR;
-    }
-
-    for (i = 1; i < nargs; i++) {
-        p = njs_utf8_encode(p, njs_number(&args[i]));
+        p = njs_utf8_encode(p, cp);
     }
 
     return NJS_OK;
@@ -2591,11 +2611,12 @@ static njs_int_t
 njs_string_prototype_trim(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
     njs_index_t mode)
 {
-    uint32_t           u, trim, length;
-    njs_int_t          ret;
-    njs_value_t        *value;
-    const u_char       *p, *prev, *start, *end;
-    njs_string_prop_t  string;
+    uint32_t              u, trim, length;
+    njs_int_t             ret;
+    njs_value_t           *value;
+    const u_char          *p, *prev, *start, *end;
+    njs_string_prop_t     string;
+    njs_unicode_decode_t  ctx;
 
     value = njs_argument(args, 0);
     ret = njs_string_object_validate(vm, value);
@@ -2651,13 +2672,15 @@ njs_string_prototype_trim(njs_vm_t *vm, 
         /* UTF-8 string. */
 
         if (mode & NJS_TRIM_START) {
+            njs_utf8_decode_init(&ctx);
+
             for ( ;; ) {
                 if (start == end) {
                     goto empty;
                 }
 
                 p = start;
-                u = njs_utf8_decode(&start, end);
+                u = njs_utf8_decode(&ctx, &start, end);
 
                 if (njs_utf8_is_whitespace(u)) {
                     trim++;
@@ -2672,6 +2695,8 @@ njs_string_prototype_trim(njs_vm_t *vm, 
         if (mode & NJS_TRIM_END) {
             prev = end;
 
+            njs_utf8_decode_init(&ctx);
+
             for ( ;; ) {
                 if (start == prev) {
                     goto empty;
@@ -2679,7 +2704,7 @@ njs_string_prototype_trim(njs_vm_t *vm, 
 
                 prev = njs_utf8_prev(prev);
                 p = prev;
-                u = njs_utf8_decode(&p, end);
+                u = njs_utf8_decode(&ctx, &p, end);
 
                 if (njs_utf8_is_whitespace(u)) {
                     trim++;
@@ -3640,11 +3665,12 @@ njs_string_prototype_replace(njs_vm_t *v
 double
 njs_string_to_number(const njs_value_t *value, njs_bool_t parse_float)
 {
-    double        num;
-    size_t        size;
-    uint32_t      u;
-    njs_bool_t    minus;
-    const u_char  *p, *start, *end;
+    double                num;
+    size_t                size;
+    uint32_t              u;
+    njs_bool_t            minus;
+    const u_char          *p, *start, *end;
+    njs_unicode_decode_t  ctx;
 
     const size_t  infinity = njs_length("Infinity");
 
@@ -3660,9 +3686,11 @@ njs_string_to_number(const njs_value_t *
 
     end = p + size;
 
+    njs_utf8_decode_init(&ctx);
+
     while (p < end) {
         start = p;
-        u = njs_utf8_decode(&p, end);
+        u = njs_utf8_decode(&ctx, &p, end);
 
         if (!njs_utf8_is_whitespace(u)) {
             p = start;
@@ -4179,15 +4207,16 @@ njs_int_t
 njs_string_encode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
     njs_index_t component)
 {
-    u_char             byte, *dst;
-    uint64_t           size;
-    uint32_t           cp, cp_low;
-    njs_int_t          ret;
-    njs_value_t        *value;
-    const u_char       *src, *end;
-    const uint32_t     *escape;
-    njs_string_prop_t  string;
-    u_char             encode[4];
+    u_char                byte, *dst;
+    uint64_t              size;
+    uint32_t              cp, cp_low;
+    njs_int_t             ret;
+    njs_value_t           *value;
+    const u_char          *src, *end;
+    const uint32_t        *escape;
+    njs_string_prop_t     string;
+    njs_unicode_decode_t  ctx;
+    u_char                encode[4];
 
     static const uint32_t  escape_uri[] = {
         0xffffffff,  /* 1111 1111 1111 1111  1111 1111 1111 1111 */
@@ -4257,8 +4286,10 @@ njs_string_encode_uri(njs_vm_t *vm, njs_
     } else {
         /* UTF-8 string. */
 
+        njs_utf8_decode_init(&ctx);
+
         while (src < end) {
-            cp = njs_utf8_decode(&src, end);
+            cp = njs_utf8_decode(&ctx, &src, end);
 
             if (cp < 0x80 && !njs_need_escape(escape, cp)) {
                 size++;
@@ -4271,7 +4302,7 @@ njs_string_encode_uri(njs_vm_t *vm, njs_
                 }
 
                 if (njs_surrogate_leading(cp)) {
-                    cp_low = njs_utf8_decode(&src, end);
+                    cp_low = njs_utf8_decode(&ctx, &src, end);
 
                     if (njs_slow_path(!njs_surrogate_trailing(cp_low))) {
                         goto uri_error;
@@ -4310,11 +4341,13 @@ njs_string_encode_uri(njs_vm_t *vm, njs_
 
     /* UTF-8 string. */
 
+    njs_utf8_decode_init(&ctx);
+
     while (src < end) {
-        cp = njs_utf8_decode(&src, end);
+        cp = njs_utf8_decode(&ctx, &src, end);
 
         if (njs_slow_path(njs_surrogate_leading(cp))) {
-            cp_low = njs_utf8_decode(&src, end);
+            cp_low = njs_utf8_decode(&ctx, &src, end);
             cp = njs_string_surrogate_pair(cp, cp_low);
         }
 
@@ -4337,11 +4370,14 @@ njs_inline uint32_t
 njs_string_decode_uri_cp(const int8_t *hex, const u_char **start,
     const u_char *end, njs_bool_t expect_percent)
 {
-    int8_t        d0, d1;
-    uint32_t      cp;
-    const u_char  *p;
-
-    cp = njs_utf8_decode(start, end);
+    int8_t                d0, d1;
+    uint32_t              cp;
+    const u_char          *p;
+    njs_unicode_decode_t  ctx;
+
+    njs_utf8_decode_init(&ctx);
+
+    cp = njs_utf8_decode(&ctx, start, end);
     if (njs_fast_path(cp != '%')) {
         return expect_percent ? 0xFFFFFFFF: cp;
     }
@@ -4378,18 +4414,19 @@ njs_int_t
 njs_string_decode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
     njs_index_t component)
 {
-    u_char             *dst;
-    int64_t            size, length;
-    uint32_t           cp;
-    njs_int_t          ret;
-    njs_chb_t          chain;
-    njs_uint_t         i, n;
-    njs_bool_t         percent;
-    njs_value_t        *value;
-    const u_char       *src, *p, *end;
-    const uint32_t     *reserve;
-    njs_string_prop_t  string;
-    u_char             encode[4];
+    u_char                *dst;
+    int64_t               size, length;
+    uint32_t              cp;
+    njs_int_t             ret;
+    njs_chb_t             chain;
+    njs_uint_t            i, n;
+    njs_bool_t            percent;
+    njs_value_t           *value;
+    const u_char          *src, *p, *end;
+    const uint32_t        *reserve;
+    njs_string_prop_t     string;
+    njs_unicode_decode_t  ctx;
+    u_char                encode[4];
 
     static const uint32_t  reserve_uri[] = {
         0x00000000,  /* 0000 0000 0000 0000  0000 0000 0000 0000 */
@@ -4472,6 +4509,8 @@ njs_string_decode_uri(njs_vm_t *vm, njs_
 
     njs_chb_init(&chain, vm->mem_pool);
 
+    njs_utf8_decode_init(&ctx);
+
     while (src < end) {
         percent = (src[0] == '%');
         cp = njs_string_decode_uri_cp(hex, &src, end, 0);
@@ -4529,8 +4568,8 @@ njs_string_decode_uri(njs_vm_t *vm, njs_
         }
 
         p = encode;
-        cp = njs_utf8_decode(&p, p + n);
-        if (njs_slow_path(cp == 0xFFFFFFFF)) {
+        cp = njs_utf8_decode(&ctx, &p, p + n);
+        if (njs_slow_path(cp > NJS_UNICODE_MAX_CODEPOINT)) {
             goto uri_error;
         }
 
diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_unicode.h
--- a/src/njs_unicode.h	Wed Jul 15 19:19:18 2020 +0300
+++ b/src/njs_unicode.h	Wed Jul 15 19:19:19 2020 +0300
@@ -9,6 +9,7 @@
 
 
 enum {
+    NJS_UNICODE_REPLACEMENT   = 0xFFFD,
     NJS_UNICODE_MAX_CODEPOINT = 0x10FFFF,
     NJS_UNICODE_ERROR         = 0x1FFFFF,
     NJS_UNICODE_CONTINUE      = 0x2FFFFF
@@ -16,6 +17,9 @@ enum {
 
 typedef struct {
     uint32_t  codepoint;
+
+    unsigned  need;
+    u_char    lower;
     u_char    upper;
 } njs_unicode_decode_t;
 
diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_utf8.c
--- a/src/njs_utf8.c	Wed Jul 15 19:19:18 2020 +0300
+++ b/src/njs_utf8.c	Wed Jul 15 19:19:19 2020 +0300
@@ -56,211 +56,166 @@ njs_utf8_encode(u_char *p, uint32_t u)
 }
 
 
-/*
- * njs_utf8_decode() decodes UTF-8 sequences and returns a valid
- * character 0x00 - 0x10FFFF, or 0xFFFFFFFF for invalid or overlong
- * UTF-8 sequence.
- */
+njs_inline njs_int_t
+njs_utf8_boundary(njs_unicode_decode_t *ctx, const u_char **data,
+    unsigned *need, u_char lower, u_char upper)
+{
+    u_char  ch;
 
-uint32_t
-njs_utf8_decode(const u_char **start, const u_char *end)
-{
-    uint32_t  u;
+    ch = **data;
 
-    u = (uint32_t) **start;
-
-    if (u < 0x80) {
-        (*start)++;
-        return u;
+    if (ch < lower || ch > upper) {
+        return NJS_ERROR;
     }
 
-    return njs_utf8_decode2(start, end);
+    (*data)++;
+    (*need)--;
+    ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);
+
+    return NJS_OK;
 }
 
 
-/*
- * njs_utf8_decode2() decodes two and more bytes UTF-8 sequences only
- * and returns a valid character 0x80 - 0x10FFFF, OR 0xFFFFFFFF for
- * invalid or overlong UTF-8 sequence.
- */
-
-uint32_t
-njs_utf8_decode2(const u_char **start, const u_char *end)
+njs_inline void
+njs_utf8_boundary_set(njs_unicode_decode_t *ctx, const u_char ch,
+    u_char first, u_char second, u_char lower, u_char upper)
 {
-    u_char        c;
-    size_t        n;
-    uint32_t      u, overlong;
-    const u_char  *p;
-
-    p = *start;
-    u = (uint32_t) *p;
-
-    if (u >= 0xE0) {
-
-        if (u >= 0xF0) {
-
-            if (njs_slow_path(u > 0xF4)) {
-                /*
-                 * The maximum valid Unicode character is 0x10FFFF
-                 * which is encoded as 0xF4 0x8F 0xBF 0xBF.
-                 */
-                return 0xFFFFFFFF;
-            }
-
-            u &= 0x07;
-            overlong = 0x00FFFF;
-            n = 3;
-
-        } else {
-            u &= 0x0F;
-            overlong = 0x07FF;
-            n = 2;
-        }
+    if (ch == first) {
+        ctx->lower = lower;
+        ctx->upper = 0xBF;
 
-    } else if (u >= 0xC2) {
-
-        /* 0x80 is encoded as 0xC2 0x80. */
-
-        u &= 0x1F;
-        overlong = 0x007F;
-        n = 1;
-
-    } else {
-        /* u <= 0xC2 */
-        return 0xFFFFFFFF;
+    } else if (ch == second) {
+        ctx->lower = 0x80;
+        ctx->upper = upper;
     }
-
-    p++;
-
-    if (njs_fast_path(p + n <= end)) {
-
-        do {
-            c = *p++;
-            /*
-             * The byte must in the 0x80 - 0xBF range.
-             * Values below 0x80 become >= 0x80.
-             */
-            c = c - 0x80;
-
-            if (njs_slow_path(c > 0x3F)) {
-                return 0xFFFFFFFF;
-            }
-
-            u = (u << 6) | c;
-            n--;
-
-        } while (n != 0);
-
-        if (overlong < u && u < 0x110000) {
-            *start = p;
-            return u;
-        }
-    }
-
-    return 0xFFFFFFFF;
 }
 
 
 uint32_t
-njs_utf8_safe_decode(const u_char **start, const u_char *end)
-{
-    uint32_t  u;
-
-    u = (uint32_t) **start;
-
-    if (u < 0x80) {
-        (*start)++;
-        return u;
-    }
-
-    return njs_utf8_safe_decode2(start, end);
-}
-
-
-uint32_t
-njs_utf8_safe_decode2(const u_char **start, const u_char *end)
+njs_utf8_decode(njs_unicode_decode_t *ctx, const u_char **start,
+    const u_char *end)
 {
     u_char        c;
-    size_t        n;
-    uint32_t      u, overlong;
+    unsigned      need;
+    njs_int_t     ret;
     const u_char  *p;
 
-    p = *start;
-    u = (uint32_t) *p;
-
-    if (u >= 0xE0) {
-
-        if (u >= 0xF0) {
+    if (ctx->need != 0) {
+        need = ctx->need;
+        ctx->need = 0;
 
-            if (njs_slow_path(u > 0xF4)) {
-                /*
-                 * The maximum valid Unicode character is 0x10FFFF
-                 * which is encoded as 0xF4 0x8F 0xBF 0xBF.
-                 */
-                goto fail_one;
+        if (ctx->lower != 0x00) {
+            ret = njs_utf8_boundary(ctx, start, &need, ctx->lower, ctx->upper);
+            if (njs_slow_path(ret != NJS_OK)) {
+                goto failed;
             }
 
-            u &= 0x07;
-            overlong = 0x00FFFF;
-            n = 3;
+            ctx->lower = 0x00;
+        }
+
+        goto decode;
+    }
+
+    c = *(*start)++;
+
+    if (c < 0x80) {
+        return c;
 
-        } else {
-            u &= 0x0F;
-            overlong = 0x07FF;
-            n = 2;
+    } else if (c <= 0xDF) {
+        if (c < 0xC2) {
+            return NJS_UNICODE_ERROR;
+        }
+
+        need = 1;
+        ctx->codepoint = c & 0x1F;
+
+    } else if (c < 0xF0) {
+        need = 2;
+        ctx->codepoint = c & 0x0F;
+
+        if (*start == end) {
+            njs_utf8_boundary_set(ctx, c, 0xE0, 0xED, 0xA0, 0x9F);
+            goto next;
         }
 
-    } else if (u >= 0xC2) {
+        ret = NJS_OK;
+
+        if (c == 0xE0) {
+            ret = njs_utf8_boundary(ctx, start, &need, 0xA0, 0xBF);
 
-        /* 0x80 is encoded as 0xC2 0x80. */
+        } else if (c == 0xED) {
+            ret = njs_utf8_boundary(ctx, start, &need, 0x80, 0x9F);
+        }
+
+        if (njs_slow_path(ret != NJS_OK)) {
+            goto failed;
+        }
+
+    } else if (c < 0xF5) {
+        need = 3;
+        ctx->codepoint = c & 0x07;
 
-        u &= 0x1F;
-        overlong = 0x007F;
-        n = 1;
+        if (*start == end) {
+            njs_utf8_boundary_set(ctx, c, 0xF0, 0xF4, 0x90, 0x8F);
+            goto next;
+        }
+
+        ret = NJS_OK;
+
+        if (c == 0xF0) {
+            ret = njs_utf8_boundary(ctx, start, &need, 0x90, 0xBF);
+
+        } else if (c == 0xF4) {
+            ret = njs_utf8_boundary(ctx, start, &need, 0x80, 0x8F);
+        }
+
+        if (njs_slow_path(ret != NJS_OK)) {
+            goto failed;
+        }
 
     } else {
-        /* u <= 0xC2 */
-        goto fail_one;
+        return NJS_UNICODE_ERROR;
     }
 
-    p++;
+decode:
+
+    for (p = *start; p < end; p++) {
+        c = *p;
 
-    while (p < end && n != 0) {
-        c = *p++;
-        /*
-         * The byte must in the 0x80 - 0xBF range.
-         * Values below 0x80 become >= 0x80.
-         */
-        c = c - 0x80;
+        if (c < 0x80 || c > 0xBF) {
+            *start = p;
 
-        if (njs_slow_path(c > 0x3F)) {
-            *start = --p;
-            return NJS_UTF8_REPLACEMENT;
+            goto failed;
         }
 
-        u = (u << 6) | c;
-        n--;
+        ctx->codepoint = (ctx->codepoint << 6) | (c & 0x3F);
+
+        if (--need == 0) {
+            *start = p + 1;
+
+            return ctx->codepoint;
+        }
     }
 
     *start = p;
 
-    if (n == 0 && overlong < u && u < 0x110000) {
-        return u;
-    }
+next:
 
-    return NJS_UTF8_REPLACEMENT;
+    ctx->need = need;


More information about the nginx-devel mailing list