[njs] Improved UTF-8 encoding/decoding.

Alexander Borisov alexander.borisov at nginx.com
Wed Aug 26 18:26:57 UTC 2020


details:   https://hg.nginx.org/njs/rev/b98eb205a37b
branches:  
changeset: 1505:b98eb205a37b
user:      Alexander Borisov <alexander.borisov at nginx.com>
date:      Wed Aug 26 21:05:46 2020 +0300
description:
Improved UTF-8 encoding/decoding.

diffstat:

 src/njs_encoding.c |  135 ++++++----------------------------------------------
 src/njs_parser.c   |   18 +-----
 src/njs_utf8.c     |  101 ++++++++++++++++++++++++---------------
 src/njs_utf8.h     |   37 ++++++++++++-
 4 files changed, 117 insertions(+), 174 deletions(-)

diffs (443 lines):

diff -r 657d446001da -r b98eb205a37b src/njs_encoding.c
--- a/src/njs_encoding.c	Wed Aug 26 14:56:47 2020 +0000
+++ b/src/njs_encoding.c	Wed Aug 26 21:05:46 2020 +0300
@@ -18,7 +18,6 @@ typedef struct {
     njs_bool_t            fatal;
     njs_bool_t            ignore_bom;
 
-    uint32_t              codepoint;
     njs_unicode_decode_t  ctx;
 } njs_encoding_decode_t;
 
@@ -87,11 +86,10 @@ njs_text_encoder_encode(njs_vm_t *vm, nj
     njs_index_t unused)
 {
     u_char                *dst;
-    int64_t               size;
-    uint32_t              cp;
+    size_t                size;
     njs_int_t             ret;
     njs_value_t           *this, *input, value;
-    const u_char          *p, *start, *end;
+    const u_char          *start, *end;
     njs_string_prop_t     prop;
     njs_typed_array_t     *array;
     njs_unicode_decode_t  ctx;
@@ -126,30 +124,9 @@ njs_text_encoder_encode(njs_vm_t *vm, nj
         end = start + prop.size;
     }
 
-    p = start;
-
-    cp = 0;
-    size = 0;
-
     njs_utf8_decode_init(&ctx);
 
-    while (p < end) {
-        cp = njs_utf8_decode(&ctx, &p, end);
-
-        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
-            if (cp == NJS_UNICODE_CONTINUE) {
-                continue;
-            }
-
-            cp = NJS_UNICODE_REPLACEMENT;
-        }
-
-        size += njs_utf8_size(cp);
-    }
-
-    if (cp == NJS_UNICODE_CONTINUE) {
-        size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
-    }
+    (void) njs_utf8_stream_length(&ctx, start, end - start, 1, 0, &size);
 
     njs_set_number(&value, size);
 
@@ -161,23 +138,7 @@ njs_text_encoder_encode(njs_vm_t *vm, nj
     dst = njs_typed_array_buffer(array)->u.u8;
     njs_utf8_decode_init(&ctx);
 
-    while (start < end) {
-        cp = njs_utf8_decode(&ctx, &start, end);
-
-        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
-            if (cp == NJS_UNICODE_CONTINUE) {
-                continue;
-            }
-
-            cp = NJS_UNICODE_REPLACEMENT;
-        }
-
-        dst = njs_utf8_encode(dst, cp);
-    }
-
-    if (cp == NJS_UNICODE_CONTINUE) {
-        (void) njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
-    }
+    (void) njs_utf8_stream_encode(&ctx, start, end, dst, 1, 0);
 
     njs_set_typed_array(&vm->retval, array);
 
@@ -410,7 +371,6 @@ njs_text_decoder_constructor(njs_vm_t *v
         return ret;
     }
 
-    data->codepoint = 0;
     njs_utf8_decode_init(&data->ctx);
 
     njs_set_data(&ov->value, data, NJS_DATA_TAG_TEXT_DECODER);
@@ -573,12 +533,12 @@ njs_text_decoder_decode(njs_vm_t *vm, nj
     njs_index_t unused)
 {
     u_char                   *dst;
-    uint32_t                 length, cp;
-    uint64_t                 size;
+    size_t                   size;
+    ssize_t                  length;
     njs_int_t                ret;
     njs_bool_t               stream;
     njs_value_t              retval, *this, *typed_array, *options;
-    const u_char             *start, *end, *p;
+    const u_char             *start, *end;
     njs_unicode_decode_t     ctx;
     njs_encoding_decode_t    *data;
     const njs_typed_array_t  *array;
@@ -632,52 +592,18 @@ njs_text_decoder_decode(njs_vm_t *vm, nj
     data = njs_object_data(this);
 
     ctx = data->ctx;
-    cp = data->codepoint;
-
-    size = 0;
-    length = 0;
-
-    p = start;
 
     /* Looking for BOM. */
 
-    if (!data->ignore_bom && p + 3 <= end) {
-        cp = njs_utf8_decode(&ctx, &p, end);
-
-        if (cp == NJS_UNICODE_BOM) {
-            start = p;
-
-        } else {
-            p = start;
-        }
+    if (!data->ignore_bom) {
+        start += njs_utf8_bom(start, end);
     }
 
-    while (p < end) {
-        cp = njs_utf8_decode(&ctx, &p, end);
-
-        if (njs_slow_path(cp > NJS_UNICODE_MAX_CODEPOINT)) {
-            if (cp == NJS_UNICODE_CONTINUE) {
-                break;
-            }
-
-            if (data->fatal) {
-                goto fatal;
-            }
-
-            cp = NJS_UNICODE_REPLACEMENT;
-        }
-
-        size += njs_utf8_size(cp);
-        length++;
-    }
-
-    if (cp == NJS_UNICODE_CONTINUE && !stream) {
-        if (data->fatal) {
-            goto fatal;
-        }
-
-        size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
-        length++;
+    length = njs_utf8_stream_length(&ctx, start, end - start, !stream,
+                                    data->fatal, &size);
+    if (length == -1) {
+        njs_type_error(vm, "The encoded data was not valid");
+        return NJS_ERROR;
     }
 
     dst = njs_string_alloc(vm, &vm->retval, size, length);
@@ -685,40 +611,13 @@ njs_text_decoder_decode(njs_vm_t *vm, nj
         return NJS_ERROR;
     }
 
-    while (start < end) {
-        cp = njs_utf8_decode(&data->ctx, &start, end);
+    (void) njs_utf8_stream_encode(&data->ctx, start, end, dst, !stream, 0);
 
-        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
-            if (cp == NJS_UNICODE_CONTINUE) {
-                break;
-            }
-
-            cp = NJS_UNICODE_REPLACEMENT;
-        }
-
-        dst = njs_utf8_encode(dst, cp);
+    if (!stream) {
+        njs_utf8_decode_init(&data->ctx);
     }
 
-    if (stream) {
-        data->codepoint = cp;
-        return NJS_OK;
-    }
-
-    if (cp == NJS_UNICODE_CONTINUE) {
-        (void) njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
-    }
-
-    data->codepoint = 0;
-
-    njs_utf8_decode_init(&data->ctx);
-
     return NJS_OK;
-
-fatal:
-
-    njs_type_error(vm, "The encoded data was not valid");
-
-    return NJS_ERROR;
 }
 
 
diff -r 657d446001da -r b98eb205a37b src/njs_parser.c
--- a/src/njs_parser.c	Wed Aug 26 14:56:47 2020 +0000
+++ b/src/njs_parser.c	Wed Aug 26 21:05:46 2020 +0300
@@ -7897,15 +7897,16 @@ njs_parser_string_create(njs_vm_t *vm, n
     njs_value_t *value)
 {
     u_char                *dst;
-    ssize_t               size, length;
-    uint32_t              cp;
+    size_t                size, length;
     njs_str_t             *src;
     const u_char          *p, *end;
     njs_unicode_decode_t  ctx;
 
     src = &token->text;
 
-    length = njs_utf8_safe_length(src->start, src->length, &size);
+    njs_utf8_decode_init(&ctx);
+
+    length = njs_utf8_stream_length(&ctx, src->start, src->length, 1, 0, &size);
 
     dst = njs_string_alloc(vm, value, size, length);
     if (njs_slow_path(dst == NULL)) {
@@ -7917,16 +7918,7 @@ njs_parser_string_create(njs_vm_t *vm, n
 
     njs_utf8_decode_init(&ctx);
 
-    while (p < end) {
-        cp = njs_utf8_decode(&ctx, &p, end);
-
-        if (cp <= NJS_UNICODE_MAX_CODEPOINT) {
-            dst = njs_utf8_encode(dst, cp);
-
-        } else {
-            dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
-        }
-    }
+    (void) njs_utf8_stream_encode(&ctx, p, end, dst, 1, 0);
 
     if (length > NJS_STRING_MAP_STRIDE && size != length) {
         njs_string_offset_map_init(value->long_string.data->start, size);
diff -r 657d446001da -r b98eb205a37b src/njs_utf8.c
--- a/src/njs_utf8.c	Wed Aug 26 14:56:47 2020 +0000
+++ b/src/njs_utf8.c	Wed Aug 26 21:05:46 2020 +0300
@@ -213,6 +213,43 @@ failed:
     return NJS_UNICODE_ERROR;
 }
 
+
+u_char *
+njs_utf8_stream_encode(njs_unicode_decode_t *ctx, const u_char *start,
+    const u_char *end, u_char *dst, njs_bool_t last, njs_bool_t fatal)
+{
+    uint32_t  cp;
+
+    while (start < end) {
+        cp = njs_utf8_decode(ctx, &start, end);
+
+        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+            if (cp == NJS_UNICODE_CONTINUE) {
+                break;
+            }
+
+            if (fatal) {
+                return NULL;
+            }
+
+            cp = NJS_UNICODE_REPLACEMENT;
+        }
+
+        dst = njs_utf8_encode(dst, cp);
+    }
+
+    if (last && ctx->need != 0x00) {
+        if (fatal) {
+            return NULL;
+        }
+
+        dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
+    }
+
+    return dst;
+}
+
+
 /*
  * njs_utf8_casecmp() tests only up to the minimum of given lengths, but
  * requires lengths of both strings because otherwise njs_utf8_decode()
@@ -314,57 +351,43 @@ njs_utf8_upper_case(const u_char **start
 
 
 ssize_t
-njs_utf8_length(const u_char *p, size_t len)
+njs_utf8_stream_length(njs_unicode_decode_t *ctx, const u_char *p, size_t len,
+    njs_bool_t last, njs_bool_t fatal, size_t *out_size)
 {
-    ssize_t               length;
-    const u_char          *end;
-    njs_unicode_decode_t  ctx;
-
-    length = 0;
-
-    end = p + len;
-
-    njs_utf8_decode_init(&ctx);
-
-    while (p < end) {
-        if (njs_slow_path(njs_utf8_decode(&ctx, &p, end)
-                          > NJS_UNICODE_MAX_CODEPOINT))
-        {
-            return -1;
-        }
-
-        length++;
-    }
-
-    return length;
-}
-
-
-ssize_t
-njs_utf8_safe_length(const u_char *p, size_t len, ssize_t *out_size)
-{
-    ssize_t               size, length;
-    uint32_t              codepoint;
-    const u_char          *end;
-    njs_unicode_decode_t  ctx;
+    size_t        size, length;
+    uint32_t      codepoint;
+    const u_char  *end;
 
     size = 0;
     length = 0;
 
     end = p + len;
 
-    njs_utf8_decode_init(&ctx);
-
     while (p < end) {
-        codepoint = njs_utf8_decode(&ctx, &p, end);
+        codepoint = njs_utf8_decode(ctx, &p, end);
 
-        if (codepoint <= NJS_UNICODE_MAX_CODEPOINT) {
-            size += njs_utf8_size(codepoint);
+        if (codepoint > NJS_UNICODE_MAX_CODEPOINT) {
+            if (codepoint == NJS_UNICODE_CONTINUE) {
+                break;
+            }
 
-        } else {
-            size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
+            if (fatal) {
+                return -1;
+            }
+
+            codepoint = NJS_UNICODE_REPLACEMENT;
         }
 
+        size += njs_utf8_size(codepoint);
+        length++;
+    }
+
+    if (last && ctx->need != 0x00) {
+        if (fatal) {
+            return -1;
+        }
+
+        size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
         length++;
     }
 
diff -r 657d446001da -r b98eb205a37b src/njs_utf8.h
--- a/src/njs_utf8.h	Wed Aug 26 14:56:47 2020 +0000
+++ b/src/njs_utf8.h	Wed Aug 26 21:05:46 2020 +0300
@@ -8,18 +8,21 @@
 #define _NJS_UTF8_H_INCLUDED_
 
 
-NJS_EXPORT u_char *njs_utf8_encode(u_char *p, uint32_t u);
 NJS_EXPORT uint32_t njs_utf8_decode(njs_unicode_decode_t *ctx,
     const u_char **data, const u_char *end);
+NJS_EXPORT u_char *njs_utf8_encode(u_char *p, uint32_t u);
+NJS_EXPORT u_char *njs_utf8_stream_encode(njs_unicode_decode_t *ctx,
+    const u_char *start, const u_char *end, u_char *dst, njs_bool_t last,
+    njs_bool_t fatal);
 NJS_EXPORT njs_int_t njs_utf8_casecmp(const u_char *start1,
     const u_char *start2, size_t len1, size_t len2);
 NJS_EXPORT uint32_t njs_utf8_lower_case(const u_char **start,
     const u_char *end);
 NJS_EXPORT uint32_t njs_utf8_upper_case(const u_char **start,
     const u_char *end);
-NJS_EXPORT ssize_t njs_utf8_length(const u_char *p, size_t len);
-NJS_EXPORT ssize_t njs_utf8_safe_length(const u_char *p, size_t len,
-    ssize_t *out_size);
+NJS_EXPORT ssize_t njs_utf8_stream_length(njs_unicode_decode_t *ctx,
+    const u_char *p, size_t len, njs_bool_t last, njs_bool_t fatal,
+    size_t *out_size);
 NJS_EXPORT njs_bool_t njs_utf8_is_valid(const u_char *p, size_t len);
 
 
@@ -119,6 +122,32 @@ njs_utf8_decode_init(njs_unicode_decode_
 }
 
 
+njs_inline ssize_t
+njs_utf8_length(const u_char *p, size_t len)
+{
+    njs_unicode_decode_t  ctx;
+
+    njs_utf8_decode_init(&ctx);
+
+    return njs_utf8_stream_length(&ctx, p, len, 1, 1, NULL);
+}
+
+
+njs_inline size_t
+njs_utf8_bom(const u_char *start, const u_char *end)
+{
+    if (start + 3 > end) {
+        return 0;
+    }
+
+    if (start[0] == 0xEF && start[1] == 0xBB && start[2] == 0xBF) {
+        return 3;
+    }
+
+    return 0;
+}
+
+
 njs_inline size_t
 njs_utf8_size(uint32_t cp)
 {


More information about the nginx-devel mailing list