[njs] Introduced UTF-16 according to WHATWG encoding spec.

Alexander Borisov alexander.borisov at nginx.com
Wed Jul 15 16:20:42 UTC 2020


details:   https://hg.nginx.org/njs/rev/63106bd2e9bf
branches:  
changeset: 1471:63106bd2e9bf
user:      Alexander Borisov <alexander.borisov at nginx.com>
date:      Wed Jul 15 19:19:18 2020 +0300
description:
Introduced UTF-16 according to WHATWG encoding spec.

diffstat:

 auto/make                    |    4 +-
 auto/sources                 |    3 +-
 src/njs_main.h               |    2 +
 src/njs_unicode.h            |   23 +++
 src/njs_utf16.c              |  116 +++++++++++++++
 src/njs_utf16.h              |   25 +++
 src/test/unicode_unit_test.c |  312 +++++++++++++++++++++++++++++++++++++++++++
 src/test/utf8_unit_test.c    |  202 ---------------------------
 8 files changed, 482 insertions(+), 205 deletions(-)

diffs (749 lines):

diff -r c39329b57a06 -r 63106bd2e9bf auto/make
--- a/auto/make	Wed Jul 15 15:34:16 2020 +0000
+++ b/auto/make	Wed Jul 15 19:19:18 2020 +0300
@@ -241,12 +241,12 @@ lib_test: $NJS_BUILD_DIR/njs_auto_config
 	$NJS_BUILD_DIR/random_unit_test \\
 	$NJS_BUILD_DIR/rbtree_unit_test \\
 	$NJS_BUILD_DIR/lvlhsh_unit_test \\
-	$NJS_BUILD_DIR/utf8_unit_test
+	$NJS_BUILD_DIR/unicode_unit_test
 
 	$NJS_BUILD_DIR/random_unit_test
 	$NJS_BUILD_DIR/rbtree_unit_test
 	$NJS_BUILD_DIR/lvlhsh_unit_test
-	$NJS_BUILD_DIR/utf8_unit_test
+	$NJS_BUILD_DIR/unicode_unit_test
 
 unit_test: $NJS_BUILD_DIR/njs_auto_config.h \\
 	$NJS_BUILD_DIR/njs_unit_test
diff -r c39329b57a06 -r 63106bd2e9bf auto/sources
--- a/auto/sources	Wed Jul 15 15:34:16 2020 +0000
+++ b/auto/sources	Wed Jul 15 19:19:18 2020 +0300
@@ -6,6 +6,7 @@ NJS_LIB_SRCS=" \
    src/njs_murmur_hash.c \
    src/njs_djb_hash.c \
    src/njs_utf8.c \
+   src/njs_utf16.c \
    src/njs_arr.c \
    src/njs_rbtree.c \
    src/njs_lvlhsh.c \
@@ -60,7 +61,7 @@ NJS_LIB_TEST_SRCS=" \
    src/test/lvlhsh_unit_test.c \
    src/test/random_unit_test.c \
    src/test/rbtree_unit_test.c \
-   src/test/utf8_unit_test.c \
+   src/test/unicode_unit_test.c \
 "
 
 NJS_TEST_SRCS=" \
diff -r c39329b57a06 -r 63106bd2e9bf src/njs_main.h
--- a/src/njs_main.h	Wed Jul 15 15:34:16 2020 +0000
+++ b/src/njs_main.h	Wed Jul 15 19:19:18 2020 +0300
@@ -14,7 +14,9 @@
 #include <njs_types.h>
 #include <njs_clang.h>
 #include <njs_str.h>
+#include <njs_unicode.h>
 #include <njs_utf8.h>
+#include <njs_utf16.h>
 #include <njs_diyfp.h>
 #include <njs_dtoa.h>
 #include <njs_dtoa_fixed.h>
diff -r c39329b57a06 -r 63106bd2e9bf src/njs_unicode.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/njs_unicode.h	Wed Jul 15 19:19:18 2020 +0300
@@ -0,0 +1,23 @@
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+#ifndef _NJS_UNICODE_H_INCLUDED_
+#define _NJS_UNICODE_H_INCLUDED_
+
+
+enum {
+    NJS_UNICODE_MAX_CODEPOINT = 0x10FFFF,
+    NJS_UNICODE_ERROR         = 0x1FFFFF,
+    NJS_UNICODE_CONTINUE      = 0x2FFFFF
+};
+
+typedef struct {
+    uint32_t  codepoint;
+    u_char    upper;
+} njs_unicode_decode_t;
+
+
+#endif /* _NJS_UNICODE_H_INCLUDED_ */
diff -r c39329b57a06 -r 63106bd2e9bf src/njs_utf16.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/njs_utf16.c	Wed Jul 15 19:19:18 2020 +0300
@@ -0,0 +1,116 @@
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+
+#include <njs_main.h>
+
+
+njs_inline void
+njs_utf16_encode_write(uint32_t cp, u_char **start)
+{
+#ifdef NJS_HAVE_BIG_ENDIAN
+        *(*start)++ = cp >> 8;
+        *(*start)++ = cp & 0x00FF;
+#else
+        *(*start)++ = cp & 0x00FF;
+        *(*start)++ = cp >> 8;
+#endif
+}
+
+
+ssize_t
+njs_utf16_encode(uint32_t cp, u_char **start, const u_char *end)
+{
+    if ((*start + 2) > end) {
+        return NJS_ERROR;
+    }
+
+    if (cp < 0x10000) {
+        njs_utf16_encode_write(cp, start);
+
+        return 2;
+    }
+
+    if ((*start + 4) > end) {
+        return NJS_ERROR;
+    }
+
+    cp -= 0x10000;
+
+    njs_utf16_encode_write((0xD800 | (cp >> 0x0A)), start);
+    njs_utf16_encode_write((0xDC00 | (cp & 0x03FF)), start);
+
+    return 4;
+}
+
+
+uint32_t
+njs_utf16_decode(njs_unicode_decode_t *ctx, const u_char **start,
+    const u_char *end)
+{
+    uint32_t  unit;
+    unsigned  lead;
+
+    if (ctx->upper != 0x00) {
+        lead = ctx->upper - 0x01;
+        ctx->upper = 0x00;
+
+        goto lead_state;
+    }
+
+pair_state:
+
+    lead = *(*start)++;
+
+    if (*start >= end) {
+        ctx->upper = lead + 0x01;
+        return NJS_UNICODE_CONTINUE;
+    }
+
+lead_state:
+
+#ifdef NJS_HAVE_BIG_ENDIAN
+        unit = (lead << 8) + *(*start)++;
+#else
+        unit = (*(*start)++ << 8) + lead;
+#endif
+
+    if (ctx->codepoint != 0x00) {
+        if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
+            unit = 0x10000 + ((ctx->codepoint - 0xD800) << 10)
+                   + (unit - 0xDC00);
+
+            ctx->codepoint = 0x00;
+
+            return unit;
+        }
+
+        (*start)--;
+
+        ctx->upper = lead + 0x01;
+        ctx->codepoint = 0x00;
+
+        return NJS_UNICODE_ERROR;
+    }
+
+    /* Surrogate pair. */
+
+    if ((unsigned) (unit - 0xD800) <= (0xDFFF - 0xD800)) {
+        if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
+            return NJS_UNICODE_ERROR;
+        }
+
+        ctx->codepoint = unit;
+
+        if (*start >= end) {
+            return NJS_UNICODE_CONTINUE;
+        }
+
+        goto pair_state;
+    }
+
+    return unit;
+}
diff -r c39329b57a06 -r 63106bd2e9bf src/njs_utf16.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/njs_utf16.h	Wed Jul 15 19:19:18 2020 +0300
@@ -0,0 +1,25 @@
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+#ifndef _NJS_UTF16_H_INCLUDED_
+#define _NJS_UTF16_H_INCLUDED_
+
+
+NJS_EXPORT ssize_t njs_utf16_encode(uint32_t cp, u_char **start,
+    const u_char *end);
+NJS_EXPORT uint32_t njs_utf16_decode(njs_unicode_decode_t *ctx,
+    const u_char **start, const u_char *end);
+
+
+njs_inline void
+njs_utf16_decode_init(njs_unicode_decode_t *ctx)
+{
+    ctx->upper = 0x00;
+    ctx->codepoint = 0x00;
+}
+
+
+#endif /* _NJS_UTF16_H_INCLUDED_ */
diff -r c39329b57a06 -r 63106bd2e9bf src/test/unicode_unit_test.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/unicode_unit_test.c	Wed Jul 15 19:19:18 2020 +0300
@@ -0,0 +1,312 @@
+
+/*
+ * Copyright (C) Igor Sysoev
+ * Copyright (C) NGINX, Inc.
+ */
+
+
+#include <njs_main.h>
+
+
+#define NJS_UTF8_START_TEST  0xC2
+
+
+static u_char  invalid[] = {
+
+    /* Invalid first byte less than 0xC2. */
+    1, 0x80, 0x00, 0x00, 0x00,
+    1, 0xC0, 0x00, 0x00, 0x00,
+    2, 0xC0, 0x00, 0x00, 0x00,
+    3, 0xC0, 0x00, 0x00, 0x00,
+    4, 0xC0, 0x00, 0x00, 0x00,
+
+    /* Invalid 0x110000 value. */
+    4, 0xF4, 0x90, 0x80, 0x80,
+
+    /* Incomplete length. */
+    2, 0xE0, 0xAF, 0xB5, 0x00,
+
+    /* Overlong values. */
+    2, 0xC0, 0x80, 0x00, 0x00,
+    2, 0xC1, 0xB3, 0x00, 0x00,
+    3, 0xE0, 0x80, 0x80, 0x00,
+    3, 0xE0, 0x81, 0xB3, 0x00,
+    3, 0xE0, 0x90, 0x9A, 0x00,
+    4, 0xF0, 0x80, 0x8A, 0x80,
+    4, 0xF0, 0x80, 0x81, 0xB3,
+    4, 0xF0, 0x80, 0xAF, 0xB5,
+};
+
+
+static njs_int_t
+utf8_overlong(u_char *overlong, size_t len)
+{
+    u_char        *p, utf8[4];
+    size_t        size;
+    uint32_t      u, d;
+    njs_uint_t    i;
+    const u_char  *pp;
+
+    pp = overlong;
+
+    d = njs_utf8_decode(&pp, overlong + len);
+
+    len = pp - overlong;
+
+    if (d != 0xFFFFFFFF) {
+        p = njs_utf8_encode(utf8, d);
+
+        size = (p != NULL) ? p - utf8 : 0;
+
+        if (len != size || memcmp(overlong, utf8, size) != 0) {
+
+            u = 0;
+            for (i = 0; i < len; i++) {
+                u = (u << 8) + overlong[i];
+            }
+
+            njs_printf("njs_utf8_decode(%05uXD, %uz) failed: %05uXD, %uz\n",
+                       u, len, d, size);
+
+            return NJS_ERROR;
+        }
+    }
+
+    return NJS_OK;
+}
+
+
+static njs_int_t
+utf8_unit_test(njs_uint_t start)
+{
+    u_char        *p, utf8[4];
+    size_t        len;
+    int32_t       n;
+    uint32_t      u, d;
+    njs_uint_t    i, k, l, m;
+    const u_char  *pp;
+
+    njs_printf("utf8 test started\n");
+
+    /* Test valid UTF-8. */
+
+    for (u = 0; u < 0x110000; u++) {
+
+        p = njs_utf8_encode(utf8, u);
+
+        if (p == NULL) {
+            njs_printf("njs_utf8_encode(%05uXD) failed\n", u);
+            return NJS_ERROR;
+        }
+
+        pp = utf8;
+
+        d = njs_utf8_decode(&pp, p);
+
+        if (u != d) {
+            njs_printf("njs_utf8_decode(%05uXD) failed: %05uxD\n", u, d);
+            return NJS_ERROR;
+        }
+    }
+
+    /* Test some invalid UTF-8. */
+
+    for (i = 0; i < sizeof(invalid); i += 5) {
+
+        len = invalid[i];
+        utf8[0] = invalid[i + 1];
+        utf8[1] = invalid[i + 2];
+        utf8[2] = invalid[i + 3];
+        utf8[3] = invalid[i + 4];
+
+        pp = utf8;
+
+        d = njs_utf8_decode(&pp, utf8 + len);
+
+        if (d != 0xFFFFFFFF) {
+
+            u = 0;
+            for (i = 0; i < len; i++) {
+                u = (u << 8) + utf8[i];
+            }
+
+            njs_printf("njs_utf8_decode(%05uXD, %uz) failed: %05uXD\n",
+                       u, len, d);
+            return NJS_ERROR;
+        }
+    }
+
+    /* Test all overlong UTF-8. */
+
+    for (i = start; i < 256; i++) {
+        utf8[0] = i;
+
+        if (utf8_overlong(utf8, 1) != NJS_OK) {
+            return NJS_ERROR;
+        }
+
+        for (k = 0; k < 256; k++) {
+            utf8[1] = k;
+
+            if (utf8_overlong(utf8, 2) != NJS_OK) {
+                return NJS_ERROR;
+            }
+
+            for (l = 0; l < 256; l++) {
+                utf8[2] = l;
+
+                if (utf8_overlong(utf8, 3) != NJS_OK) {
+                    return NJS_ERROR;
+                }
+
+                for (m = 0; m < 256; m++) {
+                    utf8[3] = m;
+
+                    if (utf8_overlong(utf8, 4) != NJS_OK) {
+                        return NJS_ERROR;
+                    }
+                }
+            }
+        }
+    }
+
+    n = njs_utf8_casecmp((u_char *) "ABC АБВ ΑΒΓ",
+                         (u_char *) "abc абв αβγ",
+                         njs_length("ABC АБВ ΑΒΓ"),
+                         njs_length("abc абв αβγ"));
+
+    if (n != 0) {
+        njs_printf("njs_utf8_casecmp() failed\n");
+        return NJS_ERROR;
+    }
+
+    njs_printf("utf8 test passed\n");
+    return NJS_OK;
+}
+
+
+static njs_int_t
+utf16_unit_test()
+{
+    int8_t                length, length_to;
+    u_char                *start, *end, *end_to;
+    uint32_t              cp, i;
+    njs_unicode_decode_t  ctx;
+    u_char                buf[8], to[4];
+
+    njs_printf("utf16 test started\n");
+
+    end = buf + sizeof(buf);
+    end_to = to + sizeof(to);
+
+    for (i = 0; i <= NJS_UNICODE_MAX_CODEPOINT; i++) {
+
+        /* Skip surrogate code points (0xD800-0xDFFF). */
+
+        if (i >= 0xD800 && i <= 0xDFFF) {
+            continue;
+        }
+
+        start = buf;
+
+        length = njs_utf16_encode(i, &start, end);
+        if (length < NJS_OK) {
+            njs_printf("utf16 test encode failed\n");
+            return NJS_ERROR;
+        }
+
+        njs_utf16_decode_init(&ctx);
+
+        start = buf;
+
+        cp = njs_utf16_decode(&ctx, (const u_char **) &start, start + length);
+        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+            njs_printf("utf16 test decode failed\n");
+            return NJS_ERROR;
+        }
+
+        if (cp != i) {
+            njs_printf("utf16 test decode code point does not match\n");
+            return NJS_ERROR;
+        }
+
+        start = to;
+
+        length_to = njs_utf16_encode(cp, &start, end_to);
+        if (length_to < NJS_OK) {
+            njs_printf("utf16 test encode failed\n");
+            return NJS_ERROR;
+        }
+
+        if (length_to != length || njs_strncmp(buf, to, length) != 0) {
+            njs_printf("utf16 test decode-encode failed\n");
+            return NJS_ERROR;
+        }
+    }
+
+    /* Surrogate pair. */
+
+    for (i = 0xD800; i <= 0xDFFF; i++) {
+        start = buf;
+
+        length = njs_utf16_encode(i, &start, end);
+        if (length < NJS_OK) {
+            njs_printf("utf16 test surrogate pair encode lead failed\n");
+            return NJS_ERROR;
+        }
+
+        length_to = njs_utf16_encode(i - 0xD800 + 0xDC00, &start, end);
+        if (length_to < NJS_OK) {
+            njs_printf("utf16 test surrogate pair encode failed\n");
+            return NJS_ERROR;
+        }
+
+        njs_utf16_decode_init(&ctx);
+
+        start = buf;
+
+        cp = njs_utf16_decode(&ctx, (const u_char **) &start,
+                              start + length + length_to);
+        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+            if (i < 0xDC00) {
+                njs_printf("utf16 test surrogate pair decode failed\n");
+                return NJS_ERROR;
+            }
+        }
+    }
+
+    njs_printf("utf16 test passed\n");
+
+    return NJS_OK;
+}
+
+
+int
+main(int argc, char **argv)
+{
+    njs_int_t   ret;
+    njs_uint_t  start;
+
+    njs_printf("unicode unit test started\n");
+
+    if (argc > 1 && argv[1][0] == 'a') {
+        start = NJS_UTF8_START_TEST;
+
+    } else {
+        start = 256;
+    }
+
+    ret = utf8_unit_test(start);
+    if (ret != NJS_OK) {
+        return ret;
+    }
+
+    ret = utf16_unit_test();
+    if (ret != NJS_OK) {
+        return ret;
+    }
+
+    njs_printf("unicode unit test passed\n");
+
+    return 0;
+}
diff -r c39329b57a06 -r 63106bd2e9bf src/test/utf8_unit_test.c
--- a/src/test/utf8_unit_test.c	Wed Jul 15 15:34:16 2020 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,202 +0,0 @@
-
-/*
- * Copyright (C) Igor Sysoev
- * Copyright (C) NGINX, Inc.
- */
-
-
-#include <njs_main.h>
-
-
-#define NJS_UTF8_START_TEST  0xC2
-//#define NJS_UTF8_START_TEST  0
-
-
-static u_char  invalid[] = {
-
-    /* Invalid first byte less than 0xC2. */
-    1, 0x80, 0x00, 0x00, 0x00,
-    1, 0xC0, 0x00, 0x00, 0x00,
-    2, 0xC0, 0x00, 0x00, 0x00,
-    3, 0xC0, 0x00, 0x00, 0x00,
-    4, 0xC0, 0x00, 0x00, 0x00,
-
-    /* Invalid 0x0x110000 value. */
-    4, 0xF4, 0x90, 0x80, 0x80,
-
-    /* Incomplete length. */
-    2, 0xE0, 0xAF, 0xB5, 0x00,
-
-    /* Overlong values. */
-    2, 0xC0, 0x80, 0x00, 0x00,
-    2, 0xC1, 0xB3, 0x00, 0x00,
-    3, 0xE0, 0x80, 0x80, 0x00,
-    3, 0xE0, 0x81, 0xB3, 0x00,
-    3, 0xE0, 0x90, 0x9A, 0x00,
-    4, 0xF0, 0x80, 0x8A, 0x80,
-    4, 0xF0, 0x80, 0x81, 0xB3,
-    4, 0xF0, 0x80, 0xAF, 0xB5,
-};
-
-
-static njs_int_t
-utf8_overlong(u_char *overlong, size_t len)
-{
-    u_char        *p, utf8[4];
-    size_t        size;
-    uint32_t      u, d;
-    njs_uint_t    i;
-    const u_char  *pp;
-
-    pp = overlong;
-
-    d = njs_utf8_decode(&pp, overlong + len);
-
-    len = pp - overlong;
-
-    if (d != 0xFFFFFFFF) {
-        p = njs_utf8_encode(utf8, d);
-
-        size = (p != NULL) ? p - utf8 : 0;
-
-        if (len != size || memcmp(overlong, utf8, size) != 0) {
-
-            u = 0;
-            for (i = 0; i < len; i++) {
-                u = (u << 8) + overlong[i];
-            }
-
-            njs_printf("njs_utf8_decode(%05uXD, %uz) failed: %05uXD, %uz\n",
-                       u, len, d, size);
-
-            return NJS_ERROR;
-        }
-    }
-
-    return NJS_OK;
-}
-
-
-static njs_int_t
-utf8_unit_test(njs_uint_t start)
-{
-    u_char        *p, utf8[4];
-    size_t        len;
-    int32_t       n;
-    uint32_t      u, d;
-    njs_uint_t    i, k, l, m;
-    const u_char  *pp;
-
-    njs_printf("utf8 unit test started\n");
-
-    /* Test valid UTF-8. */
-
-    for (u = 0; u < 0x110000; u++) {
-
-        p = njs_utf8_encode(utf8, u);
-
-        if (p == NULL) {
-            njs_printf("njs_utf8_encode(%05uXD) failed\n", u);
-            return NJS_ERROR;
-        }
-
-        pp = utf8;
-
-        d = njs_utf8_decode(&pp, p);
-
-        if (u != d) {
-            njs_printf("njs_utf8_decode(%05uXD) failed: %05uxD\n", u, d);
-            return NJS_ERROR;
-        }
-    }
-
-    /* Test some invalid UTF-8. */
-
-    for (i = 0; i < sizeof(invalid); i += 5) {
-
-        len = invalid[i];
-        utf8[0] = invalid[i + 1];
-        utf8[1] = invalid[i + 2];
-        utf8[2] = invalid[i + 3];
-        utf8[3] = invalid[i + 4];
-
-        pp = utf8;
-
-        d = njs_utf8_decode(&pp, utf8 + len);
-
-        if (d != 0xFFFFFFFF) {
-
-            u = 0;
-            for (i = 0; i < len; i++) {
-                u = (u << 8) + utf8[i];
-            }
-
-            njs_printf("njs_utf8_decode(%05uXD, %uz) failed: %05uXD\n",
-                       u, len, d);
-            return NJS_ERROR;
-        }
-    }
-
-    /* Test all overlong UTF-8. */
-
-    for (i = start; i < 256; i++) {
-        utf8[0] = i;
-
-        if (utf8_overlong(utf8, 1) != NJS_OK) {
-            return NJS_ERROR;
-        }
-
-        for (k = 0; k < 256; k++) {
-            utf8[1] = k;
-
-            if (utf8_overlong(utf8, 2) != NJS_OK) {
-                return NJS_ERROR;
-            }
-
-            for (l = 0; l < 256; l++) {
-                utf8[2] = l;
-
-                if (utf8_overlong(utf8, 3) != NJS_OK) {
-                    return NJS_ERROR;
-                }
-
-                for (m = 0; m < 256; m++) {
-                    utf8[3] = m;
-
-                    if (utf8_overlong(utf8, 4) != NJS_OK) {
-                        return NJS_ERROR;
-                    }
-                }
-            }
-        }
-    }
-
-    n = njs_utf8_casecmp((u_char *) "ABC АБВ ΑΒΓ",
-                         (u_char *) "abc абв αβγ",
-                         njs_length("ABC АБВ ΑΒΓ"),
-                         njs_length("abc абв αβγ"));
-
-    if (n != 0) {
-        njs_printf("njs_utf8_casecmp() failed\n");
-        return NJS_ERROR;
-    }
-
-    njs_printf("utf8 unit test passed\n");
-    return NJS_OK;
-}
-
-
-int
-main(int argc, char **argv)
-{
-    njs_uint_t  start;
-
-    if (argc > 1 && argv[1][0] == 'a') {
-        start = NJS_UTF8_START_TEST;
-
-    } else {
-        start = 256;
-    }
-
-    return utf8_unit_test(start);
-}


More information about the nginx-devel mailing list