[njs] Introduced UTF-16 according to WHATWG encoding spec.
Alexander Borisov
alexander.borisov at nginx.com
Wed Jul 15 16:20:42 UTC 2020
details: https://hg.nginx.org/njs/rev/63106bd2e9bf
branches:
changeset: 1471:63106bd2e9bf
user: Alexander Borisov <alexander.borisov at nginx.com>
date: Wed Jul 15 19:19:18 2020 +0300
description:
Introduced UTF-16 according to WHATWG encoding spec.
diffstat:
auto/make | 4 +-
auto/sources | 3 +-
src/njs_main.h | 2 +
src/njs_unicode.h | 23 +++
src/njs_utf16.c | 116 +++++++++++++++
src/njs_utf16.h | 25 +++
src/test/unicode_unit_test.c | 312 +++++++++++++++++++++++++++++++++++++++++++
src/test/utf8_unit_test.c | 202 ---------------------------
8 files changed, 482 insertions(+), 205 deletions(-)
diffs (749 lines):
diff -r c39329b57a06 -r 63106bd2e9bf auto/make
--- a/auto/make Wed Jul 15 15:34:16 2020 +0000
+++ b/auto/make Wed Jul 15 19:19:18 2020 +0300
@@ -241,12 +241,12 @@ lib_test: $NJS_BUILD_DIR/njs_auto_config
$NJS_BUILD_DIR/random_unit_test \\
$NJS_BUILD_DIR/rbtree_unit_test \\
$NJS_BUILD_DIR/lvlhsh_unit_test \\
- $NJS_BUILD_DIR/utf8_unit_test
+ $NJS_BUILD_DIR/unicode_unit_test
$NJS_BUILD_DIR/random_unit_test
$NJS_BUILD_DIR/rbtree_unit_test
$NJS_BUILD_DIR/lvlhsh_unit_test
- $NJS_BUILD_DIR/utf8_unit_test
+ $NJS_BUILD_DIR/unicode_unit_test
unit_test: $NJS_BUILD_DIR/njs_auto_config.h \\
$NJS_BUILD_DIR/njs_unit_test
diff -r c39329b57a06 -r 63106bd2e9bf auto/sources
--- a/auto/sources Wed Jul 15 15:34:16 2020 +0000
+++ b/auto/sources Wed Jul 15 19:19:18 2020 +0300
@@ -6,6 +6,7 @@ NJS_LIB_SRCS=" \
src/njs_murmur_hash.c \
src/njs_djb_hash.c \
src/njs_utf8.c \
+ src/njs_utf16.c \
src/njs_arr.c \
src/njs_rbtree.c \
src/njs_lvlhsh.c \
@@ -60,7 +61,7 @@ NJS_LIB_TEST_SRCS=" \
src/test/lvlhsh_unit_test.c \
src/test/random_unit_test.c \
src/test/rbtree_unit_test.c \
- src/test/utf8_unit_test.c \
+ src/test/unicode_unit_test.c \
"
NJS_TEST_SRCS=" \
diff -r c39329b57a06 -r 63106bd2e9bf src/njs_main.h
--- a/src/njs_main.h Wed Jul 15 15:34:16 2020 +0000
+++ b/src/njs_main.h Wed Jul 15 19:19:18 2020 +0300
@@ -14,7 +14,9 @@
#include <njs_types.h>
#include <njs_clang.h>
#include <njs_str.h>
+#include <njs_unicode.h>
#include <njs_utf8.h>
+#include <njs_utf16.h>
#include <njs_diyfp.h>
#include <njs_dtoa.h>
#include <njs_dtoa_fixed.h>
diff -r c39329b57a06 -r 63106bd2e9bf src/njs_unicode.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/njs_unicode.h Wed Jul 15 19:19:18 2020 +0300
@@ -0,0 +1,23 @@
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+#ifndef _NJS_UNICODE_H_INCLUDED_
+#define _NJS_UNICODE_H_INCLUDED_
+
+
+/*
+ * Unicode limit and decoder status values.  NJS_UNICODE_ERROR and
+ * NJS_UNICODE_CONTINUE are both greater than NJS_UNICODE_MAX_CODEPOINT,
+ * so a decoder can return them in place of a code point: any result above
+ * the maximum code point is a status, not a character.
+ */
+enum {
+    /* Highest valid code point. */
+    NJS_UNICODE_MAX_CODEPOINT = 0x10FFFF,
+    /* Returned by the decoder for a malformed sequence. */
+    NJS_UNICODE_ERROR = 0x1FFFFF,
+    /* Returned by the decoder when the input ended before a result. */
+    NJS_UNICODE_CONTINUE = 0x2FFFFF
+};
+
+/*
+ * Decoder state kept between calls so that input may arrive in chunks.
+ *
+ * codepoint - pending high surrogate, or 0x00 when none is pending.
+ * upper     - first byte of a 16-bit unit split across chunks, stored as
+ *             "byte + 1", or 0x00 when none is pending.  The stored value
+ *             ranges over 0x01..0x100, so the field has to be wider than
+ *             8 bits: in a u_char a pending 0xFF byte would wrap to 0x00
+ *             and be silently lost.
+ */
+typedef struct {
+    uint32_t  codepoint;
+    uint16_t  upper;
+} njs_unicode_decode_t;
+
+
+#endif /* _NJS_UNICODE_H_INCLUDED_ */
diff -r c39329b57a06 -r 63106bd2e9bf src/njs_utf16.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/njs_utf16.c Wed Jul 15 19:19:18 2020 +0300
@@ -0,0 +1,116 @@
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+
+#include <njs_main.h>
+
+
+/*
+ * Stores the 16-bit unit held in "cp" as two bytes at *start and advances
+ * *start by two.  The most significant byte goes first when
+ * NJS_HAVE_BIG_ENDIAN is defined, the least significant byte otherwise.
+ * No bounds checking is done here.
+ */
+njs_inline void
+njs_utf16_encode_write(uint32_t cp, u_char **start)
+{
+    u_char  *out, high, low;
+
+    out = *start;
+
+    high = cp >> 8;
+    low = cp & 0x00FF;
+
+#ifdef NJS_HAVE_BIG_ENDIAN
+    out[0] = high;
+    out[1] = low;
+#else
+    out[0] = low;
+    out[1] = high;
+#endif
+
+    *start = out + 2;
+}
+
+
+/*
+ * Encodes the code point "cp" as UTF-16 at *start and advances *start past
+ * the bytes written.
+ *
+ * Values below 0x10000 are written as a single 16-bit unit; surrogate
+ * values (0xD800..0xDFFF) are not rejected and are written as is.  Larger
+ * values are written as a high/low surrogate pair.
+ *
+ * Returns the number of bytes written (2 or 4), or NJS_ERROR when fewer
+ * bytes than required remain before "end"; in that case nothing is written
+ * and *start is left untouched.
+ *
+ * NOTE(review): "cp" is not range checked; presumably callers never pass
+ * more than NJS_UNICODE_MAX_CODEPOINT - confirm.
+ */
+ssize_t
+njs_utf16_encode(uint32_t cp, u_char **start, const u_char *end)
+{
+    /*
+     * The remaining room is compared as a pointer difference: forming
+     * "*start + N" could yield a pointer beyond one past the end of the
+     * buffer, which is undefined.
+     */
+
+    if (end - *start < 2) {
+        return NJS_ERROR;
+    }
+
+    if (cp < 0x10000) {
+        njs_utf16_encode_write(cp, start);
+
+        return 2;
+    }
+
+    if (end - *start < 4) {
+        return NJS_ERROR;
+    }
+
+    /* Split the 20 bits above 0x10000 into two 10-bit surrogate payloads. */
+
+    cp -= 0x10000;
+
+    njs_utf16_encode_write((0xD800 | (cp >> 0x0A)), start);
+    njs_utf16_encode_write((0xDC00 | (cp & 0x03FF)), start);
+
+    return 4;
+}
+
+
+/*
+ * Decodes one code point from the UTF-16 bytes at *start and advances
+ * *start past the bytes consumed.  "ctx" carries state between calls so
+ * that the input may be supplied in chunks; both of its fields have to be
+ * zero before the first call (see njs_utf16_decode_init()).
+ *
+ * Returns:
+ *   a code point on success;
+ *   NJS_UNICODE_CONTINUE when the input is empty, ends in the middle of a
+ *     16-bit unit, or ends right after a high surrogate (the partial
+ *     result is kept in "ctx");
+ *   NJS_UNICODE_ERROR for a low surrogate that is not preceded by a high
+ *     surrogate, or for a high surrogate that is not followed by a low
+ *     surrogate.  In the latter case the offending unit is pushed back so
+ *     that the next call decodes it again.
+ */
+uint32_t
+njs_utf16_decode(njs_unicode_decode_t *ctx, const u_char **start,
+    const u_char *end)
+{
+    uint32_t  unit;
+    unsigned  lead;
+
+    /* Nothing to read: keep the saved state and ask for more input. */
+
+    if (*start >= end) {
+        return NJS_UNICODE_CONTINUE;
+    }
+
+    if (ctx->upper != 0x00) {
+        /* Resume with the first byte of a unit saved by an earlier call. */
+
+        lead = ctx->upper - 0x01;
+        ctx->upper = 0x00;
+
+        goto lead_state;
+    }
+
+pair_state:
+
+    lead = *(*start)++;
+
+    if (*start >= end) {
+        /*
+         * The unit is split across chunks.  The byte is saved as
+         * "byte + 1" (0x01..0x100) so that 0x00 can mean "nothing saved";
+         * ctx->upper has to be able to hold 0x100 without truncation.
+         */
+
+        ctx->upper = lead + 0x01;
+        return NJS_UNICODE_CONTINUE;
+    }
+
+lead_state:
+
+    /* Combine the saved first byte with the second byte of the unit. */
+
+#ifdef NJS_HAVE_BIG_ENDIAN
+    unit = (lead << 8) + *(*start)++;
+#else
+    unit = (*(*start)++ << 8) + lead;
+#endif
+
+    if (ctx->codepoint != 0x00) {
+        /* A high surrogate is pending: a low surrogate must follow. */
+
+        if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
+            unit = 0x10000 + ((ctx->codepoint - 0xD800) << 10)
+                   + (unit - 0xDC00);
+
+            ctx->codepoint = 0x00;
+
+            return unit;
+        }
+
+        /*
+         * Not a low surrogate.  Drop the pending high surrogate and push
+         * the current unit back: its second byte is returned to the input
+         * and its first byte is saved in ctx->upper.
+         */
+
+        (*start)--;
+
+        ctx->upper = lead + 0x01;
+        ctx->codepoint = 0x00;
+
+        return NJS_UNICODE_ERROR;
+    }
+
+    /* Surrogate pair. */
+
+    if ((unsigned) (unit - 0xD800) <= (0xDFFF - 0xD800)) {
+        if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
+            /* A low surrogate may not start a sequence. */
+            return NJS_UNICODE_ERROR;
+        }
+
+        /* High surrogate: remember it and fetch the second unit. */
+
+        ctx->codepoint = unit;
+
+        if (*start >= end) {
+            return NJS_UNICODE_CONTINUE;
+        }
+
+        goto pair_state;
+    }
+
+    return unit;
+}
diff -r c39329b57a06 -r 63106bd2e9bf src/njs_utf16.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/njs_utf16.h Wed Jul 15 19:19:18 2020 +0300
@@ -0,0 +1,25 @@
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+#ifndef _NJS_UTF16_H_INCLUDED_
+#define _NJS_UTF16_H_INCLUDED_
+
+
+/*
+ * Writes "cp" as UTF-16 at *start and advances *start.  Returns the number
+ * of bytes written (2 or 4), or NJS_ERROR when the space before "end" is
+ * insufficient.
+ */
+NJS_EXPORT ssize_t njs_utf16_encode(uint32_t cp, u_char **start,
+    const u_char *end);
+/*
+ * Reads one code point from the UTF-16 bytes at *start and advances *start.
+ * Returns the code point, NJS_UNICODE_CONTINUE when the input ends before a
+ * complete code point, or NJS_UNICODE_ERROR for an invalid surrogate
+ * sequence.  "ctx" keeps the state between calls.
+ */
+NJS_EXPORT uint32_t njs_utf16_decode(njs_unicode_decode_t *ctx,
+    const u_char **start, const u_char *end);
+
+
+/*
+ * Puts "ctx" into the initial decoder state: no pending high surrogate and
+ * no saved first byte of a split 16-bit unit.
+ */
+njs_inline void
+njs_utf16_decode_init(njs_unicode_decode_t *ctx)
+{
+    ctx->codepoint = 0x00;
+    ctx->upper = 0x00;
+}
+
+
+#endif /* _NJS_UTF16_H_INCLUDED_ */
diff -r c39329b57a06 -r 63106bd2e9bf src/test/unicode_unit_test.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/unicode_unit_test.c Wed Jul 15 19:19:18 2020 +0300
@@ -0,0 +1,312 @@
+
+/*
+ * Copyright (C) Igor Sysoev
+ * Copyright (C) NGINX, Inc.
+ */
+
+
+#include <njs_main.h>
+
+
+#define NJS_UTF8_START_TEST 0xC2
+
+
+/*
+ * Malformed UTF-8 samples, five bytes per record: the number of bytes
+ * handed to njs_utf8_decode(), then four sequence bytes (unused trailing
+ * bytes are zero).  Every record is expected to decode to 0xFFFFFFFF.
+ */
+static u_char invalid[] = {
+
+    /* Invalid first byte less than 0xC2. */
+    1, 0x80, 0x00, 0x00, 0x00,
+    1, 0xC0, 0x00, 0x00, 0x00,
+    2, 0xC0, 0x00, 0x00, 0x00,
+    3, 0xC0, 0x00, 0x00, 0x00,
+    4, 0xC0, 0x00, 0x00, 0x00,
+
+    /* Encodes 0x110000, which is above the maximum code point. */
+    4, 0xF4, 0x90, 0x80, 0x80,
+
+    /* Truncated: only 2 bytes of a 3-byte sequence are supplied. */
+    2, 0xE0, 0xAF, 0xB5, 0x00,
+
+    /* Overlong values. */
+    2, 0xC0, 0x80, 0x00, 0x00,
+    2, 0xC1, 0xB3, 0x00, 0x00,
+    3, 0xE0, 0x80, 0x80, 0x00,
+    3, 0xE0, 0x81, 0xB3, 0x00,
+    3, 0xE0, 0x90, 0x9A, 0x00,
+    4, 0xF0, 0x80, 0x8A, 0x80,
+    4, 0xF0, 0x80, 0x81, 0xB3,
+    4, 0xF0, 0x80, 0xAF, 0xB5,
+};
+
+
+/*
+ * Decodes the first sequence of the "len" bytes at "overlong".  A sequence
+ * the decoder rejects (0xFFFFFFFF) is fine.  An accepted sequence must be
+ * byte-for-byte what njs_utf8_encode() produces for the decoded value;
+ * otherwise a diagnostic is printed and NJS_ERROR is returned.
+ */
+static njs_int_t
+utf8_overlong(u_char *overlong, size_t len)
+{
+    u_char        *last, canonical[4];
+    size_t        consumed, size;
+    uint32_t      raw, cp;
+    njs_uint_t    n;
+    const u_char  *pos;
+
+    pos = overlong;
+
+    cp = njs_utf8_decode(&pos, overlong + len);
+
+    consumed = pos - overlong;
+
+    if (cp == 0xFFFFFFFF) {
+        return NJS_OK;
+    }
+
+    last = njs_utf8_encode(canonical, cp);
+
+    size = (last == NULL) ? 0 : last - canonical;
+
+    if (consumed == size && memcmp(overlong, canonical, size) == 0) {
+        return NJS_OK;
+    }
+
+    /* Pack the consumed bytes into one number for the report. */
+
+    raw = 0;
+
+    for (n = 0; n < consumed; n++) {
+        raw = (raw << 8) + overlong[n];
+    }
+
+    njs_printf("njs_utf8_decode(%05uXD, %uz) failed: %05uXD, %uz\n",
+               raw, consumed, cp, size);
+
+    return NJS_ERROR;
+}
+
+
+/*
+ * UTF-8 checks:
+ *   - every value in 0..0x10FFFF survives njs_utf8_encode() followed by
+ *     njs_utf8_decode();
+ *   - every record of invalid[] decodes to 0xFFFFFFFF;
+ *   - for first bytes from "start" up to 0xFF, every sequence of 1 to 4
+ *     bytes passes utf8_overlong() (nothing is scanned when start is 256);
+ *   - njs_utf8_casecmp() reports the upper and lower case samples as equal.
+ *
+ * Returns NJS_OK, or NJS_ERROR after printing a message.
+ */
+static njs_int_t
+utf8_unit_test(njs_uint_t start)
+{
+    u_char        *last, seq[4];
+    size_t        len;
+    int32_t       rc;
+    uint32_t      cp, decoded;
+    njs_uint_t    rec, n, b0, b1, b2, b3;
+    const u_char  *pos;
+
+    njs_printf("utf8 test started\n");
+
+    /* Test valid UTF-8. */
+
+    for (cp = 0; cp < 0x110000; cp++) {
+
+        last = njs_utf8_encode(seq, cp);
+
+        if (last == NULL) {
+            njs_printf("njs_utf8_encode(%05uXD) failed\n", cp);
+            return NJS_ERROR;
+        }
+
+        pos = seq;
+
+        decoded = njs_utf8_decode(&pos, last);
+
+        if (decoded != cp) {
+            njs_printf("njs_utf8_decode(%05uXD) failed: %05uxD\n",
+                       cp, decoded);
+            return NJS_ERROR;
+        }
+    }
+
+    /* Test some invalid UTF-8. */
+
+    for (rec = 0; rec < sizeof(invalid); rec += 5) {
+
+        len = invalid[rec];
+
+        for (n = 0; n < 4; n++) {
+            seq[n] = invalid[rec + 1 + n];
+        }
+
+        pos = seq;
+
+        decoded = njs_utf8_decode(&pos, seq + len);
+
+        if (decoded == 0xFFFFFFFF) {
+            continue;
+        }
+
+        /* Pack the offered bytes into one number for the report. */
+
+        cp = 0;
+
+        for (n = 0; n < len; n++) {
+            cp = (cp << 8) + seq[n];
+        }
+
+        njs_printf("njs_utf8_decode(%05uXD, %uz) failed: %05uXD\n",
+                   cp, len, decoded);
+        return NJS_ERROR;
+    }
+
+    /* Test all overlong UTF-8. */
+
+    for (b0 = start; b0 < 256; b0++) {
+        seq[0] = b0;
+
+        if (utf8_overlong(seq, 1) != NJS_OK) {
+            return NJS_ERROR;
+        }
+
+        for (b1 = 0; b1 < 256; b1++) {
+            seq[1] = b1;
+
+            if (utf8_overlong(seq, 2) != NJS_OK) {
+                return NJS_ERROR;
+            }
+
+            for (b2 = 0; b2 < 256; b2++) {
+                seq[2] = b2;
+
+                if (utf8_overlong(seq, 3) != NJS_OK) {
+                    return NJS_ERROR;
+                }
+
+                for (b3 = 0; b3 < 256; b3++) {
+                    seq[3] = b3;
+
+                    if (utf8_overlong(seq, 4) != NJS_OK) {
+                        return NJS_ERROR;
+                    }
+                }
+            }
+        }
+    }
+
+    rc = njs_utf8_casecmp((u_char *) "ABC АБВ ΑΒΓ",
+                          (u_char *) "abc абв αβγ",
+                          njs_length("ABC АБВ ΑΒΓ"),
+                          njs_length("abc абв αβγ"));
+
+    if (rc != 0) {
+        njs_printf("njs_utf8_casecmp() failed\n");
+        return NJS_ERROR;
+    }
+
+    njs_printf("utf8 test passed\n");
+    return NJS_OK;
+}
+
+
+/*
+ * UTF-16 checks:
+ *   - every non-surrogate code point survives njs_utf16_encode() ->
+ *     njs_utf16_decode() -> njs_utf16_encode() with identical bytes;
+ *   - a high surrogate followed by a low surrogate decodes to the
+ *     combined code point;
+ *   - a sequence that starts with a low surrogate is reported as
+ *     NJS_UNICODE_ERROR.
+ *
+ * Returns NJS_OK, or NJS_ERROR after printing a message.
+ */
+static njs_int_t
+utf16_unit_test(void)
+{
+    int8_t                length, length_to;
+    u_char                *start, *end, *end_to;
+    uint32_t              cp, i, expected;
+    const u_char          *pos;
+    njs_unicode_decode_t  ctx;
+    u_char                buf[8], to[4];
+
+    njs_printf("utf16 test started\n");
+
+    end = buf + sizeof(buf);
+    end_to = to + sizeof(to);
+
+    for (i = 0; i <= NJS_UNICODE_MAX_CODEPOINT; i++) {
+
+        /* Skip surrogate pair. */
+
+        if (i >= 0xD800 && i <= 0xDFFF) {
+            continue;
+        }
+
+        start = buf;
+
+        length = njs_utf16_encode(i, &start, end);
+        if (length < NJS_OK) {
+            njs_printf("utf16 test encode failed\n");
+            return NJS_ERROR;
+        }
+
+        njs_utf16_decode_init(&ctx);
+
+        pos = buf;
+
+        cp = njs_utf16_decode(&ctx, &pos, buf + length);
+        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+            njs_printf("utf16 test decode failed\n");
+            return NJS_ERROR;
+        }
+
+        if (cp != i) {
+            njs_printf("utf16 test decode code point does not match\n");
+            return NJS_ERROR;
+        }
+
+        start = to;
+
+        length_to = njs_utf16_encode(cp, &start, end_to);
+        if (length_to < NJS_OK) {
+            njs_printf("utf16 test encode failed\n");
+            return NJS_ERROR;
+        }
+
+        /*
+         * UTF-16 data routinely contains 0x00 bytes, so the buffers are
+         * compared with memcmp(); a string comparison would stop at the
+         * first NUL and leave the remaining bytes unchecked.
+         */
+
+        if (length_to != length || memcmp(buf, to, length) != 0) {
+            njs_printf("utf16 test decode-encode failed\n");
+            return NJS_ERROR;
+        }
+    }
+
+    /* Surrogate pair. */
+
+    for (i = 0xD800; i <= 0xDFFF; i++) {
+        start = buf;
+
+        length = njs_utf16_encode(i, &start, end);
+        if (length < NJS_OK) {
+            njs_printf("utf16 test surrogate pair encode lead failed\n");
+            return NJS_ERROR;
+        }
+
+        length_to = njs_utf16_encode(i - 0xD800 + 0xDC00, &start, end);
+        if (length_to < NJS_OK) {
+            njs_printf("utf16 test surrogate pair encode failed\n");
+            return NJS_ERROR;
+        }
+
+        njs_utf16_decode_init(&ctx);
+
+        pos = buf;
+
+        cp = njs_utf16_decode(&ctx, &pos, buf + length + length_to);
+
+        if (i >= 0xDC00) {
+            /* The first unit is a low surrogate: it must be rejected. */
+
+            if (cp != NJS_UNICODE_ERROR) {
+                njs_printf("utf16 test surrogate pair decode failed\n");
+                return NJS_ERROR;
+            }
+
+            continue;
+        }
+
+        /* High surrogate "i" paired with low surrogate "i + 0x400". */
+
+        expected = 0x10000 + ((i - 0xD800) << 10) + (i - 0xD800);
+
+        if (cp != expected) {
+            njs_printf("utf16 test surrogate pair decode failed\n");
+            return NJS_ERROR;
+        }
+    }
+
+    njs_printf("utf16 test passed\n");
+
+    return NJS_OK;
+}
+
+
+/*
+ * Runs the UTF-8 tests and then the UTF-16 tests.  The exhaustive UTF-8
+ * sequence scan starts at first byte NJS_UTF8_START_TEST only when the
+ * first command line argument begins with 'a'; otherwise "start" is 256
+ * and that scan is skipped.
+ *
+ * Returns 0 on success or the status of the test that failed.
+ */
+int
+main(int argc, char **argv)
+{
+    njs_int_t   ret;
+    njs_uint_t  start;
+
+    njs_printf("unicode unit test started\n");
+
+    start = 256;
+
+    if (argc > 1 && argv[1][0] == 'a') {
+        start = NJS_UTF8_START_TEST;
+    }
+
+    ret = utf8_unit_test(start);
+
+    if (ret == NJS_OK) {
+        ret = utf16_unit_test();
+    }
+
+    if (ret != NJS_OK) {
+        return ret;
+    }
+
+    njs_printf("unicode unit test passed\n");
+
+    return 0;
+}
diff -r c39329b57a06 -r 63106bd2e9bf src/test/utf8_unit_test.c
--- a/src/test/utf8_unit_test.c Wed Jul 15 15:34:16 2020 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,202 +0,0 @@
-
-/*
- * Copyright (C) Igor Sysoev
- * Copyright (C) NGINX, Inc.
- */
-
-
-#include <njs_main.h>
-
-
-#define NJS_UTF8_START_TEST 0xC2
-//#define NJS_UTF8_START_TEST 0
-
-
-static u_char invalid[] = {
-
- /* Invalid first byte less than 0xC2. */
- 1, 0x80, 0x00, 0x00, 0x00,
- 1, 0xC0, 0x00, 0x00, 0x00,
- 2, 0xC0, 0x00, 0x00, 0x00,
- 3, 0xC0, 0x00, 0x00, 0x00,
- 4, 0xC0, 0x00, 0x00, 0x00,
-
- /* Invalid 0x0x110000 value. */
- 4, 0xF4, 0x90, 0x80, 0x80,
-
- /* Incomplete length. */
- 2, 0xE0, 0xAF, 0xB5, 0x00,
-
- /* Overlong values. */
- 2, 0xC0, 0x80, 0x00, 0x00,
- 2, 0xC1, 0xB3, 0x00, 0x00,
- 3, 0xE0, 0x80, 0x80, 0x00,
- 3, 0xE0, 0x81, 0xB3, 0x00,
- 3, 0xE0, 0x90, 0x9A, 0x00,
- 4, 0xF0, 0x80, 0x8A, 0x80,
- 4, 0xF0, 0x80, 0x81, 0xB3,
- 4, 0xF0, 0x80, 0xAF, 0xB5,
-};
-
-
-static njs_int_t
-utf8_overlong(u_char *overlong, size_t len)
-{
- u_char *p, utf8[4];
- size_t size;
- uint32_t u, d;
- njs_uint_t i;
- const u_char *pp;
-
- pp = overlong;
-
- d = njs_utf8_decode(&pp, overlong + len);
-
- len = pp - overlong;
-
- if (d != 0xFFFFFFFF) {
- p = njs_utf8_encode(utf8, d);
-
- size = (p != NULL) ? p - utf8 : 0;
-
- if (len != size || memcmp(overlong, utf8, size) != 0) {
-
- u = 0;
- for (i = 0; i < len; i++) {
- u = (u << 8) + overlong[i];
- }
-
- njs_printf("njs_utf8_decode(%05uXD, %uz) failed: %05uXD, %uz\n",
- u, len, d, size);
-
- return NJS_ERROR;
- }
- }
-
- return NJS_OK;
-}
-
-
-static njs_int_t
-utf8_unit_test(njs_uint_t start)
-{
- u_char *p, utf8[4];
- size_t len;
- int32_t n;
- uint32_t u, d;
- njs_uint_t i, k, l, m;
- const u_char *pp;
-
- njs_printf("utf8 unit test started\n");
-
- /* Test valid UTF-8. */
-
- for (u = 0; u < 0x110000; u++) {
-
- p = njs_utf8_encode(utf8, u);
-
- if (p == NULL) {
- njs_printf("njs_utf8_encode(%05uXD) failed\n", u);
- return NJS_ERROR;
- }
-
- pp = utf8;
-
- d = njs_utf8_decode(&pp, p);
-
- if (u != d) {
- njs_printf("njs_utf8_decode(%05uXD) failed: %05uxD\n", u, d);
- return NJS_ERROR;
- }
- }
-
- /* Test some invalid UTF-8. */
-
- for (i = 0; i < sizeof(invalid); i += 5) {
-
- len = invalid[i];
- utf8[0] = invalid[i + 1];
- utf8[1] = invalid[i + 2];
- utf8[2] = invalid[i + 3];
- utf8[3] = invalid[i + 4];
-
- pp = utf8;
-
- d = njs_utf8_decode(&pp, utf8 + len);
-
- if (d != 0xFFFFFFFF) {
-
- u = 0;
- for (i = 0; i < len; i++) {
- u = (u << 8) + utf8[i];
- }
-
- njs_printf("njs_utf8_decode(%05uXD, %uz) failed: %05uXD\n",
- u, len, d);
- return NJS_ERROR;
- }
- }
-
- /* Test all overlong UTF-8. */
-
- for (i = start; i < 256; i++) {
- utf8[0] = i;
-
- if (utf8_overlong(utf8, 1) != NJS_OK) {
- return NJS_ERROR;
- }
-
- for (k = 0; k < 256; k++) {
- utf8[1] = k;
-
- if (utf8_overlong(utf8, 2) != NJS_OK) {
- return NJS_ERROR;
- }
-
- for (l = 0; l < 256; l++) {
- utf8[2] = l;
-
- if (utf8_overlong(utf8, 3) != NJS_OK) {
- return NJS_ERROR;
- }
-
- for (m = 0; m < 256; m++) {
- utf8[3] = m;
-
- if (utf8_overlong(utf8, 4) != NJS_OK) {
- return NJS_ERROR;
- }
- }
- }
- }
- }
-
- n = njs_utf8_casecmp((u_char *) "ABC АБВ ΑΒΓ",
- (u_char *) "abc абв αβγ",
- njs_length("ABC АБВ ΑΒΓ"),
- njs_length("abc абв αβγ"));
-
- if (n != 0) {
- njs_printf("njs_utf8_casecmp() failed\n");
- return NJS_ERROR;
- }
-
- njs_printf("utf8 unit test passed\n");
- return NJS_OK;
-}
-
-
-int
-main(int argc, char **argv)
-{
- njs_uint_t start;
-
- if (argc > 1 && argv[1][0] == 'a') {
- start = NJS_UTF8_START_TEST;
-
- } else {
- start = 256;
- }
-
- return utf8_unit_test(start);
-}
More information about the nginx-devel
mailing list