[njs] Added UTF8 validation for string literals.
Alexander Borisov
alexander.borisov at nginx.com
Wed Jul 10 16:56:20 UTC 2019
details: https://hg.nginx.org/njs/rev/f1a70d67646d
branches:
changeset: 1037:f1a70d67646d
user: Alexander Borisov <alexander.borisov at nginx.com>
date: Wed Jul 10 14:20:53 2019 +0300
description:
Added UTF8 validation for string literals.
All bad UTF-8 characters are replaced by '\uFFFD'
(REPLACEMENT CHARACTER).
diffstat:
njs/njs_parser_terminal.c | 60 +++++++++++++--------
njs/test/njs_unit_test.c | 81 +++++++++++++++++++++++++++++
nxt/nxt_utf8.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++
nxt/nxt_utf8.h | 6 ++
4 files changed, 250 insertions(+), 22 deletions(-)
diffs (362 lines):
diff -r b946c1073968 -r f1a70d67646d njs/njs_parser_terminal.c
--- a/njs/njs_parser_terminal.c Mon Jul 08 17:51:58 2019 +0300
+++ b/njs/njs_parser_terminal.c Wed Jul 10 14:20:53 2019 +0300
@@ -907,31 +907,35 @@ done:
nxt_int_t
njs_parser_string_create(njs_vm_t *vm, njs_value_t *value)
{
- u_char *p;
- ssize_t length;
- nxt_str_t *src;
+ u_char *dst;
+ ssize_t size, length;
+ uint32_t cp;
+ nxt_str_t *src;
+ const u_char *p, *end;
src = njs_parser_text(vm->parser);
- length = nxt_utf8_length(src->start, src->length);
+ length = nxt_utf8_safe_length(src->start, src->length, &size);
- if (nxt_slow_path(length < 0)) {
- length = 0;
+ dst = njs_string_alloc(vm, value, size, length);
+ if (nxt_slow_path(dst == NULL)) {
+ return NXT_ERROR;
}
- p = njs_string_alloc(vm, value, src->length, length);
-
- if (nxt_fast_path(p != NULL)) {
- memcpy(p, src->start, src->length);
+ p = src->start;
+ end = src->start + src->length;
- if (length > NJS_STRING_MAP_STRIDE && (size_t) length != src->length) {
- njs_string_offset_map_init(p, src->length);
- }
+ while (p < end) {
+ cp = nxt_utf8_safe_decode(&p, end);
- return NXT_OK;
+ dst = nxt_utf8_encode(dst, cp);
}
- return NXT_ERROR;
+ if (size > NJS_STRING_MAP_STRIDE && size != length) {
+ njs_string_offset_map_init(value->long_string.data->start, size);
+ }
+
+ return NXT_OK;
}
@@ -1042,11 +1046,27 @@ njs_parser_escape_string_create(njs_vm_t
continue;
default:
+ if (c >= 0x80) {
+ src--;
+ goto utf8_copy;
+ }
+
break;
}
}
- *dst++ = c;
+ if (c < 0x80) {
+ *dst++ = c;
+
+ continue;
+ }
+
+ utf8_copy:
+
+ src--;
+
+ cp = nxt_utf8_safe_decode2(&src, end);
+ dst = nxt_utf8_encode(dst, cp);
continue;
@@ -1166,13 +1186,9 @@ njs_parser_escape_string_calc_length(njs
}
if (*src >= 0x80) {
- ptr = src;
+ cp = nxt_utf8_safe_decode2(&src, end);
- if (nxt_slow_path(nxt_utf8_decode(&src, end) == 0xffffffff)) {
- goto invalid;
- }
-
- size += src - ptr;
+ size += nxt_utf8_size(cp);
length++;
continue;
diff -r b946c1073968 -r f1a70d67646d njs/test/njs_unit_test.c
--- a/njs/test/njs_unit_test.c Mon Jul 08 17:51:58 2019 +0300
+++ b/njs/test/njs_unit_test.c Wed Jul 10 14:20:53 2019 +0300
@@ -4367,6 +4367,48 @@ static njs_unit_test_t njs_test[] =
{ nxt_string("var a = '123'\n[2].toString();a"),
nxt_string("3") },
+ { nxt_string("'\xE5\x96\x9C\xE3\x81\xB6'"),
+ nxt_string("喜ぶ") },
+
+ /* Broken UTF-8 literals. */
+
+ { nxt_string("'\x96\xE5\x9C\xE3\x81\xB6'"),
+ nxt_string("��ぶ") },
+
+ { nxt_string("'\x96\xE5\x9C'"),
+ nxt_string("��") },
+
+ { nxt_string("'\x96\xE5'"),
+ nxt_string("��") },
+
+ { nxt_string("'\x96'"),
+ nxt_string("�") },
+
+ { nxt_string("'\xF3'"),
+ nxt_string("�") },
+
+ { nxt_string("'\xF3\xFF'"),
+ nxt_string("��") },
+
+ { nxt_string("'\x96\x96\xE5\x9C\xE3\x81\xB6'"),
+ nxt_string("���ぶ") },
+
+ { nxt_string("'\x9C\x96\xE5\xE3\x81\xB6'"),
+ nxt_string("���ぶ") },
+
+ { nxt_string("'\xE5\x9C\xE3\x81\xB6'"),
+ nxt_string("�ぶ") },
+
+ { nxt_string("'\xEF\xBF\xBD\xE3\x81\xB6'"),
+ nxt_string("�ぶ") },
+
+ { nxt_string("'\xE5\xF6\x9C\xE3\x81\xB6'"),
+ nxt_string("���ぶ") },
+
+ { nxt_string("var a = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\xF3'; "
+ "[a.length, a[33], a[34]]"),
+ nxt_string("35,a,�") },
+
/* Escape strings. */
{ nxt_string("'\\a \\' \\\" \\\\ \\0 \\b \\f \\n \\r \\t \\v'"),
@@ -4495,6 +4537,45 @@ static njs_unit_test_t njs_test[] =
{ nxt_string("'\\u{D800}\\u{'"),
nxt_string("SyntaxError: Invalid Unicode code point \"\\u{D800}\\u{\" in 1") },
+ /* Broken UTF-8 literals. */
+
+ { nxt_string("'\\a\x96\xE5\x9C\xE3\x81\xB6'"),
+ nxt_string("a��ぶ") },
+
+ { nxt_string("'\x96\\a\xE5\x9C'"),
+ nxt_string("�a�") },
+
+ { nxt_string("'\x96\xE5\\a'"),
+ nxt_string("��a") },
+
+ { nxt_string("'\\a\x96\\a'"),
+ nxt_string("a�a") },
+
+ { nxt_string("'\xF3\\a'"),
+ nxt_string("�a") },
+
+ { nxt_string("'\xF3\\a\xFF'"),
+ nxt_string("�a�") },
+
+ { nxt_string("'\\a\x96\x96\xE5\x9C\xE3\x81\xB6'"),
+ nxt_string("a���ぶ") },
+
+ { nxt_string("'\\a\x9C\x96\xE5\xE3\x81\xB6'"),
+ nxt_string("a���ぶ") },
+
+ { nxt_string("'\\a\xE5\x9C\xE3\x81\xB6'"),
+ nxt_string("a�ぶ") },
+
+ { nxt_string("'\\a\xEF\xBF\xBD\xE3\x81\xB6'"),
+ nxt_string("a�ぶ") },
+
+ { nxt_string("'\\a\xE5\xF6\x9C\xE3\x81\xB6'"),
+ nxt_string("a���ぶ") },
+
+ { nxt_string("var a = '\\aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\xF3'; "
+ "[a.length, a[34], a[35]]"),
+ nxt_string("36,a,�") },
+
{ nxt_string("''.hasOwnProperty('length')"),
nxt_string("true") },
diff -r b946c1073968 -r f1a70d67646d nxt/nxt_utf8.c
--- a/nxt/nxt_utf8.c Mon Jul 08 17:51:58 2019 +0300
+++ b/nxt/nxt_utf8.c Wed Jul 10 14:20:53 2019 +0300
@@ -163,6 +163,103 @@ nxt_utf8_decode2(const u_char **start, c
}
+uint32_t
+nxt_utf8_safe_decode(const u_char **start, const u_char *end)
+{
+ uint32_t u;
+
+ u = (uint32_t) **start;
+
+ if (u < 0x80) {
+ (*start)++;
+ return u;
+ }
+
+ return nxt_utf8_safe_decode2(start, end);
+}
+
+
+uint32_t
+nxt_utf8_safe_decode2(const u_char **start, const u_char *end)
+{
+ u_char c;
+ size_t n;
+ uint32_t u, overlong;
+ const u_char *p;
+
+ p = *start;
+ u = (uint32_t) *p;
+
+ if (u >= 0xE0) {
+
+ if (u >= 0xF0) {
+
+ if (nxt_slow_path(u > 0xF4)) {
+ /*
+ * The maximum valid Unicode character is 0x10FFFF
+ * which is encoded as 0xF4 0x8F 0xBF 0xBF.
+ */
+ goto fail_one;
+ }
+
+ u &= 0x07;
+ overlong = 0x00FFFF;
+ n = 3;
+
+ } else {
+ u &= 0x0F;
+ overlong = 0x07FF;
+ n = 2;
+ }
+
+ } else if (u >= 0xC2) {
+
+ /* 0x80 is encoded as 0xC2 0x80. */
+
+ u &= 0x1F;
+ overlong = 0x007F;
+ n = 1;
+
+ } else {
+ /* u < 0xC2: stray continuation byte or overlong lead 0xC0/0xC1. */
+ goto fail_one;
+ }
+
+ p++;
+
+ while (p < end && n != 0) {
+ c = *p++;
+ /*
+ * The byte must be in the 0x80 - 0xBF range.
+ * Values below 0x80 become >= 0x80.
+ */
+ c = c - 0x80;
+
+ if (nxt_slow_path(c > 0x3F)) {
+ *start = --p;
+ return NXT_UTF8_REPLACEMENT;
+ }
+
+ u = (u << 6) | c;
+ n--;
+ }
+
+ *start = p;
+
+ if (n == 0 && overlong < u && u < 0x110000) {
+ return u;
+ }
+
+ return NXT_UTF8_REPLACEMENT;
+
+fail_one:
+
+ (*start)++;
+
+ return NXT_UTF8_REPLACEMENT;
+}
+
+
/*
* nxt_utf8_casecmp() tests only up to the minimum of given lengths, but
* requires lengths of both strings because otherwise nxt_utf8_decode2()
@@ -279,6 +376,34 @@ nxt_utf8_length(const u_char *p, size_t
}
+ssize_t
+nxt_utf8_safe_length(const u_char *p, size_t len, ssize_t *out_size)
+{
+ ssize_t size, length;
+ uint32_t codepoint;
+ const u_char *end;
+
+ size = 0;
+ length = 0;
+
+ end = p + len;
+
+ while (p < end) {
+ codepoint = nxt_utf8_safe_decode(&p, end);
+
+ size += nxt_utf8_size(codepoint);
+
+ length++;
+ }
+
+ if (out_size != NULL) {
+ *out_size = size;
+ }
+
+ return length;
+}
+
+
nxt_bool_t
nxt_utf8_is_valid(const u_char *p, size_t len)
{
diff -r b946c1073968 -r f1a70d67646d nxt/nxt_utf8.h
--- a/nxt/nxt_utf8.h Mon Jul 08 17:51:58 2019 +0300
+++ b/nxt/nxt_utf8.h Wed Jul 10 14:20:53 2019 +0300
@@ -21,6 +21,10 @@
NXT_EXPORT u_char *nxt_utf8_encode(u_char *p, uint32_t u);
NXT_EXPORT uint32_t nxt_utf8_decode(const u_char **start, const u_char *end);
NXT_EXPORT uint32_t nxt_utf8_decode2(const u_char **start, const u_char *end);
+NXT_EXPORT uint32_t nxt_utf8_safe_decode(const u_char **start,
+ const u_char *end);
+NXT_EXPORT uint32_t nxt_utf8_safe_decode2(const u_char **start,
+ const u_char *end);
NXT_EXPORT nxt_int_t nxt_utf8_casecmp(const u_char *start1,
const u_char *start2, size_t len1, size_t len2);
NXT_EXPORT uint32_t nxt_utf8_lower_case(const u_char **start,
@@ -28,6 +32,8 @@ NXT_EXPORT uint32_t nxt_utf8_lower_case(
NXT_EXPORT uint32_t nxt_utf8_upper_case(const u_char **start,
const u_char *end);
NXT_EXPORT ssize_t nxt_utf8_length(const u_char *p, size_t len);
+NXT_EXPORT ssize_t nxt_utf8_safe_length(const u_char *p, size_t len,
+ ssize_t *out_size);
NXT_EXPORT nxt_bool_t nxt_utf8_is_valid(const u_char *p, size_t len);
More information about the nginx-devel
mailing list