[njs] Fixed length calculation for UTF-8 string with escape characters.
Alexander Borisov
alexander.borisov at nginx.com
Tue Apr 23 14:47:26 UTC 2019
details: https://hg.nginx.org/njs/rev/b3eb60707479
branches:
changeset: 920:b3eb60707479
user: Alexander Borisov <alexander.borisov at nginx.com>
date: Mon Apr 22 16:23:43 2019 +0300
description:
Fixed length calculation for UTF-8 string with escape characters.
This closes issue #133 on GitHub.
diffstat:
njs/njs_parser_terminal.c | 329 +++++++++++++++++++++++++++++----------------
njs/test/njs_unit_test.c | 6 +
2 files changed, 216 insertions(+), 119 deletions(-)
diffs (413 lines):
diff -r ad1be10fba80 -r b3eb60707479 njs/njs_parser_terminal.c
--- a/njs/njs_parser_terminal.c Tue Apr 23 15:31:40 2019 +0300
+++ b/njs/njs_parser_terminal.c Mon Apr 22 16:23:43 2019 +0300
@@ -28,6 +28,8 @@ static nxt_int_t njs_parser_template_exp
njs_parser_t *parser);
static nxt_int_t njs_parser_template_string(njs_vm_t *vm,
njs_parser_t *parser);
+static njs_ret_t njs_parser_escape_string_calc_length(njs_vm_t *vm,
+ njs_parser_t *parser, size_t *out_size, size_t *out_length);
static njs_token_t njs_parser_escape_string_create(njs_vm_t *vm,
njs_parser_t *parser, njs_value_t *value);
@@ -923,176 +925,265 @@ njs_parser_escape_string_create(njs_vm_t
njs_value_t *value)
{
u_char c, *start, *dst;
- size_t size,length, hex_length;
- uint64_t u;
+ size_t size, length, hex_length;
+ uint64_t cp;
+ njs_ret_t ret;
nxt_str_t *string;
- const u_char *p, *src, *end, *hex_end;
+ const u_char *src, *end, *hex_end;
- start = NULL;
- dst = NULL;
+ ret = njs_parser_escape_string_calc_length(vm, parser, &size, &length);
+ if (nxt_slow_path(ret != NXT_OK)) {
+ return NJS_TOKEN_ILLEGAL;
+ }
+
+ start = njs_string_alloc(vm, value, size, length);
+ if (nxt_slow_path(start == NULL)) {
+ return NJS_TOKEN_ERROR;
+ }
- for ( ;; ) {
- /*
- * The loop runs twice: at the first step string size and
- * UTF-8 length are evaluated. Then the string is allocated
- * and at the second step string content is copied.
- */
- size = 0;
- length = 0;
+ dst = start;
+
+ string = njs_parser_text(parser);
+ src = string->start;
+ end = src + string->length;
- string = njs_parser_text(parser);
- src = string->start;
- end = src + string->length;
+ while (src < end) {
+ c = *src++;
- while (src < end) {
+ if (c == '\\') {
+ /*
+ * Testing "src == end" is not required here
+ * since this has been already tested by lexer.
+ */
+
c = *src++;
- if (c == '\\') {
+ switch (c) {
+ case 'u':
/*
- * Testing "src == end" is not required here
- * since this has been already tested by lexer.
+ * A character after "u" can be safely tested here
+ * because there is always a closing quote at the
+ * end of string: ...\u".
*/
- c = *src++;
+
+ if (*src != '{') {
+ hex_length = 4;
+ goto hex_length;
+ }
+
+ src++;
+ hex_length = 0;
+ hex_end = end;
- switch (c) {
+ goto hex;
+
+ case 'x':
+ hex_length = 2;
+ goto hex_length;
+
+ case '0':
+ c = '\0';
+ break;
- case 'u':
- hex_length = 4;
- /*
- * A character after "u" can be safely tested here
- * because there is always a closing quote at the
- * end of string: ...\u".
- */
- if (*src != '{') {
- goto hex_length_test;
- }
+ case 'b':
+ c = '\b';
+ break;
+
+ case 'f':
+ c = '\f';
+ break;
+
+ case 'n':
+ c = '\n';
+ break;
+
+ case 'r':
+ c = '\r';
+ break;
+ case 't':
+ c = '\t';
+ break;
+
+ case 'v':
+ c = '\v';
+ break;
+
+ case '\r':
+ /*
+ * A character after "\r" can be safely tested here
+ * because there is always a closing quote at the
+ * end of string: ...\\r".
+ */
+
+ if (*src == '\n') {
src++;
- hex_length = 0;
- hex_end = end;
+ }
- goto hex;
+ continue;
- case 'x':
- hex_length = 2;
- goto hex_length_test;
+ case '\n':
+ continue;
- case '0':
- c = '\0';
- break;
+ default:
+ break;
+ }
+ }
- case 'b':
- c = '\b';
- break;
+ *dst++ = c;
- case 'f':
- c = '\f';
- break;
+ continue;
+
+ hex_length:
- case 'n':
- c = '\n';
- break;
+ hex_end = src + hex_length;
+
+ hex:
+ cp = njs_number_hex_parse(&src, hex_end);
- case 'r':
- c = '\r';
- break;
+ dst = nxt_utf8_encode(dst, (uint32_t) cp);
+ if (nxt_slow_path(dst == NULL)) {
+ njs_parser_syntax_error(vm, parser,
+ "Invalid Unicode code point \"%V\"",
+ njs_parser_text(parser));
+
+ return NJS_TOKEN_ILLEGAL;
+ }
- case 't':
- c = '\t';
- break;
+ /* Skip '}' character */
+
+ if (hex_length == 0) {
+ src++;
+ }
+ }
+
+ if (length > NJS_STRING_MAP_STRIDE && length != size) {
+ njs_string_offset_map_init(start, size);
+ }
- case 'v':
- c = '\v';
- break;
+ return NJS_TOKEN_STRING;
+}
+
- case '\r':
- /*
- * A character after "\r" can be safely tested here
- * because there is always a closing quote at the
- * end of string: ...\\r".
- */
- if (*src == '\n') {
- src++;
- }
+static njs_ret_t
+njs_parser_escape_string_calc_length(njs_vm_t *vm, njs_parser_t *parser,
+ size_t *out_size, size_t *out_length)
+{
+ size_t size, length, hex_length;
+ uint64_t cp;
+ nxt_str_t *string;
+ const u_char *ptr, *src, *end, *hex_end;
+
+ size = 0;
+ length = 0;
+
+ string = njs_parser_text(parser);
+ src = string->start;
+ end = src + string->length;
+
+ while (src < end) {
- continue;
+ if (*src == '\\') {
+ src++;
- case '\n':
- continue;
+ switch (*src) {
+ case 'u':
+ src++;
- default:
- break;
+ if (*src != '{') {
+ hex_length = 4;
+ goto hex_length;
}
- }
+
+ src++;
+ hex_length = 0;
+ hex_end = end;
+
+ goto hex;
+
+ case 'x':
+ src++;
+ hex_length = 2;
+ goto hex_length;
- size++;
- length++;
+ case '\r':
+ src++;
- if (dst != NULL) {
- *dst++ = c;
- }
+ if (*src == '\n') {
+ src++;
+ }
+
+ continue;
- continue;
-
- hex_length_test:
+ case '\n':
+ src++;
+ continue;
- hex_end = src + hex_length;
+ default:
+ break;
+ }
+ }
- if (hex_end > end) {
+ if (*src >= 0x80) {
+ ptr = src;
+
+ if (nxt_slow_path(nxt_utf8_decode(&src, end) == 0xffffffff)) {
goto invalid;
}
- hex:
+ size += src - ptr;
+ length++;
+
+ continue;
+ }
- p = src;
- u = njs_number_hex_parse(&src, hex_end);
+ src++;
+ size++;
+ length++;
+
+ continue;
+
+ hex_length:
- if (hex_length != 0) {
- if (src != hex_end) {
- goto invalid;
- }
+ hex_end = src + hex_length;
+
+ if (nxt_slow_path(hex_end > end)) {
+ goto invalid;
+ }
- } else {
- if (src == p || (src - p) > 6) {
- goto invalid;
- }
+ hex:
- if (src == end || *src++ != '}') {
- goto invalid;
- }
+ ptr = src;
+ cp = njs_number_hex_parse(&src, hex_end);
+
+ if (hex_length != 0) {
+ if (src != hex_end) {
+ goto invalid;
}
- size += nxt_utf8_size(u);
- length++;
+ } else {
+ if (src == ptr || (src - ptr) > 6) {
+ goto invalid;
+ }
- if (dst != NULL) {
- dst = nxt_utf8_encode(dst, (uint32_t) u);
- if (dst == NULL) {
- goto invalid;
- }
+ if (src == end || *src++ != '}') {
+ goto invalid;
}
}
- if (start != NULL) {
- if (length > NJS_STRING_MAP_STRIDE && length != size) {
- njs_string_offset_map_init(start, size);
- }
-
- return NJS_TOKEN_STRING;
- }
+ size += nxt_utf8_size(cp);
+ length++;
+ }
- start = njs_string_alloc(vm, value, size, length);
- if (nxt_slow_path(start == NULL)) {
- return NJS_TOKEN_ERROR;
- }
+ *out_size = size;
+ *out_length = length;
- dst = start;
- }
+ return NXT_OK;
invalid:
njs_parser_syntax_error(vm, parser, "Invalid Unicode code point \"%V\"",
njs_parser_text(parser));
- return NJS_TOKEN_ILLEGAL;
+ return NJS_ERROR;
}
diff -r ad1be10fba80 -r b3eb60707479 njs/test/njs_unit_test.c
--- a/njs/test/njs_unit_test.c Tue Apr 23 15:31:40 2019 +0300
+++ b/njs/test/njs_unit_test.c Mon Apr 22 16:23:43 2019 +0300
@@ -4381,6 +4381,12 @@ static njs_unit_test_t njs_test[] =
{ nxt_string("'abc'.length"),
nxt_string("3") },
+ { nxt_string("'привет\\n'.length"),
+ nxt_string("7") },
+
+ { nxt_string("'привет\\n\\u{61}\\u{3B1}\\u{20AC}'.length"),
+ nxt_string("10") },
+
{ nxt_string("''.hasOwnProperty('length')"),
nxt_string("true") },
More information about the nginx-devel
mailing list