[njs] Parser: properly handling unicode space characters.
Dmitry Volyntsev
xeioex at nginx.com
Fri Sep 16 00:41:11 UTC 2022
details: https://hg.nginx.org/njs/rev/46d505a902bb
branches:
changeset: 1953:46d505a902bb
user: Dmitry Volyntsev <xeioex at nginx.com>
date: Wed Sep 14 22:14:50 2022 -0700
description:
Parser: properly handling unicode space characters.
diffstat:
src/njs_lexer.c | 35 +++++++++++++++++++++++++++++------
src/njs_str.h | 1 -
src/test/njs_unit_test.c | 5 +++++
3 files changed, 34 insertions(+), 7 deletions(-)
diffs (83 lines):
diff -r 05efe34376ab -r 46d505a902bb src/njs_lexer.c
--- a/src/njs_lexer.c Tue Sep 13 21:13:17 2022 -0700
+++ b/src/njs_lexer.c Wed Sep 14 22:14:50 2022 -0700
@@ -45,8 +45,8 @@ static const uint8_t njs_tokens[256] n
NJS_TOKEN_ILLEGAL, NJS_TOKEN_ILLEGAL,
NJS_TOKEN_ILLEGAL, NJS_TOKEN_ILLEGAL,
/* \t */ NJS_TOKEN_ILLEGAL, NJS_TOKEN_SPACE,
- /* \n */ NJS_TOKEN_LINE_END, NJS_TOKEN_ILLEGAL,
- /* \r */ NJS_TOKEN_ILLEGAL, NJS_TOKEN_SPACE,
+ /* \n */ NJS_TOKEN_LINE_END, NJS_TOKEN_SPACE,
+ /* \r */ NJS_TOKEN_SPACE, NJS_TOKEN_SPACE,
NJS_TOKEN_ILLEGAL, NJS_TOKEN_ILLEGAL,
/* 0x10 */ NJS_TOKEN_ILLEGAL, NJS_TOKEN_ILLEGAL,
@@ -437,15 +437,38 @@ njs_lexer_consume_token(njs_lexer_t *lex
njs_int_t
njs_lexer_make_token(njs_lexer_t *lexer, njs_lexer_token_t *token)
{
- u_char c, *p;
+ u_char c, *p;
+ uint32_t cp;
+ njs_unicode_decode_t ctx;
c = ' ';
+ njs_utf8_decode_init(&ctx);
+
while (lexer->start < lexer->end) {
- c = *lexer->start++;
+ c = *lexer->start;
+
+ if (njs_fast_path(!(c & 0x80))) {
+ lexer->start++;
+
+ if (njs_tokens[c] != NJS_TOKEN_SPACE) {
+ break;
+ }
- if (njs_tokens[c] != NJS_TOKEN_SPACE) {
- break;
+ } else {
+
+ /* Unicode. */
+
+ cp = njs_utf8_decode(&ctx, (const u_char **) &lexer->start,
+ lexer->end);
+ if (njs_slow_path(cp > NJS_UNICODE_MAX_CODEPOINT)) {
+ c = '\0';
+ break;
+ }
+
+ if (!njs_utf8_is_whitespace(cp)) {
+ break;
+ }
}
}
diff -r 05efe34376ab -r 46d505a902bb src/njs_str.h
--- a/src/njs_str.h Tue Sep 13 21:13:17 2022 -0700
+++ b/src/njs_str.h Wed Sep 14 22:14:50 2022 -0700
@@ -51,7 +51,6 @@ njs_is_whitespace(u_char c)
case 0x0C: /* <FF> */
case 0x0D: /* <CR> */
case 0x20: /* <SP> */
- case 0xA0: /* <NBSP> */
return 1;
default:
diff -r 05efe34376ab -r 46d505a902bb src/test/njs_unit_test.c
--- a/src/test/njs_unit_test.c Tue Sep 13 21:13:17 2022 -0700
+++ b/src/test/njs_unit_test.c Wed Sep 14 22:14:50 2022 -0700
@@ -7341,6 +7341,11 @@ static njs_unit_test_t njs_test[] =
"[a.length, a[33], a[34]]"),
njs_str("35,a,�") },
+ /* Spaces: U+0009U+000BU+000CU+0020U+00A0U+000AU+000DU+2028U+2029 */
+
+ { njs_str("\x09\x0a\x0b\x0c\x0d \xc2\xa0'a'\xe2\x80\xa8+\xe2\x80\xa9'b'"),
+ njs_str("ab") },
+
/* Escape strings. */
{ njs_str("'\\a \\' \\\" \\\\ \\0 \\b \\f \\n \\r \\t \\v'"),
More information about the nginx-devel mailing list