[njs] Parser: properly handling unicode space characters.

Dmitry Volyntsev xeioex at nginx.com
Fri Sep 16 00:41:11 UTC 2022


details:   https://hg.nginx.org/njs/rev/46d505a902bb
branches:  
changeset: 1953:46d505a902bb
user:      Dmitry Volyntsev <xeioex at nginx.com>
date:      Wed Sep 14 22:14:50 2022 -0700
description:
Parser: properly handling unicode space characters.

diffstat:

 src/njs_lexer.c          |  35 +++++++++++++++++++++++++++++------
 src/njs_str.h            |   1 -
 src/test/njs_unit_test.c |   5 +++++
 3 files changed, 34 insertions(+), 7 deletions(-)

diffs (83 lines):

diff -r 05efe34376ab -r 46d505a902bb src/njs_lexer.c
--- a/src/njs_lexer.c	Tue Sep 13 21:13:17 2022 -0700
+++ b/src/njs_lexer.c	Wed Sep 14 22:14:50 2022 -0700
@@ -45,8 +45,8 @@ static const uint8_t  njs_tokens[256]  n
                 NJS_TOKEN_ILLEGAL,           NJS_TOKEN_ILLEGAL,
                 NJS_TOKEN_ILLEGAL,           NJS_TOKEN_ILLEGAL,
     /* \t */    NJS_TOKEN_ILLEGAL,           NJS_TOKEN_SPACE,
-    /* \n */    NJS_TOKEN_LINE_END,          NJS_TOKEN_ILLEGAL,
-    /* \r */    NJS_TOKEN_ILLEGAL,           NJS_TOKEN_SPACE,
+    /* \n */    NJS_TOKEN_LINE_END,          NJS_TOKEN_SPACE,
+    /* \r */    NJS_TOKEN_SPACE,             NJS_TOKEN_SPACE,
                 NJS_TOKEN_ILLEGAL,           NJS_TOKEN_ILLEGAL,
 
     /* 0x10 */  NJS_TOKEN_ILLEGAL,           NJS_TOKEN_ILLEGAL,
@@ -437,15 +437,38 @@ njs_lexer_consume_token(njs_lexer_t *lex
 njs_int_t
 njs_lexer_make_token(njs_lexer_t *lexer, njs_lexer_token_t *token)
 {
-    u_char  c, *p;
+    u_char                c, *p;
+    uint32_t              cp;
+    njs_unicode_decode_t  ctx;
 
     c = ' ';
 
+    njs_utf8_decode_init(&ctx);
+
     while (lexer->start < lexer->end) {
-        c = *lexer->start++;
+        c = *lexer->start;
+
+        if (njs_fast_path(!(c & 0x80))) {
+            lexer->start++;
+
+            if (njs_tokens[c] != NJS_TOKEN_SPACE) {
+                break;
+            }
 
-        if (njs_tokens[c] != NJS_TOKEN_SPACE) {
-            break;
+        } else {
+
+            /* Unicode. */
+
+            cp = njs_utf8_decode(&ctx, (const u_char **) &lexer->start,
+                                 lexer->end);
+            if (njs_slow_path(cp > NJS_UNICODE_MAX_CODEPOINT)) {
+                c = '\0';
+                break;
+            }
+
+            if (!njs_utf8_is_whitespace(cp)) {
+                break;
+            }
         }
     }
 
diff -r 05efe34376ab -r 46d505a902bb src/njs_str.h
--- a/src/njs_str.h	Tue Sep 13 21:13:17 2022 -0700
+++ b/src/njs_str.h	Wed Sep 14 22:14:50 2022 -0700
@@ -51,7 +51,6 @@ njs_is_whitespace(u_char c)
     case 0x0C:  /* <FF>   */
     case 0x0D:  /* <CR>   */
     case 0x20:  /* <SP>   */
-    case 0xA0:  /* <NBSP> */
         return 1;
 
     default:
diff -r 05efe34376ab -r 46d505a902bb src/test/njs_unit_test.c
--- a/src/test/njs_unit_test.c	Tue Sep 13 21:13:17 2022 -0700
+++ b/src/test/njs_unit_test.c	Wed Sep 14 22:14:50 2022 -0700
@@ -7341,6 +7341,11 @@ static njs_unit_test_t  njs_test[] =
                  "[a.length, a[33], a[34]]"),
       njs_str("35,a,�") },
 
+    /* Spaces: U+0009U+000BU+000CU+0020U+00A0U+000AU+000DU+2028U+2029 */
+
+    { njs_str("\x09\x0a\x0b\x0c\x0d \xc2\xa0'a'\xe2\x80\xa8+\xe2\x80\xa9'b'"),
+      njs_str("ab") },
+
     /* Escape strings. */
 
     { njs_str("'\\a \\' \\\" \\\\ \\0 \\b \\f \\n \\r \\t \\v'"),


More information about the nginx-devel mailing list