[njs] Fixed RegExpBuiltinExec() with UTF-8 only regexps.

Dmitry Volyntsev xeioex at nginx.com
Fri Jun 25 17:33:15 UTC 2021


details:   https://hg.nginx.org/njs/rev/f10d5c38f098
branches:  
changeset: 1667:f10d5c38f098
user:      Dmitry Volyntsev <xeioex at nginx.com>
date:      Fri Jun 25 17:00:12 2021 +0000
description:
Fixed RegExpBuiltinExec() with UTF-8 only regexps.

The issue was originally introduced in f9082cd59ba6 (0.4.2), which added
RegExpBuiltinExec().  Since de64420d0f2b (0.6.0) it also affects
RegExp.prototype.test(), because test() was rewritten according to the spec.

diffstat:

 src/njs_regexp.c         |  24 ++++++++++++++----------
 src/test/njs_unit_test.c |  13 +++++++++++++
 2 files changed, 27 insertions(+), 10 deletions(-)

diffs (123 lines):

diff -r 7717b6523cd4 -r f10d5c38f098 src/njs_regexp.c
--- a/src/njs_regexp.c	Fri Jun 18 15:01:48 2021 +0000
+++ b/src/njs_regexp.c	Fri Jun 25 17:00:12 2021 +0000
@@ -26,8 +26,7 @@ static u_char *njs_regexp_compile_trace_
 static u_char *njs_regexp_match_trace_handler(njs_trace_t *trace,
     njs_trace_data_t *td, u_char *start);
 static njs_array_t *njs_regexp_exec_result(njs_vm_t *vm, njs_value_t *r,
-    njs_regexp_utf8_t type, njs_string_prop_t *string,
-    njs_regex_match_data_t *data);
+    njs_utf8_t utf8, njs_string_prop_t *string, njs_regex_match_data_t *data);
 static njs_int_t njs_regexp_string_create(njs_vm_t *vm, njs_value_t *value,
     u_char *start, uint32_t size, int32_t length);
 
@@ -946,6 +945,7 @@ njs_regexp_builtin_exec(njs_vm_t *vm, nj
     size_t                  length, offset;
     int64_t                 last_index;
     njs_int_t               ret;
+    njs_utf8_t              utf8;
     njs_value_t             value;
     njs_array_t             *result;
     njs_regexp_t            *regexp;
@@ -979,11 +979,15 @@ njs_regexp_builtin_exec(njs_vm_t *vm, nj
         goto not_found;
     }
 
+    utf8 = NJS_STRING_BYTE;
     type = NJS_REGEXP_BYTE;
 
-    if (length != string.size) {
-        /* UTF-8 string. */
+    if (string.length != 0) {
         type = NJS_REGEXP_UTF8;
+
+        if (string.length != string.size) {
+            utf8 = NJS_STRING_UTF8;
+        }
     }
 
     pattern = regexp->pattern;
@@ -998,7 +1002,7 @@ njs_regexp_builtin_exec(njs_vm_t *vm, nj
         return NJS_ERROR;
     }
 
-    if (type != NJS_REGEXP_UTF8) {
+    if (utf8 != NJS_STRING_UTF8) {
         offset = last_index;
 
     } else {
@@ -1010,7 +1014,7 @@ njs_regexp_builtin_exec(njs_vm_t *vm, nj
     ret = njs_regexp_match(vm, &pattern->regex[type], string.start, offset,
                            string.size, match_data);
     if (ret >= 0) {
-        result = njs_regexp_exec_result(vm, r, type, &string, match_data);
+        result = njs_regexp_exec_result(vm, r, utf8, &string, match_data);
         if (njs_slow_path(result == NULL)) {
             return NJS_ERROR;
         }
@@ -1043,7 +1047,7 @@ not_found:
 
 
 static njs_array_t *
-njs_regexp_exec_result(njs_vm_t *vm, njs_value_t *r, njs_regexp_utf8_t type,
+njs_regexp_exec_result(njs_vm_t *vm, njs_value_t *r, njs_utf8_t utf8,
     njs_string_prop_t *string, njs_regex_match_data_t *match_data)
 {
     int                   *captures;
@@ -1081,7 +1085,7 @@ njs_regexp_exec_result(njs_vm_t *vm, njs
             start = &string->start[captures[n]];
             size = captures[n + 1] - captures[n];
 
-            if (type == NJS_REGEXP_UTF8) {
+            if (utf8 == NJS_STRING_UTF8) {
                 length = njs_max(njs_utf8_length(start, size), 0);
 
             } else {
@@ -1105,7 +1109,7 @@ njs_regexp_exec_result(njs_vm_t *vm, njs
         goto fail;
     }
 
-    if (type == NJS_REGEXP_UTF8) {
+    if (utf8 == NJS_STRING_UTF8) {
         index = njs_string_index(string, captures[0]);
 
     } else {
@@ -1115,7 +1119,7 @@ njs_regexp_exec_result(njs_vm_t *vm, njs
     njs_set_number(&prop->value, index);
 
     if (pattern->global || pattern->sticky) {
-        if (type == NJS_REGEXP_UTF8) {
+        if (utf8 == NJS_STRING_UTF8) {
             index = njs_string_index(string, captures[1]);
 
         } else {
diff -r 7717b6523cd4 -r f10d5c38f098 src/test/njs_unit_test.c
--- a/src/test/njs_unit_test.c	Fri Jun 18 15:01:48 2021 +0000
+++ b/src/test/njs_unit_test.c	Fri Jun 25 17:00:12 2021 +0000
@@ -10763,6 +10763,12 @@ static njs_unit_test_t  njs_test[] =
     { njs_str("/α/.test('\\u00CE\\u00B1'.toBytes())"),
       njs_str("true") },
 
+    { njs_str("/[A-Za-z]/.test('S')"),
+      njs_str("true") },
+
+    { njs_str("/[A-Za-z]/.test('ø')"),
+      njs_str("false") },
+
     { njs_str("var r = /abc/y; r.test('abc'); r.lastIndex"),
       njs_str("3") },
 
@@ -21004,6 +21010,13 @@ static njs_unit_test_t  njs_regexp_test[
 
     { njs_str("RegExp('[\0]').test('\0')"),
       njs_str("true") },
+
+    { njs_str("/[A-Za-z\\u00F8-\\u02FF]/.test('S')"),
+      njs_str("true") },
+
+    { njs_str("/[A-Za-z\\u00F8-\\u02FF]/.test('ø')"),
+      njs_str("true") },
+
 };
 
 


More information about the nginx-devel mailing list