[njs] Fixed String.prototype.split() according to the specification.

Dmitry Volyntsev xeioex at nginx.com
Wed Jun 9 17:54:15 UTC 2021


details:   https://hg.nginx.org/njs/rev/f2d02c3b5f8a
branches:  
changeset: 1656:f2d02c3b5f8a
user:      Dmitry Volyntsev <xeioex at nginx.com>
date:      Wed Jun 09 17:14:10 2021 +0000
description:
Fixed String.prototype.split() according to the specification.

This closes #359 issue on GitHub.

diffstat:

 src/njs_regexp.c         |  252 ++++++++++++++++++++++++++++++++++++++++++
 src/njs_string.c         |  278 ++++++++++++++++++++--------------------------
 src/njs_string.h         |    2 +
 src/test/njs_benchmark.c |    6 +-
 src/test/njs_unit_test.c |   38 ++++++-
 5 files changed, 418 insertions(+), 158 deletions(-)

diffs (697 lines):

diff -r 5516f717a8c9 -r f2d02c3b5f8a src/njs_regexp.c
--- a/src/njs_regexp.c	Tue Jun 08 18:01:25 2021 +0000
+++ b/src/njs_regexp.c	Wed Jun 09 17:14:10 2021 +0000
@@ -1612,6 +1612,250 @@ exception:
 }
 
 
+static njs_int_t
+njs_regexp_prototype_symbol_split(njs_vm_t *vm, njs_value_t *args,
+    njs_uint_t nargs, njs_index_t unused)
+{
+    u_char             *dst;
+    int64_t            e, i, p, q, ncaptures, length;
+    uint32_t           limit;
+    njs_int_t          ret;
+    njs_bool_t         sticky;
+    njs_utf8_t         utf8;
+    njs_array_t        *array;
+    njs_value_t        *rx, *string, *value;
+    njs_value_t        r, z, this, s_lvalue, retval, setval, constructor;
+    njs_object_t       *object;
+    const u_char       *start, *end;
+    njs_string_prop_t  s;
+    njs_value_t        arguments[2];
+
+    static const njs_value_t  string_lindex = njs_string("lastIndex");
+    static const njs_value_t  string_flags = njs_string("flags");
+
+    rx = njs_argument(args, 0);
+
+    if (njs_slow_path(!njs_is_object(rx))) {
+        njs_type_error(vm, "\"this\" is not object");
+        return NJS_ERROR;
+    }
+
+    string = njs_lvalue_arg(&s_lvalue, args, nargs, 1);
+
+    ret = njs_value_to_string(vm, string, string);
+    if (njs_slow_path(ret != NJS_OK)) {
+        return ret;
+    }
+
+    njs_set_function(&constructor, &vm->constructors[NJS_OBJ_TYPE_REGEXP]);
+
+    ret = njs_value_species_constructor(vm, rx, &constructor, &constructor);
+    if (njs_slow_path(ret != NJS_OK)) {
+        return ret;
+    }
+
+    ret = njs_value_property(vm, rx, njs_value_arg(&string_flags), &retval);
+    if (njs_slow_path(ret == NJS_ERROR)) {
+        return NJS_ERROR;
+    }
+
+    ret = njs_value_to_string(vm, &retval, &retval);
+    if (njs_slow_path(ret != NJS_OK)) {
+        return ret;
+    }
+
+    (void) njs_string_prop(&s, &retval);
+
+    sticky = memchr(s.start, 'y', s.size) != NULL;
+
+    object = njs_function_new_object(vm, &constructor);
+    if (njs_slow_path(object == NULL)) {
+        return NJS_ERROR;
+    }
+
+    njs_set_object(&this, object);
+
+    arguments[0] = *rx;
+
+    if (!sticky) {
+        length = njs_is_byte_string(&s) ? 0 : s.length + 1;
+
+        dst = njs_string_alloc(vm, &arguments[1], s.size + 1, length);
+        if (njs_slow_path(dst == NULL)) {
+            return NJS_ERROR;
+        }
+
+        dst = njs_cpymem(dst, s.start, s.size);
+        *dst++ = 'y';
+
+    } else {
+        arguments[1] = retval;
+    }
+
+    ret = njs_function_call2(vm, njs_function(&constructor), &this,
+                             njs_value_arg(&arguments), 2, &r, 1);
+    if (njs_slow_path(ret != NJS_OK)) {
+        return NJS_ERROR;
+    }
+
+    rx = &r;
+
+    array = njs_array_alloc(vm, 0, 0, NJS_ARRAY_SPARE);
+    if (njs_slow_path(array == NULL)) {
+        return NJS_ERROR;
+    }
+
+    value = njs_arg(args, nargs, 2);
+    limit = UINT32_MAX;
+
+    if (njs_is_defined(value)) {
+        ret = njs_value_to_uint32(vm, value, &limit);
+        if (njs_slow_path(ret != NJS_OK)) {
+            return ret;
+        }
+    }
+
+    if (njs_slow_path(limit == 0)) {
+        goto done;
+    }
+
+    length = njs_string_prop(&s, string);
+
+    if (njs_slow_path(s.size == 0)) {
+        ret = njs_regexp_exec(vm, rx, string, &z);
+        if (njs_slow_path(ret != NJS_OK)) {
+            return NJS_ERROR;
+        }
+
+        if (!njs_is_null(&z)) {
+            goto done;
+        }
+
+        goto single;
+    }
+
+    utf8 = NJS_STRING_BYTE;
+
+    if (s.length != 0 && s.length != s.size) {
+        utf8 = NJS_STRING_UTF8;
+    }
+
+    p = 0;
+    q = 0;
+
+    while (q < length) {
+        njs_set_number(&setval, q);
+        ret = njs_value_property_set(vm, rx, njs_value_arg(&string_lindex),
+                                     &setval);
+        if (njs_slow_path(ret != NJS_OK)) {
+            return NJS_ERROR;
+        }
+
+        ret = njs_regexp_exec(vm, rx, string, &z);
+        if (njs_slow_path(ret != NJS_OK)) {
+            return NJS_ERROR;
+        }
+
+        if (njs_is_null(&z)) {
+            q = q + 1;
+            continue;
+        }
+
+        ret = njs_value_property(vm, rx, njs_value_arg(&string_lindex),
+                                 &retval);
+        if (njs_slow_path(ret == NJS_ERROR)) {
+            return NJS_ERROR;
+        }
+
+        ret = njs_value_to_length(vm, &retval, &e);
+        if (njs_slow_path(ret != NJS_OK)) {
+            return NJS_ERROR;
+        }
+
+        e = njs_min(e, length);
+
+        if (e == p) {
+            q = q + 1;
+            continue;
+        }
+
+        if (utf8 == NJS_STRING_UTF8) {
+            start = njs_string_offset(s.start, s.start + s.size, p);
+            end = njs_string_offset(s.start, s.start + s.size, q);
+
+        } else {
+            start = &s.start[p];
+            end = &s.start[q];
+        }
+
+        ret = njs_string_split_part_add(vm, array, utf8, start, end - start);
+        if (njs_slow_path(ret != NJS_OK)) {
+            return ret;
+        }
+
+        if (array->length == limit) {
+            goto done;
+        }
+
+        p = e;
+
+        ret = njs_object_length(vm, &z, &ncaptures);
+        if (njs_slow_path(ret != NJS_OK)) {
+            return NJS_ERROR;
+        }
+
+        ncaptures = njs_max(ncaptures - 1, 0);
+
+        for (i = 1; i <= ncaptures; i++) {
+            value = njs_array_push(vm, array);
+            if (njs_slow_path(value == NULL)) {
+                return NJS_ERROR;
+            }
+
+            ret = njs_value_property_i64(vm, &z, i, value);
+            if (njs_slow_path(ret == NJS_ERROR)) {
+                return NJS_ERROR;
+            }
+
+            if (array->length == limit) {
+                goto done;
+            }
+        }
+
+        q = p;
+    }
+
+    end = &s.start[s.size];
+
+    if (utf8 == NJS_STRING_UTF8) {
+        start = njs_string_offset(s.start, s.start + s.size, p);
+
+    } else {
+        start = &s.start[p];
+    }
+
+    ret = njs_string_split_part_add(vm, array, utf8, start, end - start);
+    if (njs_slow_path(ret != NJS_OK)) {
+        return ret;
+    }
+
+    goto done;
+
+single:
+
+    value = njs_array_push(vm, array);
+    if (njs_slow_path(value == NULL)) {
+        return NJS_ERROR;
+    }
+
+    *value = *string;
+
+done:
+
+    njs_set_array(&vm->retval, array);
+
+    return NJS_OK;
+}
 
 
 static const njs_object_prop_t  njs_regexp_constructor_properties[] =
@@ -1755,6 +1999,14 @@ static const njs_object_prop_t  njs_rege
         .writable = 1,
         .configurable = 1,
     },
+
+    {
+        .type = NJS_PROPERTY,
+        .name = njs_wellknown_symbol(NJS_SYMBOL_SPLIT),
+        .value = njs_native_function(njs_regexp_prototype_symbol_split, 2),
+        .writable = 1,
+        .configurable = 1,
+    },
 };
 
 
diff -r 5516f717a8c9 -r f2d02c3b5f8a src/njs_string.c
--- a/src/njs_string.c	Tue Jun 08 18:01:25 2021 +0000
+++ b/src/njs_string.c	Wed Jun 09 17:14:10 2021 +0000
@@ -72,8 +72,6 @@ static njs_int_t njs_string_bytes_from_s
     const njs_value_t *string, const njs_value_t *encoding);
 static njs_int_t njs_string_match_multiple(njs_vm_t *vm, njs_value_t *args,
     njs_regexp_pattern_t *pattern);
-static njs_int_t njs_string_split_part_add(njs_vm_t *vm, njs_array_t *array,
-    njs_utf8_t utf8, const u_char *start, size_t size);
 
 
 #define njs_base64_encoded_length(len)       (((len + 2) / 3) * 4)
@@ -3338,19 +3336,49 @@ static njs_int_t
 njs_string_prototype_split(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
     njs_index_t unused)
 {
-    int                   *captures;
-    size_t                size;
-    uint32_t              limit;
-    njs_int_t             ret;
-    njs_utf8_t            utf8;
-    njs_value_t           *value;
-    njs_array_t           *array;
-    const u_char          *p, *start, *next, *last, *end;
-    njs_regexp_utf8_t     type;
-    njs_string_prop_t     string, split;
-    njs_regexp_pattern_t  *pattern;
-
-    ret = njs_string_object_validate(vm, njs_arg(args, nargs, 0));
+    size_t             size;
+    uint32_t           limit;
+    njs_int_t          ret;
+    njs_utf8_t         utf8;
+    njs_bool_t         undefined;
+    njs_value_t        *this, *separator, *value;
+    njs_value_t        separator_lvalue, limit_lvalue, splitter;
+    njs_array_t        *array;
+    const u_char       *p, *start, *next, *last, *end;
+    njs_string_prop_t  string, split;
+    njs_value_t        arguments[3];
+
+    static const njs_value_t  split_key =
+                                        njs_wellknown_symbol(NJS_SYMBOL_SPLIT);
+
+    this = njs_argument(args, 0);
+
+    if (njs_slow_path(njs_is_null_or_undefined(this))) {
+        njs_type_error(vm, "cannot convert \"%s\"to object",
+                       njs_type_string(this->type));
+        return NJS_ERROR;
+    }
+
+    separator = njs_lvalue_arg(&separator_lvalue, args, nargs, 1);
+    value = njs_lvalue_arg(&limit_lvalue, args, nargs, 2);
+
+    if (!njs_is_null_or_undefined(separator)) {
+        ret = njs_value_method(vm, separator, njs_value_arg(&split_key),
+                               &splitter);
+        if (njs_slow_path(ret != NJS_OK)) {
+            return ret;
+        }
+
+        if (njs_is_defined(&splitter)) {
+            arguments[0] = *this;
+            arguments[1] = *value;
+
+            return njs_function_call(vm, njs_function(&splitter), separator,
+                                     arguments, 2, &vm->retval);
+        }
+    }
+
+    ret = njs_value_to_string(vm, this, this);
     if (njs_slow_path(ret != NJS_OK)) {
         return ret;
     }
@@ -3360,159 +3388,99 @@ njs_string_prototype_split(njs_vm_t *vm,
         return NJS_ERROR;
     }
 
-    if (nargs > 1) {
-
-        if (nargs > 2) {
-            value = njs_argument(args, 2);
-
-            if (njs_slow_path(!njs_is_number(value))) {
-                ret = njs_value_to_uint32(vm, value, &limit);
-                if (njs_slow_path(ret != NJS_OK)) {
-                    return ret;
-                }
-
-            } else {
-                limit = njs_number_to_uint32(njs_number(value));
-            }
-
-            if (limit == 0) {
-                goto done;
-            }
-
-        } else {
-            limit = (uint32_t) -1;
+    limit = UINT32_MAX;
+
+    if (njs_is_defined(value)) {
+        ret = njs_value_to_uint32(vm, value, &limit);
+        if (njs_slow_path(ret != NJS_OK)) {
+            return ret;
         }
-
-        (void) njs_string_prop(&string, &args[0]);
-
-        if (string.size == 0) {
+    }
+
+    undefined = njs_is_undefined(separator);
+
+    ret = njs_value_to_string(vm, separator, separator);
+    if (njs_slow_path(ret != NJS_OK)) {
+        return ret;
+    }
+
+    if (njs_slow_path(limit == 0)) {
+        goto done;
+    }
+
+    if (njs_slow_path(undefined)) {
+        goto single;
+    }
+
+    (void) njs_string_prop(&string, this);
+    (void) njs_string_prop(&split, separator);
+
+    if (njs_slow_path(string.size == 0)) {
+        if (split.size != 0) {
             goto single;
         }
 
-        utf8 = NJS_STRING_BYTE;
-        type = NJS_REGEXP_BYTE;
-
-        if (string.length != 0) {
-            utf8 = NJS_STRING_ASCII;
-            type = NJS_REGEXP_UTF8;
-
-            if (string.length != string.size) {
-                utf8 = NJS_STRING_UTF8;
+        goto done;
+    }
+
+    utf8 = NJS_STRING_BYTE;
+
+    if (string.length != 0) {
+        utf8 = NJS_STRING_ASCII;
+
+        if (string.length != string.size) {
+            utf8 = NJS_STRING_UTF8;
+        }
+    }
+
+    start = string.start;
+    end = string.start + string.size;
+    last = end - split.size;
+
+    do {
+
+        for (p = start; p <= last; p++) {
+            if (memcmp(p, split.start, split.size) == 0) {
+                goto found;
             }
         }
 
-        switch (args[1].type) {
-
-        case NJS_REGEXP:
-            pattern = njs_regexp_pattern(&args[1]);
-
-            if (!njs_regex_is_valid(&pattern->regex[type])) {
-                goto single;
-            }
-
-            start = string.start;
-            end = string.start + string.size;
-
-            do {
-                ret = njs_regexp_match(vm, &pattern->regex[type], start, 0,
-                                       end - start, vm->single_match_data);
-                if (ret >= 0) {
-                    captures = njs_regex_captures(vm->single_match_data);
-
-                    p = start + captures[0];
-                    next = start + captures[1];
-
-                } else if (ret == NJS_REGEX_NOMATCH) {
-                    p = (u_char *) end;
-                    next = (u_char *) end + 1;
-
-                } else {
-                    return NJS_ERROR;
-                }
-
-                /* Empty split regexp. */
-                if (p == next) {
-                    p = (utf8 != NJS_STRING_BYTE) ? njs_utf8_next(p, end)
-                                                  : p + 1;
-                    next = p;
-                }
-
-                size = p - start;
-
-                ret = njs_string_split_part_add(vm, array, utf8, start, size);
-                if (njs_slow_path(ret != NJS_OK)) {
-                    return ret;
-                }
-
-                start = next;
-                limit--;
-
-            } while (limit != 0 && p < end);
-
-            goto done;
-
-        case NJS_UNDEFINED:
-            break;
-
-        default:
-            if (njs_slow_path(!njs_is_string(&args[1]))) {
-                ret = njs_value_to_string(vm, &args[1], &args[1]);
-                if (njs_slow_path(ret != NJS_OK)) {
-                    return ret;
-                }
-            }
-
-            (void) njs_string_prop(&split, &args[1]);
-
-            if (string.size < split.size) {
-                goto single;
-            }
-
-            start = string.start;
-            end = string.start + string.size;
-            last = end - split.size;
-
-            do {
-                for (p = start; p <= last; p++) {
-                    if (memcmp(p, split.start, split.size) == 0) {
-                        goto found;
-                    }
-                }
-
-                p = end;
+        p = end;
 
 found:
 
-                next = p + split.size;
-
-                /* Empty split string. */
-                if (p == next) {
-                    p = (utf8 != NJS_STRING_BYTE) ? njs_utf8_next(p, end)
-                                                  : p + 1;
-                    next = p;
-                }
-
-                size = p - start;
-
-                ret = njs_string_split_part_add(vm, array, utf8, start, size);
-                if (njs_slow_path(ret != NJS_OK)) {
-                    return ret;
-                }
-
-                start = next;
-                limit--;
-
-            } while (limit != 0 && p < end);
-
-            goto done;
+        next = p + split.size;
+
+        /* Empty split string. */
+
+        if (p == next) {
+            p = (utf8 != NJS_STRING_BYTE) ? njs_utf8_next(p, end)
+                                          : p + 1;
+            next = p;
         }
-    }
+
+        size = p - start;
+
+        ret = njs_string_split_part_add(vm, array, utf8, start, size);
+        if (njs_slow_path(ret != NJS_OK)) {
+            return ret;
+        }
+
+        start = next;
+        limit--;
+
+    } while (limit != 0 && p < end);
+
+    goto done;
 
 single:
 
-    /* GC: retain. */
-    array->start[0] = args[0];
-    array->length = 1;
+    value = njs_array_push(vm, array);
+    if (njs_slow_path(value == NULL)) {
+        return NJS_ERROR;
+    }
+
+    *value = *this;
 
 done:
 
@@ -3522,7 +3490,7 @@ done:
 }
 
 
-static njs_int_t
+njs_int_t
 njs_string_split_part_add(njs_vm_t *vm, njs_array_t *array, njs_utf8_t utf8,
     const u_char *start, size_t size)
 {
diff -r 5516f717a8c9 -r f2d02c3b5f8a src/njs_string.h
--- a/src/njs_string.h	Tue Jun 08 18:01:25 2021 +0000
+++ b/src/njs_string.h	Wed Jun 09 17:14:10 2021 +0000
@@ -239,6 +239,8 @@ njs_int_t njs_string_decode_uri(njs_vm_t
 
 njs_int_t njs_string_prototype_concat(njs_vm_t *vm, njs_value_t *args,
     njs_uint_t nargs, njs_index_t unused);
+njs_int_t njs_string_split_part_add(njs_vm_t *vm, njs_array_t *array,
+    njs_utf8_t utf8, const u_char *start, size_t size);
 njs_int_t njs_string_get_substitution(njs_vm_t *vm, njs_value_t *matched,
     njs_value_t *string, int64_t pos, njs_value_t *captures, int64_t ncaptures,
     njs_value_t *groups, njs_value_t *replacement, njs_value_t *retval);
diff -r 5516f717a8c9 -r f2d02c3b5f8a src/test/njs_benchmark.c
--- a/src/test/njs_benchmark.c	Tue Jun 08 18:01:25 2021 +0000
+++ b/src/test/njs_benchmark.c	Wed Jun 09 17:14:10 2021 +0000
@@ -317,8 +317,10 @@ static njs_benchmark_test_t  njs_test[] 
       1 },
 
     { "regexp split",
-      njs_str("'a a'.split(/ /).length"),
-      njs_str("2"),
+      njs_str("var s = Array(26).fill(0).map((v,i)=> {"
+              "    var u = String.fromCodePoint(65+i), l = u.toLowerCase(); return u+l+l;}).join('');"
+              "s.split(/(?=[A-Z])/).length"),
+      njs_str("26"),
       100 },
 
     { "regexp 10K split",
diff -r 5516f717a8c9 -r f2d02c3b5f8a src/test/njs_unit_test.c
--- a/src/test/njs_unit_test.c	Tue Jun 08 18:01:25 2021 +0000
+++ b/src/test/njs_unit_test.c	Wed Jun 09 17:14:10 2021 +0000
@@ -8635,6 +8635,9 @@ static njs_unit_test_t  njs_test[] =
     { njs_str("/[\\"),
       njs_str("SyntaxError: Unterminated RegExp \"/[\\\" in 1") },
 
+    { njs_str("/\\s*;\\s*/"),
+      njs_str("/\\s*;\\s*/") },
+
     { njs_str("RegExp(']')"),
       njs_str("/\\]/") },
 
@@ -8802,7 +8805,7 @@ static njs_unit_test_t  njs_test[] =
       njs_str("abc") },
 
     { njs_str("''.split('').length"),
-      njs_str("1") },
+      njs_str("0") },
 
     { njs_str("'abc'.split('')"),
       njs_str("a,b,c") },
@@ -8858,9 +8861,40 @@ static njs_unit_test_t  njs_test[] =
     { njs_str("'abc'.split(/abc/)"),
       njs_str(",") },
 
+    { njs_str("'AbcDefGhi'.split(/([A-Z][a-z]+)/)"),
+      njs_str(",Abc,,Def,,Ghi,") },
+
+    { njs_str("'myCamelCaseString'.split(/(?=[A-Z])/)"),
+      njs_str("my,Camel,Case,String") },
+
+    { njs_str("'мояВерблюжьяСтрока'.split(/(?=[А-Я])/)"),
+      njs_str("моя,Верблюжья,Строка") },
+
+    { njs_str("'Harry Trump ;Fred Barney; Helen Rigby ; Bill Abel ;Chris Hand '.split( /\\s*(?:;|$)\\s*/)"),
+      njs_str("Harry Trump,Fred Barney,Helen Rigby,Bill Abel,Chris Hand,") },
+
+    { njs_str("'Гарри Трамп ;Фрэд Барни; Хелен Ригби ; Билл Абель'.split(/\\s*;\\s*/)"),
+      njs_str("Гарри Трамп,Фрэд Барни,Хелен Ригби,Билл Абель") },
+
+    { njs_str("'Hello 1 world. Sentence number 2.'.split(/(\\d)/)"),
+      njs_str("Hello ,1, world. Sentence number ,2,.") },
+
+    { njs_str("'Привет 1 мир. Предложение номер 2.'.split(/(\\d)/)"),
+      njs_str("Привет ,1, мир. Предложение номер ,2,.") },
+
     { njs_str("'0123456789'.split('').reverse().join('')"),
       njs_str("9876543210") },
 
+    { njs_str("/-/[Symbol.split]('a-b-c')"),
+      njs_str("a,b,c") },
+
+    { njs_str("var O = RegExp.prototype[Symbol.split];"
+              "RegExp.prototype[Symbol.split] = function (s, limit) { "
+              "    return O.call(this, s, limit).map(v => `@${v}#`); "
+              "};"
+              "'2016-01-02'.split(/-/)"),
+      njs_str("@2016#,@01#,@02#") },
+
     { njs_str("'abc'.repeat(3)"),
       njs_str("abcabcabc") },
 
@@ -17006,11 +17040,13 @@ static njs_unit_test_t  njs_test[] =
     { njs_str("var a = [1]; a[2] = 'x'; JSON.stringify(a)"),
       njs_str("[1,null,\"x\"]") },
 
+#if (!NJS_HAVE_MEMORY_SANITIZER) /* very long test under MSAN */
     { njs_str(njs_declare_sparse_array("a", 32769)
               "a[32] = 'a'; a[64] = 'b';"
               "var s = JSON.stringify(a); "
               "[s.length,s.substring(162,163),s.match(/null/g).length]"),
       njs_str("163844,a,32767") },
+#endif
 
     { njs_str(njs_declare_sparse_array("a", 8)
               "a[2] = 'a'; a[4] = 'b'; a.length = 3;"


More information about the nginx-devel mailing list