[njs] RegExp: improved source string treatment.

Dmitry Volyntsev xeioex at nginx.com
Thu Nov 11 14:31:02 UTC 2021


details:   https://hg.nginx.org/njs/rev/cf9e73e05aaf
branches:  
changeset: 1744:cf9e73e05aaf
user:      Dmitry Volyntsev <xeioex at nginx.com>
date:      Thu Nov 11 14:26:41 2021 +0000
description:
RegExp: improved source string treatment.

Previously, njs_regexp_pattern_create(), in addition to compiling the
pattern, also built a string representation of the RegExp, which
RegExp.prototype.toString() returned as is.

After 02444445df29 (0.6.0), RegExp.prototype.toString() was implemented
according to the spec, and since then it creates a RegExp string on the fly.

This patch removes the leftover code, which is no longer needed.

In addition, as a source string may not be a valid UTF-8 string (in
RegExp literals), RegExp.prototype.toString() now ensures that a
valid UTF-8 string is returned.

diffstat:

 src/njs_regexp.c         |  99 ++++++++++++++++++++++++-----------------------
 src/njs_regexp_pattern.h |  11 +----
 2 files changed, 52 insertions(+), 58 deletions(-)

diffs (213 lines):

diff -r 67ee2e4907a8 -r cf9e73e05aaf src/njs_regexp.c
--- a/src/njs_regexp.c	Thu Nov 11 14:26:30 2021 +0000
+++ b/src/njs_regexp.c	Thu Nov 11 14:26:41 2021 +0000
@@ -265,7 +265,7 @@ njs_regexp_pattern_create(njs_vm_t *vm, 
     njs_regex_flags_t flags)
 {
     int                   ret;
-    u_char                *p, *end;
+    u_char                *p;
     size_t                size;
     njs_str_t             text;
     njs_uint_t            n;
@@ -273,11 +273,6 @@ njs_regexp_pattern_create(njs_vm_t *vm, 
     njs_regexp_group_t    *group;
     njs_regexp_pattern_t  *pattern;
 
-    size = 1;  /* A trailing "/". */
-    size += ((flags & NJS_REGEX_GLOBAL) != 0);
-    size += ((flags & NJS_REGEX_IGNORE_CASE) != 0);
-    size += ((flags & NJS_REGEX_MULTILINE) != 0);
-
     text.start = start;
     text.length = length;
 
@@ -287,45 +282,28 @@ njs_regexp_pattern_create(njs_vm_t *vm, 
         return NULL;
     }
 
-    pattern = njs_mp_zalloc(vm->mem_pool, sizeof(njs_regexp_pattern_t) + 1
-                                          + text.length + size + 1);
+    pattern = njs_mp_alloc(vm->mem_pool, sizeof(njs_regexp_pattern_t)
+                                          + text.length + 1);
     if (njs_slow_path(pattern == NULL)) {
         njs_memory_error(vm);
         return NULL;
     }
 
-    pattern->flags = size;
+    njs_memzero(pattern, sizeof(njs_regexp_pattern_t));
 
     p = (u_char *) pattern + sizeof(njs_regexp_pattern_t);
     pattern->source = p;
 
-    *p++ = '/';
-    p = memcpy(p, text.start, text.length);
-    p += text.length;
-    end = p;
+    p = njs_cpymem(p, text.start, text.length);
     *p++ = '\0';
 
     pattern->global = ((flags & NJS_REGEX_GLOBAL) != 0);
-    if (pattern->global) {
-        *p++ = 'g';
-    }
-
     pattern->ignore_case = ((flags & NJS_REGEX_IGNORE_CASE) != 0);
-    if (pattern->ignore_case) {
-        *p++ = 'i';
-    }
-
     pattern->multiline = ((flags & NJS_REGEX_MULTILINE) != 0);
-    if (pattern->multiline) {
-        *p++ = 'm';
-    }
-
     pattern->sticky = ((flags & NJS_REGEX_STICKY) != 0);
 
-    *p++ = '\0';
-
     ret = njs_regexp_pattern_compile(vm, &pattern->regex[0],
-                                     &pattern->source[1], text.length, flags);
+                                     &pattern->source[0], text.length, flags);
 
     if (njs_fast_path(ret >= 0)) {
         pattern->ncaptures = ret;
@@ -335,7 +313,7 @@ njs_regexp_pattern_create(njs_vm_t *vm, 
     }
 
     ret = njs_regexp_pattern_compile(vm, &pattern->regex[1],
-                                  &pattern->source[1], text.length,
+                                  &pattern->source[0], text.length,
                                   flags | NJS_REGEX_UTF8);
     if (njs_fast_path(ret >= 0)) {
 
@@ -362,8 +340,6 @@ njs_regexp_pattern_create(njs_vm_t *vm, 
         goto fail;
     }
 
-    *end = '/';
-
     pattern->ngroups = njs_regex_named_captures(regex, NULL, 0);
 
     if (pattern->ngroups != 0) {
@@ -651,9 +627,7 @@ static njs_int_t
 njs_regexp_prototype_source(njs_vm_t *vm, njs_value_t *args,
     njs_uint_t nargs, njs_index_t unused)
 {
-    u_char                *source;
-    int32_t               length;
-    uint32_t              size;
+    njs_str_t             src;
     njs_value_t           *this;
     njs_regexp_pattern_t  *pattern;
 
@@ -674,13 +648,11 @@ njs_regexp_prototype_source(njs_vm_t *vm
     }
 
     pattern = njs_regexp_pattern(this);
-    /* Skip starting "/". */
-    source = pattern->source + 1;
 
-    size = njs_strlen(source) - pattern->flags;
-    length = njs_utf8_length(source, size);
+    src.start = pattern->source;
+    src.length = njs_strlen(pattern->source);
 
-    return njs_regexp_string_create(vm, &vm->retval, source, size, length);
+    return njs_string_decode_utf8(vm, &vm->retval, &src);
 }
 
 
@@ -756,25 +728,56 @@ njs_int_t
 njs_regexp_to_string(njs_vm_t *vm, njs_value_t *retval,
     const njs_value_t *value)
 {
-    u_char                *p, *source;
+    u_char                *p, *start;
+    size_t                size, extra;
     int32_t               length;
-    uint32_t              size;
+    njs_str_t             s;
     njs_regexp_pattern_t  *pattern;
+    njs_unicode_decode_t  ctx;
 
     pattern = njs_regexp_pattern(value);
-    source = pattern->source;
 
-    size = njs_strlen(source);
-    length = njs_utf8_length(source, size);
+    s.start = pattern->source;
+    s.length = njs_strlen(pattern->source);
+
+    length = njs_decode_utf8_length(&s, &size);
 
-    length = (length >= 0) ? (length + (pattern->sticky != 0)): 0;
+    extra = njs_length("//");
+    extra += (pattern->global != 0);
+    extra += (pattern->ignore_case != 0);
+    extra += (pattern->multiline != 0);
+    extra += (pattern->sticky != 0);
 
-    p = njs_string_alloc(vm, retval, size + (pattern->sticky != 0), length);
-    if (njs_slow_path(p == NULL)) {
+    size += extra;
+
+    length = (length >= 0) ? (length + extra) : 0;
+
+    start = njs_string_alloc(vm, retval, size, length);
+    if (njs_slow_path(start == NULL)) {
         return NJS_ERROR;
     }
 
-    p = njs_cpymem(p, source, size);
+    njs_utf8_decode_init(&ctx);
+
+    p = start;
+
+    *p++ = '/';
+
+    p = njs_utf8_stream_encode(&ctx, s.start, &s.start[s.length], p, 1, 0);
+
+    *p++ = '/';
+
+    if (pattern->global) {
+        *p++ = 'g';
+    }
+
+    if (pattern->ignore_case) {
+        *p++ = 'i';
+    }
+
+    if (pattern->multiline) {
+        *p++ = 'm';
+    }
 
     if (pattern->sticky) {
         *p++ = 'y';
diff -r 67ee2e4907a8 -r cf9e73e05aaf src/njs_regexp_pattern.h
--- a/src/njs_regexp_pattern.h	Thu Nov 11 14:26:30 2021 +0000
+++ b/src/njs_regexp_pattern.h	Thu Nov 11 14:26:41 2021 +0000
@@ -20,21 +20,12 @@ typedef struct njs_regexp_group_s  njs_r
 struct njs_regexp_pattern_s {
     njs_regex_t           regex[2];
 
-    /*
-     * A pattern source is used by RegExp.prototype.toString() method and
-     * RegExp.prototype.source and RegExp.prototype.flags accessor properties.
-     * So it is is stored in form "/pattern/flags"
-     * and as zero-terminated C string but not as value, because retrieving
-     * it is very seldom operation.  To get just a pattern string for
-     * RegExp.source property a length of flags part "/flags" is stored
-     * in flags field.
-     */
+    /* A zero-terminated C string. */
     u_char                *source;
 
     uint16_t              ncaptures;
     uint16_t              ngroups;
 
-    uint8_t               flags;        /* 2 bits */
     uint8_t               global;       /* 1 bit */
     uint8_t               ignore_case;  /* 1 bit */
     uint8_t               multiline;    /* 1 bit */


More information about the nginx-devel mailing list