[njs] Added support of regular expressions not supported directly by PCRE2.

Dmitry Volyntsev xeioex at nginx.com
Sat May 20 04:31:00 UTC 2023


details:   https://hg.nginx.org/njs/rev/3ec3e7d2ce5f
branches:  
changeset: 2124:3ec3e7d2ce5f
user:      Dmitry Volyntsev <xeioex at nginx.com>
date:      Fri May 19 20:22:14 2023 -0700
description:
Added support of regular expressions not supported directly by PCRE2.

The following patterns were fixed:
    `[]` - matches nothing; previously it was rejected as an invalid
        expression.
    `[^]` - matches any character; unlike `.`, this syntax also matches
        a newline. Previously it was rejected as an invalid expression.
    `++`, `*+`, `?+` - are now rejected, whereas PCRE2 treats them as
        valid possessive quantifiers.

diffstat:

 external/njs_regex.c     |  67 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/njs_regexp.c         |  45 +++++++++++++++++++++++++++++++-
 src/test/njs_unit_test.c |  41 +++++++++++++++++++++++++++++
 3 files changed, 152 insertions(+), 1 deletions(-)

diffs (206 lines):

diff -r 4d26300ddc64 -r 3ec3e7d2ce5f external/njs_regex.c
--- a/external/njs_regex.c	Thu May 18 18:33:36 2023 -0700
+++ b/external/njs_regex.c	Fri May 19 20:22:14 2023 -0700
@@ -94,6 +94,73 @@ njs_int_t
 njs_regex_escape(njs_mp_t *mp, njs_str_t *text)
 {
 #ifdef NJS_HAVE_PCRE2
+    size_t  anychars, nomatches;
+    u_char  *p, *dst, *start, *end;
+
+    /*
+     * 1) [^] is a valid regexp expression in JavaScript, but PCRE2
+     * rejects it as invalid, so it is replaced here with the equivalent
+     * PCRE2 expression [\s\S].
+     * 2) [] is a valid regexp expression in JavaScript, but PCRE2
+     * rejects it as invalid, so it is replaced here with the equivalent
+     * PCRE2 expression (?!), which matches nothing.
+     */
+
+    start = text->start;
+    end = text->start + text->length;
+
+    anychars = 0;
+    nomatches = 0;
+
+    for (p = start; p < end; p++) {
+        switch (*p) {
+        case '[':
+            if (p + 1 < end && p[1] == ']') {
+                p += 1;
+                nomatches += 1;
+
+            } else if (p + 2 < end && p[1] == '^' && p[2] == ']') {
+                p += 2;
+                anychars += 1;
+            }
+
+            break;
+        }
+    }
+
+    if (!anychars && !nomatches) {
+        return NJS_OK;
+    }
+
+    text->length = text->length
+                   + anychars * (njs_length("\\s\\S") - njs_length("^"))
+                   + nomatches * (njs_length("?!"));
+
+    text->start = njs_mp_alloc(mp, text->length);
+    if (njs_slow_path(text->start == NULL)) {
+        return NJS_ERROR;
+    }
+
+    dst = text->start;
+
+    for (p = start; p < end; p++) {
+
+        switch (*p) {
+        case '[':
+            if (p + 1 < end && p[1] == ']') {
+                p += 1;
+                dst = njs_cpymem(dst, "(?!)", 4);
+                continue;
+
+            } else if (p + 2 < end && p[1] == '^' && p[2] == ']') {
+                p += 2;
+                dst = njs_cpymem(dst, "[\\s\\S]", 6);
+                continue;
+            }
+        }
+
+        *dst++ = *p;
+    }
 
     return NJS_OK;
 
diff -r 4d26300ddc64 -r 3ec3e7d2ce5f src/njs_regexp.c
--- a/src/njs_regexp.c	Thu May 18 18:33:36 2023 -0700
+++ b/src/njs_regexp.c	Fri May 19 20:22:14 2023 -0700
@@ -263,9 +263,10 @@ njs_regexp_pattern_create(njs_vm_t *vm, 
     njs_regex_flags_t flags)
 {
     int                   ret;
-    u_char                *p;
+    u_char                *p, *end;
     size_t                size;
     njs_str_t             text;
+    njs_bool_t            in;
     njs_uint_t            n;
     njs_regex_t           *regex;
     njs_regexp_group_t    *group;
@@ -274,6 +275,42 @@ njs_regexp_pattern_create(njs_vm_t *vm, 
     text.start = start;
     text.length = length;
 
+    in = 0;
+    end = start + length;
+
+    for (p = start; p < end; p++) {
+
+        switch (*p) {
+        case '[':
+            in = 1;
+            break;
+
+        case ']':
+            in = 0;
+            break;
+
+        case '\\':
+            p++;
+            break;
+
+        case '+':
+            if (njs_slow_path(!in
+                              && (p - 1 > start)
+                              && (p[-1] == '+'|| p[-1] == '*' || p[-1] == '?'))
+                              && (p - 2 >= start && p[-2] != '\\'))
+            {
+                /**
+                 * PCRE possessive quantifiers `++`, `*+`, `?+`
+                 * are not allowed in JavaScript, whereas `[++]` and `\?+`
+                 * are allowed.
+                 */
+                goto nothing_to_repeat;
+            }
+
+            break;
+        }
+    }
+
     ret = njs_regex_escape(vm->mem_pool, &text);
     if (njs_slow_path(ret != NJS_OK)) {
         njs_memory_error(vm);
@@ -370,6 +407,12 @@ fail:
 
     njs_mp_free(vm->mem_pool, pattern);
     return NULL;
+
+nothing_to_repeat:
+
+    njs_syntax_error(vm, "Invalid regular expression \"%V\" nothing to repeat",
+                     &text);
+    return NULL;
 }
 
 
diff -r 4d26300ddc64 -r 3ec3e7d2ce5f src/test/njs_unit_test.c
--- a/src/test/njs_unit_test.c	Thu May 18 18:33:36 2023 -0700
+++ b/src/test/njs_unit_test.c	Fri May 19 20:22:14 2023 -0700
@@ -11810,6 +11810,38 @@ static njs_unit_test_t  njs_test[] =
     { njs_str("var r = /./; r"),
       njs_str("/./") },
 
+    { njs_str("/[^]+|[^]+/.test('\\n| ')"),
+      njs_str("true") },
+
+    { njs_str("/[^]+|[^][^]/.test('|aa')"),
+      njs_str("true") },
+
+    { njs_str("/a[]/.test('a')"),
+      njs_str("false") },
+
+    { njs_str("/[]a/.test('a')"),
+      njs_str("false") },
+
+#ifdef NJS_HAVE_PCRE2
+    { njs_str("/[]*a/.test('a')"),
+      njs_str("true") },
+#endif
+
+    { njs_str("/Ca++BB/"),
+      njs_str("SyntaxError: Invalid regular expression \"Ca++BB\" nothing to repeat in 1") },
+
+    { njs_str("/a*+/"),
+      njs_str("SyntaxError: Invalid regular expression \"a*+\" nothing to repeat in 1") },
+
+    { njs_str("/a?+/"),
+      njs_str("SyntaxError: Invalid regular expression \"a?+\" nothing to repeat in 1") },
+
+    { njs_str(" /\\[[]++\\]/"),
+      njs_str("SyntaxError: Invalid regular expression \"\\[[]++\\]\" nothing to repeat in 1") },
+
+    { njs_str("/\\?+/"),
+      njs_str("/\\?+/") },
+
     { njs_str("var r = new RegExp(); r"),
       njs_str("/(?:)/") },
 
@@ -11870,6 +11902,15 @@ static njs_unit_test_t  njs_test[] =
     { njs_str("RegExp(new RegExp('expr'))"),
       njs_str("/expr/") },
 
+    { njs_str("RegExp(RegExp('[^]+|[^][^]')).test('| \\na')"),
+      njs_str("true") },
+
+    { njs_str("RegExp('a++')"),
+      njs_str("SyntaxError: Invalid regular expression \"a++\" nothing to repeat") },
+
+    { njs_str("RegExp('[a++]')"),
+      njs_str("/[a++]/") },
+
     { njs_str("RegExp(new RegExp('expr')).multiline"),
       njs_str("false") },
 


More information about the nginx-devel mailing list