[njs] Improved surrogate pairs support for PCRE2 backend.

Dmitry Volyntsev xeioex at nginx.com
Fri May 6 03:27:15 UTC 2022


details:   https://hg.nginx.org/njs/rev/ded5304adaf0
branches:  
changeset: 1850:ded5304adaf0
user:      Dmitry Volyntsev <xeioex at nginx.com>
date:      Thu May 05 20:25:05 2022 -0700
description:
Improved surrogate pairs support for PCRE2 backend.

In collaboration with Javier Evans.

diffstat:

 external/njs_regex.c     |  20 +++++++++++++++++++-
 src/test/njs_unit_test.c |   5 +++++
 2 files changed, 24 insertions(+), 1 deletions(-)

diffs (46 lines):

diff -r 80ed74a0e205 -r ded5304adaf0 external/njs_regex.c
--- a/external/njs_regex.c	Wed May 04 16:44:48 2022 -0700
+++ b/external/njs_regex.c	Thu May 05 20:25:05 2022 -0700
@@ -60,8 +60,26 @@ njs_regex_compile_ctx_t *
 njs_regex_compile_ctx_create(njs_regex_generic_ctx_t *ctx)
 {
 #ifdef NJS_HAVE_PCRE2
+    pcre2_compile_context  *cc;
 
-    return pcre2_compile_context_create(ctx);
+    cc = pcre2_compile_context_create(ctx);
+
+#ifdef PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
+    if (njs_fast_path(cc != NULL)) {
+        /* Workaround for surrogate pairs in regular expressions.
+         *
+         * This option is needed because njs, unlike standard ECMAScript,
+         * stores and processes strings in UTF-8 encoding.
+         * PCRE2 does not support surrogate pairs by default when it
+         * is compiled for UTF-8-only strings, but many polyfills
+         * and transpilers use such surrogate-pair expressions.
+         */
+        pcre2_set_compile_extra_options(cc,
+                                        PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES);
+    }
+#endif
+
+    return cc;
 
 #else
 
diff -r 80ed74a0e205 -r ded5304adaf0 src/test/njs_unit_test.c
--- a/src/test/njs_unit_test.c	Wed May 04 16:44:48 2022 -0700
+++ b/src/test/njs_unit_test.c	Thu May 05 20:25:05 2022 -0700
@@ -10841,6 +10841,11 @@ static njs_unit_test_t  njs_test[] =
       njs_str("true") },
 #endif
 
+#ifdef PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
+    { njs_str("/\\u200d\\ud800-/"),
+      njs_str("/\\u200d\\ud800-/") },
+#endif
+
     { njs_str("/(\\.(?!com|org)|\\/)/.test('ah.info')"),
       njs_str("true") },
 



More information about the nginx-devel mailing list