[PATCH 3 of 4] xslt_html_parser directive

Laurence Rowe l at lrowe.co.uk
Wed Mar 21 21:28:33 UTC 2012


# HG changeset patch
# User Laurence Rowe <laurence at lrowe.co.uk>
# Date 1331329666 0
# Node ID 65fd4892a78371e863d43e31d4430cdb7333a35d
# Parent  151124d060d3f725c02b656d39c10575ff009cdb
xslt_html_parser directive

When ``xslt_html_parser on;`` is set, the libxml2 HTML parser is used instead
of the XML parser. Parsing is performed with HTML_PARSE_RECOVER, as real-world
HTML may not be well formed, so only fatal error handling is enabled when this
option is set.

diff --git a/src/http/modules/ngx_http_xslt_filter_module.c b/src/http/modules/ngx_http_xslt_filter_module.c
--- a/src/http/modules/ngx_http_xslt_filter_module.c
+++ b/src/http/modules/ngx_http_xslt_filter_module.c
@@ -10,6 +10,7 @@
 #include <ngx_http.h>
 
 #include <libxml/parser.h>
+#include <libxml/HTMLparser.h>
 #include <libxml/tree.h>
 #include <libxslt/xslt.h>
 #include <libxslt/xsltInternals.h>
@@ -58,6 +59,7 @@
     ngx_hash_t                 types;
     ngx_array_t               *types_keys;
     ngx_array_t               *params;       /* ngx_http_xslt_param_t */
+    ngx_flag_t                 html_parser;
 } ngx_http_xslt_filter_loc_conf_t;
 
 
@@ -67,6 +69,7 @@
     xsltTransformContextPtr    transform;
     ngx_http_request_t        *request;
     ngx_array_t                params;
+    ngx_flag_t                 html_parser;
 
     ngx_uint_t                 done;         /* unsigned  done:1; */
 } ngx_http_xslt_filter_ctx_t;
@@ -150,6 +153,13 @@
       offsetof(ngx_http_xslt_filter_loc_conf_t, types_keys),
       &ngx_http_xslt_default_types[0] },
 
+    { ngx_string("xslt_html_parser"),
+      NGX_HTTP_LOC_CONF|NGX_CONF_FLAG,
+      ngx_conf_set_flag_slot,
+      NGX_HTTP_LOC_CONF_OFFSET,
+      offsetof(ngx_http_xslt_filter_loc_conf_t, html_parser),
+      NULL },
+
       ngx_null_command
 };
 
@@ -225,6 +235,8 @@
 
     r->main_filter_need_in_memory = 1;
 
+    ctx->html_parser = conf->html_parser;
+
     return NGX_OK;
 }
 
@@ -261,7 +273,11 @@
                 xmlFreeDoc(ctx->ctxt->myDoc);
             }
 
-            xmlFreeParserCtxt(ctx->ctxt);
+            if (ctx->html_parser) {
+                htmlFreeParserCtxt(ctx->ctxt);
+            } else {
+                xmlFreeParserCtxt(ctx->ctxt);
+            }
 
             return ngx_http_xslt_send(r, ctx, NULL);
         }
@@ -276,9 +292,13 @@
 
             wellFormed = ctx->ctxt->wellFormed;
 
-            xmlFreeParserCtxt(ctx->ctxt);
+            if (ctx->html_parser) {
+                htmlFreeParserCtxt(ctx->ctxt);
+            } else {
+                xmlFreeParserCtxt(ctx->ctxt);
+            }
 
-            if (wellFormed) {
+            if (wellFormed || ctx->html_parser) {
                 return ngx_http_xslt_send(r, ctx,
                                        ngx_http_xslt_apply_stylesheet(r, ctx));
             }
@@ -352,22 +372,48 @@
     ngx_buf_t *b)
 {
     int               err;
-    xmlParserCtxtPtr  ctxt;
+    xmlParserCtxtPtr  ctxt = NULL;
+    xmlCharEncoding   enc = XML_CHAR_ENCODING_NONE;
 
     if (ctx->ctxt == NULL) {
 
-        ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
-        if (ctxt == NULL) {
-            ngx_log_error(NGX_LOG_ERR, r->connection->log, 0,
-                          "xmlCreatePushParserCtxt() failed");
-            return NGX_ERROR;
+        if (ctx->html_parser) {
+            if (r->headers_out.charset.len) {
+                enc = xmlParseCharEncoding(
+                                   (const char *) r->headers_out.charset.data);
+                if (enc == XML_CHAR_ENCODING_ERROR) {
+                    ngx_log_error(NGX_LOG_ERR, r->connection->log, 0,
+                                  "xmlParseCharEncoding() failed charset: %s",
+                                  r->headers_out.charset.data);
+                    return NGX_ERROR;
+                }
+            }
+
+            ctxt = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, enc);
+            if (ctxt == NULL) {
+                ngx_log_error(NGX_LOG_ERR, r->connection->log, 0,
+                              "htmlCreatePushParserCtxt() failed");
+                return NGX_ERROR;
+            }
+
+            htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR
+                                                       |HTML_PARSE_NOWARNING);
+
+        } else {
+            ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
+            if (ctxt == NULL) {
+                ngx_log_error(NGX_LOG_ERR, r->connection->log, 0,
+                              "xmlCreatePushParserCtxt() failed");
+                return NGX_ERROR;
+            }
+
+            xmlCtxtUseOptions(ctxt, XML_PARSE_NOENT|XML_PARSE_DTDLOAD
+                                                   |XML_PARSE_NOWARNING);
+            ctxt->sax->externalSubset = ngx_http_xslt_sax_external_subset;
+            ctxt->sax->error = ngx_http_xslt_sax_error;
         }
-        xmlCtxtUseOptions(ctxt, XML_PARSE_NOENT|XML_PARSE_DTDLOAD
-                                               |XML_PARSE_NOWARNING);
 
-        ctxt->sax->externalSubset = ngx_http_xslt_sax_external_subset;
         ctxt->sax->setDocumentLocator = NULL;
-        ctxt->sax->error = ngx_http_xslt_sax_error;
         ctxt->sax->fatalError = ngx_http_xslt_sax_error;
         ctxt->sax->_private = ctx;
 
@@ -375,8 +421,16 @@
         ctx->request = r;
     }
 
-    err = xmlParseChunk(ctx->ctxt, (char *) b->pos, (int) (b->last - b->pos),
-                        (b->last_buf) || (b->last_in_chain));
+    if (ctx->html_parser) {
+        err = htmlParseChunk(ctx->ctxt, (char *) b->pos,
+                             (int) (b->last - b->pos),
+                             (b->last_buf) || (b->last_in_chain));
+
+    } else {
+        err = xmlParseChunk(ctx->ctxt, (char *) b->pos,
+                            (int) (b->last - b->pos),
+                            (b->last_buf) || (b->last_in_chain));
+    }
 
     if (ctx->done == 0) {
         b->pos = b->last;
@@ -1059,6 +1113,8 @@
      *     conf->params = NULL;
      */
 
+    conf->html_parser = NGX_CONF_UNSET;
+
     return conf;
 }
 
@@ -1081,6 +1137,8 @@
         conf->params = prev->params;
     }
 
+    ngx_conf_merge_value(conf->html_parser, prev->html_parser, 0);
+
     if (ngx_http_merge_types(cf, &conf->types_keys, &conf->types,
                              &prev->types_keys, &prev->types,
                              ngx_http_xslt_default_types)



More information about the nginx-devel mailing list