[PATCH 1 of 2] Rewritten host header validation to follow generic parsing rules

Sergey Kandaurov pluknet at nginx.com
Fri Jun 7 16:48:23 UTC 2024


On Tue, May 28, 2024 at 12:53:46PM +0100, J Carter wrote:
> Hello Sergey,
> 
> On Mon, 27 May 2024 14:21:43 +0400
> Sergey Kandaurov <pluknet at nginx.com> wrote:
> 
> > # HG changeset patch
> > # User Sergey Kandaurov <pluknet at nginx.com>
> > # Date 1716805272 -14400
> > #      Mon May 27 14:21:12 2024 +0400
> > # Node ID e82a7318ed48fdbc1273771bc96357e9dc232975
> > # Parent  f58b6f6362387eeace46043a6fc0bceb56a6786a
> > Rewritten host header validation to follow generic parsing rules.
> > 
> > It now uses a generic model of state-based machine, with more strict
> > parsing rules borrowed from ngx_http_validate_host(),
> 
> I think you mean "borrowed from ngx_http_parse_request_line()".
> 

Sure, tnx.

The problem is that both functions make a subset of parsing
and validation, and these sets just partially intersect.
In particular, ngx_http_parse_request_line() currently detects
invalid characters in a port subcomponent of authority, and
ngx_http_validate_host() handles a trailing dot.
So I think it makes sense to make them unifined, this will also
remove the need to validate host in absolute URIs.
Below is an updated version with both parsers further polished
(stream part is excluded for now).

Also, it may have sense to rename ngx_http_validate_host()
to something like ngx_http_parse_host(), similar to
ngx_http_parse_uri(), out of this series.

# HG changeset patch
# User Sergey Kandaurov <pluknet at nginx.com>
# Date 1717777582 -14400
#      Fri Jun 07 20:26:22 2024 +0400
# Node ID 0cba4301e4980871de7aceb46acddf8f2b5a7318
# Parent  02e9411009b987f408214ab4a8b6b6093f843bcd
Improved parsing of host in absolute URIs.

When the request line is in the absolute-URI form, a host identified
by a registered name (reg-name) is now restricted to start with an
alphanumeric character (see RFC 1123, RFC 3986).  Previously, empty
domain labels or host starting with a hyphen were accepted.

Additionally, host with a trailing dot is taken into account.

diff --git a/src/http/ngx_http_parse.c b/src/http/ngx_http_parse.c
--- a/src/http/ngx_http_parse.c
+++ b/src/http/ngx_http_parse.c
@@ -113,8 +113,10 @@ ngx_http_parse_request_line(ngx_http_req
         sw_schema_slash_slash,
         sw_host_start,
         sw_host,
+        sw_host_dot,
         sw_host_end,
         sw_host_ip_literal,
+        sw_host_ip_literal_dot,
         sw_port,
         sw_after_slash_in_uri,
         sw_check_uri,
@@ -354,27 +356,50 @@ ngx_http_parse_request_line(ngx_http_req
                 break;
             }
 
-            state = sw_host;
+            if (ch == '.' || ch == '-') {
+                return NGX_HTTP_PARSE_INVALID_REQUEST;
+            }
 
             /* fall through */
 
         case sw_host:
+        case sw_host_dot:
+
+            if (ch == '.') {
+                if (state == sw_host_dot) {
+                    return NGX_HTTP_PARSE_INVALID_REQUEST;
+                }
+
+                state = sw_host_dot;
+                break;
+            }
 
             c = (u_char) (ch | 0x20);
             if (c >= 'a' && c <= 'z') {
+                state = sw_host;
+                break;
+            }
+
+            if ((ch >= '0' && ch <= '9') || ch == '-') {
+                state = sw_host;
                 break;
             }
 
-            if ((ch >= '0' && ch <= '9') || ch == '.' || ch == '-') {
-                break;
+            if (state == sw_host_start) {
+                return NGX_HTTP_PARSE_INVALID_REQUEST;
+            }
+
+            if (state == sw_host_dot) {
+                r->host_end = p - 1;
+
+            } else {
+                r->host_end = p;
             }
 
             /* fall through */
 
         case sw_host_end:
 
-            r->host_end = p;
-
             switch (ch) {
             case ':':
                 state = sw_port;
@@ -404,6 +429,18 @@ ngx_http_parse_request_line(ngx_http_req
             break;
 
         case sw_host_ip_literal:
+        case sw_host_ip_literal_dot:
+
+            if (ch == '.') {
+                if (state == sw_host_ip_literal_dot) {
+                    return NGX_HTTP_PARSE_INVALID_REQUEST;
+                }
+
+                state = sw_host_ip_literal_dot;
+                break;
+            }
+
+            state = sw_host_ip_literal;
 
             if (ch >= '0' && ch <= '9') {
                 break;
@@ -418,10 +455,10 @@ ngx_http_parse_request_line(ngx_http_req
             case ':':
                 break;
             case ']':
+                r->host_end = p + 1;
                 state = sw_host_end;
                 break;
             case '-':
-            case '.':
             case '_':
             case '~':
                 /* unreserved */
# HG changeset patch
# User Sergey Kandaurov <pluknet at nginx.com>
# Date 1717777646 -14400
#      Fri Jun 07 20:27:26 2024 +0400
# Node ID 3f7ac1d90a6d4eceabaa5ce45dabf53efd99ed67
# Parent  0cba4301e4980871de7aceb46acddf8f2b5a7318
Rewritten host validation to match host parsing in absolute URIs.

It is reimplemented based on ngx_http_parse_request_line() state machine.
This introduces several changes, in particular:
- host name with underscores is rejected;
- a port subcomponent is restricted to digits;
- for IP literals, a missing closing bracket and trailing dot are detected.

diff --git a/src/http/ngx_http_request.c b/src/http/ngx_http_request.c
--- a/src/http/ngx_http_request.c
+++ b/src/http/ngx_http_request.c
@@ -2145,72 +2145,157 @@ ngx_int_t
 ngx_http_validate_host(ngx_str_t *host, ngx_pool_t *pool, ngx_uint_t alloc)
 {
     u_char  *h, ch;
-    size_t   i, dot_pos, host_len;
+    size_t   i, host_len;
 
     enum {
-        sw_usual = 0,
-        sw_literal,
-        sw_rest
+        sw_host_start = 0,
+        sw_host,
+        sw_host_dot,
+        sw_host_end,
+        sw_host_ip_literal,
+        sw_host_ip_literal_dot,
+        sw_port,
     } state;
 
-    dot_pos = host->len;
     host_len = host->len;
 
     h = host->data;
 
-    state = sw_usual;
+    state = sw_host_start;
 
     for (i = 0; i < host->len; i++) {
         ch = h[i];
 
-        switch (ch) {
-
-        case '.':
-            if (dot_pos == i - 1) {
+        switch (state) {
+
+        case sw_host_start:
+
+            if (ch == '[') {
+                state = sw_host_ip_literal;
+                break;
+            }
+
+            if (ch == '.' || ch == '-') {
                 return NGX_DECLINED;
             }
-            dot_pos = i;
-            break;
-
-        case ':':
-            if (state == sw_usual) {
-                host_len = i;
-                state = sw_rest;
-            }
-            break;
-
-        case '[':
-            if (i == 0) {
-                state = sw_literal;
-            }
-            break;
-
-        case ']':
-            if (state == sw_literal) {
-                host_len = i + 1;
-                state = sw_rest;
-            }
-            break;
-
-        default:
-
-            if (ngx_path_separator(ch)) {
-                return NGX_DECLINED;
-            }
-
-            if (ch <= 0x20 || ch == 0x7f) {
-                return NGX_DECLINED;
+
+            /* fall through */
+
+        case sw_host:
+        case sw_host_dot:
+
+            if (ch == '.') {
+                if (state == sw_host_dot) {
+                    return NGX_DECLINED;
+                }
+
+                state = sw_host_dot;
+                break;
             }
 
             if (ch >= 'A' && ch <= 'Z') {
                 alloc = 1;
+                state = sw_host;
+                break;
             }
 
+            if (ch >= 'a' && ch <= 'z') {
+                state = sw_host;
+                break;
+            }
+
+            if ((ch >= '0' && ch <= '9') || ch == '-') {
+                state = sw_host;
+                break;
+            }
+
+            if (state == sw_host_dot) {
+                host_len = i - 1;
+
+            } else {
+                host_len = i;
+            }
+
+            /* fall through */
+
+        case sw_host_end:
+
+            if (ch == ':') {
+                state = sw_port;
+                break;
+            }
+            return NGX_DECLINED;
+
+        case sw_host_ip_literal:
+        case sw_host_ip_literal_dot:
+
+            if (ch == '.') {
+                if (state == sw_host_ip_literal_dot) {
+                    return NGX_DECLINED;
+                }
+
+                state = sw_host_ip_literal_dot;
+                break;
+            }
+
+            state = sw_host_ip_literal;
+
+            if (ch >= 'A' && ch <= 'Z') {
+                alloc = 1;
+                break;
+            }
+
+            if (ch >= 'a' && ch <= 'z') {
+                break;
+            }
+
+            if (ch >= '0' && ch <= '9') {
+                break;
+            }
+
+            switch (ch) {
+            case ':':
+                break;
+            case ']':
+                host_len = i + 1;
+                state = sw_host_end;
+                break;
+            case '-':
+            case '_':
+            case '~':
+                /* unreserved */
+                break;
+            case '!':
+            case '$':
+            case '&':
+            case '\'':
+            case '(':
+            case ')':
+            case '*':
+            case '+':
+            case ',':
+            case ';':
+            case '=':
+                /* sub-delims */
+                break;
+            default:
+                return NGX_DECLINED;
+            }
             break;
+
+        case sw_port:
+            if (ch >= '0' && ch <= '9') {
+                break;
+            }
+            return NGX_DECLINED;
         }
     }
 
-    if (dot_pos == host_len - 1) {
+    if (state == sw_host_ip_literal) {
+        return NGX_DECLINED;
+    }
+
+    if (h[host_len - 1] == '.') {
         host_len--;
     }
 
# HG changeset patch
# User Sergey Kandaurov <pluknet at nginx.com>
# Date 1717777737 -14400
#      Fri Jun 07 20:28:57 2024 +0400
# Node ID 722bcffe3d3c9ff4314a2813227c47dc5eff660e
# Parent  3f7ac1d90a6d4eceabaa5ce45dabf53efd99ed67
Skip host validation in absolute URIs.

Now that parsing of host in the absolute-URI form and of the host header
were made the same in previous changes, it makes no sense to validate
host once again.  Only host case normalization to lowercase is applied
(RFC 3986, 6.2.2.1) after parsing absolute URI as this is out of scope.

No functional changes intended.

diff --git a/src/http/ngx_http_parse.c b/src/http/ngx_http_parse.c
--- a/src/http/ngx_http_parse.c
+++ b/src/http/ngx_http_parse.c
@@ -374,8 +374,13 @@ ngx_http_parse_request_line(ngx_http_req
                 break;
             }
 
-            c = (u_char) (ch | 0x20);
-            if (c >= 'a' && c <= 'z') {
+            if (ch >= 'A' && ch <= 'Z') {
+                r->host_normalize = 1;
+                state = sw_host;
+                break;
+            }
+
+            if (ch >= 'a' && ch <= 'z') {
                 state = sw_host;
                 break;
             }
@@ -446,8 +451,12 @@ ngx_http_parse_request_line(ngx_http_req
                 break;
             }
 
-            c = (u_char) (ch | 0x20);
-            if (c >= 'a' && c <= 'z') {
+            if (ch >= 'A' && ch <= 'Z') {
+                r->host_normalize = 1;
+                break;
+            }
+
+            if (ch >= 'a' && ch <= 'z') {
                 break;
             }
 
diff --git a/src/http/ngx_http_request.c b/src/http/ngx_http_request.c
--- a/src/http/ngx_http_request.c
+++ b/src/http/ngx_http_request.c
@@ -1147,18 +1147,15 @@ ngx_http_process_request_line(ngx_event_
                 host.len = r->host_end - r->host_start;
                 host.data = r->host_start;
 
-                rc = ngx_http_validate_host(&host, r->pool, 0);
-
-                if (rc == NGX_DECLINED) {
-                    ngx_log_error(NGX_LOG_INFO, c->log, 0,
-                                  "client sent invalid host in request line");
-                    ngx_http_finalize_request(r, NGX_HTTP_BAD_REQUEST);
-                    break;
-                }
-
-                if (rc == NGX_ERROR) {
-                    ngx_http_close_request(r, NGX_HTTP_INTERNAL_SERVER_ERROR);
-                    break;
+                if (r->host_normalize) {
+                    host.data = ngx_pnalloc(r->pool, host.len);
+                    if (host.data == NULL) {
+                        ngx_http_close_request(r,
+                                               NGX_HTTP_INTERNAL_SERVER_ERROR);
+                        break;
+                    }
+
+                    ngx_strlow(host.data, r->host_start, host.len);
                 }
 
                 if (ngx_http_set_virtual_server(r, &host) == NGX_ERROR) {
diff --git a/src/http/ngx_http_request.h b/src/http/ngx_http_request.h
--- a/src/http/ngx_http_request.h
+++ b/src/http/ngx_http_request.h
@@ -466,6 +466,9 @@ struct ngx_http_request_s {
 
     unsigned                          http_state:4;
 
+    /* host with upper case */
+    unsigned                          host_normalize:1;
+
     /* URI with "/." and on Win32 with "//" */
     unsigned                          complex_uri:1;
 


More information about the nginx-devel mailing list