[PATCH 3 of 3] QUIC: path MTU discovery
Sergey Kandaurov
pluknet at nginx.com
Mon May 1 16:58:55 UTC 2023
> On 28 Mar 2023, at 18:51, Roman Arutyunyan <arut at nginx.com> wrote:
>
> # HG changeset patch
> # User Roman Arutyunyan <arut at nginx.com>
> # Date 1679993500 -14400
> # Tue Mar 28 12:51:40 2023 +0400
> # Branch quic
> # Node ID 13d43a278510f131101c7b19d87455a0171ebe2f
> # Parent c686c97f4abd6e1ca9a2cc2324d5a24f3d035c58
> QUIC: path MTU discovery.
>
> MTU selection starts by probing the maximum allowed MTU first. After that,
> binary search is used to find the path MTU.
>
> Maximum allowed MTU is calculated as the minimum of max_udp_payload for client
> and server, and local interface MTU.
>
> diff --git a/auto/unix b/auto/unix
> --- a/auto/unix
> +++ b/auto/unix
> @@ -448,6 +448,54 @@ ngx_feature_test="setsockopt(0, IPPROTO_
> . auto/feature
>
>
> +# IP packet fragmentation flags
> +
> +ngx_feature="IP_DONTFRAG"
> +ngx_feature_name="NGX_HAVE_IP_DONTFRAG"
> +ngx_feature_run=no
> +ngx_feature_incs="#include <sys/socket.h>
> + #include <netinet/in.h>"
> +ngx_feature_path=
> +ngx_feature_libs=
> +ngx_feature_test="getsockopt(0, IPPROTO_IP, IP_DONTFRAG, NULL, 0)"
> +. auto/feature
> +
> +
> +ngx_feature="IPV6_DONTFRAG"
> +ngx_feature_name="NGX_HAVE_IPV6_DONTFRAG"
> +ngx_feature_run=no
> +ngx_feature_incs="#include <sys/socket.h>
> + #include <netinet/in.h>"
> +ngx_feature_path=
> +ngx_feature_libs=
> +ngx_feature_test="getsockopt(0, IPPROTO_IPV6, IPV6_DONTFRAG, NULL, 0)"
> +. auto/feature
> +
> +
> +# Linux MTU flags
> +
> +ngx_feature="IP_PMTUDISC_DO"
> +ngx_feature_name="NGX_HAVE_IP_PMTUDISC_DO"
> +ngx_feature_run=no
> +ngx_feature_incs="#include <sys/socket.h>
> + #include <netinet/in.h>"
> +ngx_feature_path=
> +ngx_feature_libs=
> +ngx_feature_test="getsockopt(0, IPPROTO_IP, IP_PMTUDISC_DO, NULL, 0)"
> +. auto/feature
> +
> +
> +ngx_feature="IPV6_PMTUDISC_DO"
> +ngx_feature_name="NGX_HAVE_IPV6_PMTUDISC_DO"
> +ngx_feature_run=no
> +ngx_feature_incs="#include <sys/socket.h>
> + #include <netinet/in.h>"
> +ngx_feature_path=
> +ngx_feature_libs=
> +ngx_feature_test="getsockopt(0, IPPROTO_IPV6, IPV6_PMTUDISC_DO, NULL, 0)"
> +. auto/feature
> +
> +
> ngx_feature="TCP_DEFER_ACCEPT"
> ngx_feature_name="NGX_HAVE_DEFERRED_ACCEPT"
> ngx_feature_run=no
> @@ -920,6 +968,19 @@ ngx_feature_test="int i = FIONREAD; prin
> . auto/feature
>
>
> +ngx_feature="ioctl(SIOCGIFMTU)"
> +ngx_feature_name="NGX_HAVE_SIOCGIFMTU"
> +ngx_feature_run=no
> +ngx_feature_incs="#include <sys/ioctl.h>
> + #include <stdio.h>
> + #include <net/if.h>"
> +ngx_feature_path=
> +ngx_feature_libs=
> +ngx_feature_test="int i = SIOCGIFMTU; struct ifreq ifr;
> + ifr.ifr_name[0] = 'e'; printf(\"%d\", i)"
> +. auto/feature
> +
> +
> ngx_feature="struct tm.tm_gmtoff"
> ngx_feature_name="NGX_HAVE_GMTOFF"
> ngx_feature_run=no
> @@ -1002,3 +1063,17 @@ ngx_feature_test='struct addrinfo *res;
> if (getaddrinfo("localhost", NULL, NULL, &res) != 0) return 1;
> freeaddrinfo(res)'
> . auto/feature
> +
> +
> +ngx_feature="getifaddrs()"
> +ngx_feature_name="NGX_HAVE_GETIFADDRS"
> +ngx_feature_run=no
> +ngx_feature_incs="#include <sys/types.h>
> + #include <sys/socket.h>
> + #include <ifaddrs.h>"
> +ngx_feature_path=
> +ngx_feature_libs=
> +ngx_feature_test='struct ifaddrs *ifaddr;
> + if (getifaddrs(&ifaddr) != 0) return 1;
> + freeifaddrs(ifaddr)'
> +. auto/feature
> diff --git a/src/core/ngx_connection.c b/src/core/ngx_connection.c
> --- a/src/core/ngx_connection.c
> +++ b/src/core/ngx_connection.c
> @@ -1010,6 +1010,74 @@ ngx_configure_listening_sockets(ngx_cycl
> }
>
> #endif
> +
> +#if (NGX_HAVE_IP_PMTUDISC_DO)
> +
> + if (ls[i].quic && ls[i].sockaddr->sa_family == AF_INET) {
> + value = 1;
> +
> + if (setsockopt(ls[i].fd, IPPROTO_IP, IP_PMTUDISC_DO,
> + (const void *) &value, sizeof(int))
> + == -1)
> + {
> + ngx_log_error(NGX_LOG_ALERT, cycle->log, ngx_socket_errno,
> + "setsockopt(IP_PMTUDISC_DO) "
> + "for %V failed, ignored",
> + &ls[i].addr_text);
> + }
> + }
> +
> +#elif (NGX_HAVE_IP_DONTFRAG)
> +
> + if (ls[i].quic && ls[i].sockaddr->sa_family == AF_INET) {
> + value = 1;
> +
> + if (setsockopt(ls[i].fd, IPPROTO_IP, IP_DONTFRAG,
> + (const void *) &value, sizeof(int))
> + == -1)
> + {
> + ngx_log_error(NGX_LOG_ALERT, cycle->log, ngx_socket_errno,
> + "setsockopt(IP_DONTFRAG) "
> + "for %V failed, ignored",
> + &ls[i].addr_text);
> + }
> + }
> +
> +#endif
> +
> +#if (NGX_HAVE_INET6 && NGX_HAVE_IPV6_PMTUDISC_DO)
> +
> + if (ls[i].quic && ls[i].sockaddr->sa_family == AF_INET6) {
> + value = 1;
> +
> + if (setsockopt(ls[i].fd, IPPROTO_IPV6, IPV6_PMTUDISC_DO,
> + (const void *) &value, sizeof(int))
> + == -1)
> + {
> + ngx_log_error(NGX_LOG_ALERT, cycle->log, ngx_socket_errno,
> + "setsockopt(IPV6_PMTUDISC_DO) "
> + "for %V failed, ignored",
> + &ls[i].addr_text);
> + }
> + }
> +
> +#elif (NGX_HAVE_INET6 && NGX_HAVE_IPV6_DONTFRAG)
> +
> + if (ls[i].quic && ls[i].sockaddr->sa_family == AF_INET6) {
> + value = 1;
> +
> + if (setsockopt(ls[i].fd, IPPROTO_IPV6, IPV6_DONTFRAG,
> + (const void *) &value, sizeof(int))
> + == -1)
> + {
> + ngx_log_error(NGX_LOG_ALERT, cycle->log, ngx_socket_errno,
> + "setsockopt(IPV6_DONTFRAG) "
> + "for %V failed, ignored",
> + &ls[i].addr_text);
> + }
> + }
> +
> +#endif
> }
>
> return;
> @@ -1507,6 +1575,10 @@ ngx_connection_error(ngx_connection_t *c
> }
> #endif
>
> + if (err == NGX_EMSGSIZE && c->log_error == NGX_ERROR_IGNORE_EMSGSIZE) {
> + return 0;
> + }
> +
> if (err == 0
> || err == NGX_ECONNRESET
> #if (NGX_WIN32)
> @@ -1524,6 +1596,7 @@ ngx_connection_error(ngx_connection_t *c
> {
> switch (c->log_error) {
>
> + case NGX_ERROR_IGNORE_EMSGSIZE:
> case NGX_ERROR_IGNORE_EINVAL:
> case NGX_ERROR_IGNORE_ECONNRESET:
> case NGX_ERROR_INFO:
> diff --git a/src/core/ngx_connection.h b/src/core/ngx_connection.h
> --- a/src/core/ngx_connection.h
> +++ b/src/core/ngx_connection.h
> @@ -97,7 +97,8 @@ typedef enum {
> NGX_ERROR_ERR,
> NGX_ERROR_INFO,
> NGX_ERROR_IGNORE_ECONNRESET,
> - NGX_ERROR_IGNORE_EINVAL
> + NGX_ERROR_IGNORE_EINVAL,
> + NGX_ERROR_IGNORE_EMSGSIZE
> } ngx_connection_log_error_e;
>
>
I'd move the dontfrag part to a separate change for clarity.
It can be seen as a foundation for the subsequent PLPMTUD work,
but it is not strictly related to it.
(Further, PLPMTUD is an optional feature, while dontfrag
is a MUST per RFC 9000, section 14.)
> diff --git a/src/event/quic/ngx_event_quic.c b/src/event/quic/ngx_event_quic.c
> --- a/src/event/quic/ngx_event_quic.c
> +++ b/src/event/quic/ngx_event_quic.c
> @@ -10,8 +10,17 @@
> #include <ngx_event_quic_connection.h>
>
>
> +#define NGX_QUIC_UDP4_MAX_PACKET 65535
> +#define NGX_QUIC_UDP4_HEADER_SIZE 28
> +
> +#define NGX_QUIC_UDP6_MAX_PAYLOAD 65535
> +#define NGX_QUIC_UDP6_HEADER_SIZE 48
> +
> +
> static ngx_quic_connection_t *ngx_quic_new_connection(ngx_connection_t *c,
> ngx_quic_conf_t *conf, ngx_quic_header_t *pkt);
> +static ssize_t ngx_quic_get_local_mtu(ngx_connection_t *c,
> + struct sockaddr *sockaddr);
> static ngx_int_t ngx_quic_handle_stateless_reset(ngx_connection_t *c,
> ngx_quic_header_t *pkt);
> static void ngx_quic_input_handler(ngx_event_t *rev);
> @@ -149,11 +158,6 @@ ngx_quic_apply_transport_params(ngx_conn
> ngx_log_error(NGX_LOG_INFO, c->log, 0,
> "quic maximum packet size is invalid");
> return NGX_ERROR;
> -
> - } else if (ctp->max_udp_payload_size > ngx_quic_max_udp_payload(c)) {
> - ctp->max_udp_payload_size = ngx_quic_max_udp_payload(c);
> - ngx_log_debug0(NGX_LOG_DEBUG_EVENT, c->log, 0,
> - "quic client maximum packet size truncated");
> }
>
> if (ctp->active_connection_id_limit < 2) {
> @@ -228,6 +232,7 @@ static ngx_quic_connection_t *
> ngx_quic_new_connection(ngx_connection_t *c, ngx_quic_conf_t *conf,
> ngx_quic_header_t *pkt)
> {
> + ssize_t mtu;
> ngx_uint_t i;
> ngx_quic_tp_t *ctp;
> ngx_quic_connection_t *qc;
> @@ -297,7 +302,7 @@ ngx_quic_new_connection(ngx_connection_t
> ctp = &qc->ctp;
>
> /* defaults to be used before actual client parameters are received */
> - ctp->max_udp_payload_size = ngx_quic_max_udp_payload(c);
> + ctp->max_udp_payload_size = NGX_QUIC_MAX_UDP_PAYLOAD_SIZE;
> ctp->ack_delay_exponent = NGX_QUIC_DEFAULT_ACK_DELAY_EXPONENT;
> ctp->max_ack_delay = NGX_QUIC_DEFAULT_MAX_ACK_DELAY;
> ctp->active_connection_id_limit = 2;
> @@ -317,6 +322,18 @@ ngx_quic_new_connection(ngx_connection_t
> qc->congestion.ssthresh = (size_t) -1;
> qc->congestion.recovery_start = ngx_current_msec;
>
> + qc->max_mtu = ngx_min(qc->tp.max_udp_payload_size,
> + qc->ctp.max_udp_payload_size);
> +
max_udp_payload_size is not negotiable: it is a property
of the endpoint, not of the path, because routes may be
asymmetric.
> + mtu = ngx_quic_get_local_mtu(c, c->local_sockaddr);
> + if (mtu == NGX_ERROR) {
> + return NULL;
> + }
> +
> + if (mtu > 0 && (size_t) mtu < qc->max_mtu) {
It makes sense to ensure the UDP payload is at least 1200 bytes.
The checks inside ngx_quic_get_local_mtu() make it possible
to ignore a link MTU below 1200; it should probably be stricter,
e.g. log an error at an appropriate level or discard the connection state.
> + qc->max_mtu = mtu;
> + }
> +
> if (pkt->validated && pkt->retried) {
> qc->tp.retry_scid.len = pkt->dcid.len;
> qc->tp.retry_scid.data = ngx_pstrdup(c->pool, &pkt->dcid);
> @@ -347,6 +364,90 @@ ngx_quic_new_connection(ngx_connection_t
> }
>
>
> +static ssize_t
> +ngx_quic_get_local_mtu(ngx_connection_t *c, struct sockaddr *sockaddr)
> +{
> +#if (NGX_HAVE_GETIFADDRS && NGX_HAVE_SIOCGIFMTU)
> +
> + size_t mtu;
> + struct ifreq ifr;
> + struct ifaddrs *ifaddrs, *ifa;
> +
> + if (sockaddr->sa_family != AF_INET
> +#if (NGX_HAVE_INET6)
> + && sockaddr->sa_family != AF_INET6
> +#endif
> + )
> + {
> + return NGX_DECLINED;
> + }
> +
> + if (getifaddrs(&ifaddrs) == -1) {
> + ngx_log_error(NGX_LOG_INFO, c->log, 0, "getifaddrs() failed");
> + return NGX_ERROR;
> + }
> +
> + for (ifa = ifaddrs; ifa; ifa = ifa->ifa_next) {
> + if (ifa->ifa_addr == NULL) {
> + continue;
> + }
> +
> + if (ngx_cmp_sockaddr(sockaddr, 0, ifa->ifa_addr, 0, 0) != NGX_OK) {
> + continue;
> + }
> +
> + ngx_memzero(&ifr, sizeof(struct ifreq));
> + strcpy(ifr.ifr_name, ifa->ifa_name);
> +
> + freeifaddrs(ifaddrs);
> +
> + if (ioctl(c->fd, SIOCGIFMTU, &ifr)) {
> + ngx_log_error(NGX_LOG_INFO, c->log, 0, "ioctl(SIOCGIFMTU) failed");
> + return NGX_ERROR;
> + }
> +
> + mtu = ifr.ifr_mtu;
> +
> + if (sockaddr->sa_family == AF_INET) {
> + if (mtu > NGX_QUIC_UDP4_MAX_PACKET) {
> + mtu = NGX_QUIC_UDP4_MAX_PACKET;
> + }
> +
> + if (mtu <= NGX_QUIC_UDP4_HEADER_SIZE) {
> + return NGX_DECLINED;
> + }
> +
> + mtu -= NGX_QUIC_UDP4_HEADER_SIZE;
> +
> +#if (NGX_HAVE_INET6)
> + } else { /* sockaddr->sa_family == AF_INET6 */
> +
> + if (mtu <= NGX_QUIC_UDP6_HEADER_SIZE) {
> + return NGX_DECLINED;
> + }
> +
> + mtu -= NGX_QUIC_UDP6_HEADER_SIZE;
> +
> + if (mtu > NGX_QUIC_UDP6_MAX_PAYLOAD) {
> + mtu = NGX_QUIC_UDP6_MAX_PAYLOAD;
> + }
> +#endif
> + }
> +
It makes sense to further limit the link MTU according to system constraints.
BSD is known to impose, for some reason, a system-wide limit on the maximum
outgoing UDP datagram size. In all known distributions that derive from
4.3BSD-Reno, it defaults to 9216.
We can query this limit using sysctl.
With the MTU capped this way, the number of MTU probes is reduced.
In particular on lo0, before the change:
- 16356
+ 8778
- 12567
- 10672
- 9725
- 9251
+ 9014
Legend: '-' is rejected with EMSGSIZE, '+' is sent to the network.
After the change, there is only a single probe, of maxdgram size, and it succeeds.
On real networks there will be different results,
but the general pattern can be traced.
Aside from that, I noticed that we have split FreeBSD
and Darwin tests/sources but didn't account for that.
The patches below address this. The first one or two patches of three
could go directly to the default branch if approved;
the last one limits the link MTU upper bound to maxdgram where appropriate.
# HG changeset patch
# User Sergey Kandaurov <pluknet at nginx.com>
# Date 1682957319 -14400
# Mon May 01 20:08:39 2023 +0400
# Branch quic
# Node ID b4d35da933cc3df9a098feb6d06957b19e228c39
# Parent cc5d2e648dd4359d77c28120e6e02f9b5842024e
Fixed Darwin support for the "net.inet.tcp.sendspace" sysctl.
After Darwin support was split in separate files in 345a014436d4 (0.7.7),
sysctl variables received a separate prefix, but common sources were not
updated. In particular, the "net.inet.tcp.sendspace" sysctl value wasn't
checked as a limit for the send_lowat directive and friends.
The change unifies a prefix for the "sendspace" variable in both FreeBSD
and Darwin. Other extern variables aren't touched: their usage is either
limited to os-specific source files or they aren't used at all.
diff --git a/src/http/modules/ngx_http_fastcgi_module.c b/src/http/modules/ngx_http_fastcgi_module.c
--- a/src/http/modules/ngx_http_fastcgi_module.c
+++ b/src/http/modules/ngx_http_fastcgi_module.c
@@ -3905,14 +3905,14 @@ ngx_http_fastcgi_cache_key(ngx_conf_t *c
static char *
ngx_http_fastcgi_lowat_check(ngx_conf_t *cf, void *post, void *data)
{
-#if (NGX_FREEBSD)
+#if (NGX_FREEBSD || NGX_DARWIN)
ssize_t *np = data;
- if ((u_long) *np >= ngx_freebsd_net_inet_tcp_sendspace) {
+ if ((u_long) *np >= ngx_net_inet_tcp_sendspace) {
ngx_conf_log_error(NGX_LOG_EMERG, cf, 0,
"\"fastcgi_send_lowat\" must be less than %d "
"(sysctl net.inet.tcp.sendspace)",
- ngx_freebsd_net_inet_tcp_sendspace);
+ ngx_net_inet_tcp_sendspace);
return NGX_CONF_ERROR;
}
diff --git a/src/http/modules/ngx_http_proxy_module.c b/src/http/modules/ngx_http_proxy_module.c
--- a/src/http/modules/ngx_http_proxy_module.c
+++ b/src/http/modules/ngx_http_proxy_module.c
@@ -4890,14 +4890,14 @@ ngx_http_proxy_ssl_password_file(ngx_con
static char *
ngx_http_proxy_lowat_check(ngx_conf_t *cf, void *post, void *data)
{
-#if (NGX_FREEBSD)
+#if (NGX_FREEBSD || NGX_DARWIN)
ssize_t *np = data;
- if ((u_long) *np >= ngx_freebsd_net_inet_tcp_sendspace) {
+ if ((u_long) *np >= ngx_net_inet_tcp_sendspace) {
ngx_conf_log_error(NGX_LOG_EMERG, cf, 0,
"\"proxy_send_lowat\" must be less than %d "
"(sysctl net.inet.tcp.sendspace)",
- ngx_freebsd_net_inet_tcp_sendspace);
+ ngx_net_inet_tcp_sendspace);
return NGX_CONF_ERROR;
}
diff --git a/src/http/ngx_http_core_module.c b/src/http/ngx_http_core_module.c
--- a/src/http/ngx_http_core_module.c
+++ b/src/http/ngx_http_core_module.c
@@ -5288,14 +5288,14 @@ ngx_http_disable_symlinks(ngx_conf_t *cf
static char *
ngx_http_core_lowat_check(ngx_conf_t *cf, void *post, void *data)
{
-#if (NGX_FREEBSD)
+#if (NGX_FREEBSD || NGX_DARWIN)
ssize_t *np = data;
- if ((u_long) *np >= ngx_freebsd_net_inet_tcp_sendspace) {
+ if ((u_long) *np >= ngx_net_inet_tcp_sendspace) {
ngx_conf_log_error(NGX_LOG_EMERG, cf, 0,
"\"send_lowat\" must be less than %d "
"(sysctl net.inet.tcp.sendspace)",
- ngx_freebsd_net_inet_tcp_sendspace);
+ ngx_net_inet_tcp_sendspace);
return NGX_CONF_ERROR;
}
diff --git a/src/os/unix/ngx_darwin.h b/src/os/unix/ngx_darwin.h
--- a/src/os/unix/ngx_darwin.h
+++ b/src/os/unix/ngx_darwin.h
@@ -15,7 +15,7 @@ ngx_chain_t *ngx_darwin_sendfile_chain(n
extern int ngx_darwin_kern_osreldate;
extern int ngx_darwin_hw_ncpu;
-extern u_long ngx_darwin_net_inet_tcp_sendspace;
+extern u_long ngx_net_inet_tcp_sendspace;
extern ngx_uint_t ngx_debug_malloc;
diff --git a/src/os/unix/ngx_darwin_init.c b/src/os/unix/ngx_darwin_init.c
--- a/src/os/unix/ngx_darwin_init.c
+++ b/src/os/unix/ngx_darwin_init.c
@@ -13,7 +13,7 @@ char ngx_darwin_kern_ostype[16];
char ngx_darwin_kern_osrelease[128];
int ngx_darwin_hw_ncpu;
int ngx_darwin_kern_ipc_somaxconn;
-u_long ngx_darwin_net_inet_tcp_sendspace;
+u_long ngx_net_inet_tcp_sendspace;
ngx_uint_t ngx_debug_malloc;
@@ -49,8 +49,8 @@ sysctl_t sysctls[] = {
sizeof(ngx_darwin_hw_ncpu), 0 },
{ "net.inet.tcp.sendspace",
- &ngx_darwin_net_inet_tcp_sendspace,
- sizeof(ngx_darwin_net_inet_tcp_sendspace), 0 },
+ &ngx_net_inet_tcp_sendspace,
+ sizeof(ngx_net_inet_tcp_sendspace), 0 },
{ "kern.ipc.somaxconn",
&ngx_darwin_kern_ipc_somaxconn,
diff --git a/src/os/unix/ngx_freebsd.h b/src/os/unix/ngx_freebsd.h
--- a/src/os/unix/ngx_freebsd.h
+++ b/src/os/unix/ngx_freebsd.h
@@ -15,7 +15,7 @@ ngx_chain_t *ngx_freebsd_sendfile_chain(
extern int ngx_freebsd_kern_osreldate;
extern int ngx_freebsd_hw_ncpu;
-extern u_long ngx_freebsd_net_inet_tcp_sendspace;
+extern u_long ngx_net_inet_tcp_sendspace;
extern ngx_uint_t ngx_freebsd_sendfile_nbytes_bug;
extern ngx_uint_t ngx_freebsd_use_tcp_nopush;
diff --git a/src/os/unix/ngx_freebsd_init.c b/src/os/unix/ngx_freebsd_init.c
--- a/src/os/unix/ngx_freebsd_init.c
+++ b/src/os/unix/ngx_freebsd_init.c
@@ -15,7 +15,7 @@ char ngx_freebsd_kern_osrelease[128];
int ngx_freebsd_kern_osreldate;
int ngx_freebsd_hw_ncpu;
int ngx_freebsd_kern_ipc_somaxconn;
-u_long ngx_freebsd_net_inet_tcp_sendspace;
+u_long ngx_net_inet_tcp_sendspace;
/* FreeBSD 4.9 */
int ngx_freebsd_machdep_hlt_logical_cpus;
@@ -62,8 +62,8 @@ sysctl_t sysctls[] = {
sizeof(ngx_freebsd_machdep_hlt_logical_cpus), 0 },
{ "net.inet.tcp.sendspace",
- &ngx_freebsd_net_inet_tcp_sendspace,
- sizeof(ngx_freebsd_net_inet_tcp_sendspace), 0 },
+ &ngx_net_inet_tcp_sendspace,
+ sizeof(ngx_net_inet_tcp_sendspace), 0 },
{ "kern.ipc.somaxconn",
&ngx_freebsd_kern_ipc_somaxconn,
# HG changeset patch
# User Sergey Kandaurov <pluknet at nginx.com>
# Date 1682957408 -14400
# Mon May 01 20:10:08 2023 +0400
# Branch quic
# Node ID 976e2e40f0f58a5bbcd89b76e24909be0c8d337d
# Parent b4d35da933cc3df9a098feb6d06957b19e228c39
Introduced the "net.inet.udp.maxdgram" sysctl variable.
diff --git a/src/os/unix/ngx_darwin.h b/src/os/unix/ngx_darwin.h
--- a/src/os/unix/ngx_darwin.h
+++ b/src/os/unix/ngx_darwin.h
@@ -16,6 +16,7 @@ ngx_chain_t *ngx_darwin_sendfile_chain(n
extern int ngx_darwin_kern_osreldate;
extern int ngx_darwin_hw_ncpu;
extern u_long ngx_net_inet_tcp_sendspace;
+extern u_long ngx_net_inet_udp_maxdgram;
extern ngx_uint_t ngx_debug_malloc;
diff --git a/src/os/unix/ngx_darwin_init.c b/src/os/unix/ngx_darwin_init.c
--- a/src/os/unix/ngx_darwin_init.c
+++ b/src/os/unix/ngx_darwin_init.c
@@ -14,6 +14,7 @@ char ngx_darwin_kern_osrelease[128];
int ngx_darwin_hw_ncpu;
int ngx_darwin_kern_ipc_somaxconn;
u_long ngx_net_inet_tcp_sendspace;
+u_long ngx_net_inet_udp_maxdgram;
ngx_uint_t ngx_debug_malloc;
@@ -52,6 +53,10 @@ sysctl_t sysctls[] = {
&ngx_net_inet_tcp_sendspace,
sizeof(ngx_net_inet_tcp_sendspace), 0 },
+ { "net.inet.udp.maxdgram",
+ &ngx_net_inet_udp_maxdgram,
+ sizeof(ngx_net_inet_udp_maxdgram), 0 },
+
{ "kern.ipc.somaxconn",
&ngx_darwin_kern_ipc_somaxconn,
sizeof(ngx_darwin_kern_ipc_somaxconn), 0 },
diff --git a/src/os/unix/ngx_freebsd.h b/src/os/unix/ngx_freebsd.h
--- a/src/os/unix/ngx_freebsd.h
+++ b/src/os/unix/ngx_freebsd.h
@@ -16,6 +16,7 @@ ngx_chain_t *ngx_freebsd_sendfile_chain(
extern int ngx_freebsd_kern_osreldate;
extern int ngx_freebsd_hw_ncpu;
extern u_long ngx_net_inet_tcp_sendspace;
+extern u_long ngx_net_inet_udp_maxdgram;
extern ngx_uint_t ngx_freebsd_sendfile_nbytes_bug;
extern ngx_uint_t ngx_freebsd_use_tcp_nopush;
diff --git a/src/os/unix/ngx_freebsd_init.c b/src/os/unix/ngx_freebsd_init.c
--- a/src/os/unix/ngx_freebsd_init.c
+++ b/src/os/unix/ngx_freebsd_init.c
@@ -16,6 +16,7 @@ int ngx_freebsd_kern_osreldate;
int ngx_freebsd_hw_ncpu;
int ngx_freebsd_kern_ipc_somaxconn;
u_long ngx_net_inet_tcp_sendspace;
+u_long ngx_net_inet_udp_maxdgram;
/* FreeBSD 4.9 */
int ngx_freebsd_machdep_hlt_logical_cpus;
@@ -65,6 +66,10 @@ sysctl_t sysctls[] = {
&ngx_net_inet_tcp_sendspace,
sizeof(ngx_net_inet_tcp_sendspace), 0 },
+ { "net.inet.udp.maxdgram",
+ &ngx_net_inet_udp_maxdgram,
+ sizeof(ngx_net_inet_udp_maxdgram), 0 },
+
{ "kern.ipc.somaxconn",
&ngx_freebsd_kern_ipc_somaxconn,
sizeof(ngx_freebsd_kern_ipc_somaxconn), 0 },
# HG changeset patch
# User Sergey Kandaurov <pluknet at nginx.com>
# Date 1682959469 -14400
# Mon May 01 20:44:29 2023 +0400
# Branch quic
# Node ID 4f32cf8fe9241fe002701d3bf2dd38ea358c0b5d
# Parent 976e2e40f0f58a5bbcd89b76e24909be0c8d337d
QUIC: limited link MTU upper bound to maxdgram.
diff --git a/src/event/quic/ngx_event_quic.c b/src/event/quic/ngx_event_quic.c
--- a/src/event/quic/ngx_event_quic.c
+++ b/src/event/quic/ngx_event_quic.c
@@ -434,6 +434,10 @@ ngx_quic_get_local_mtu(ngx_connection_t
#endif
}
+#if (NGX_FREEBSD || NGX_DARWIN)
+ mtu = ngx_min(mtu, ngx_net_inet_udp_maxdgram);
+#endif
+
ngx_log_debug1(NGX_LOG_DEBUG_EVENT, c->log, 0,
"quic local mtu:%uz", mtu);
> + ngx_log_debug1(NGX_LOG_DEBUG_EVENT, c->log, 0,
> + "quic local mtu:%uz", mtu);
> +
> + return mtu;
> + }
> +
> + freeifaddrs(ifaddrs);
> +
> +#endif
> +
> + return NGX_DECLINED;
> +}
> +
> +
> static ngx_int_t
> ngx_quic_handle_stateless_reset(ngx_connection_t *c, ngx_quic_header_t *pkt)
> {
> diff --git a/src/event/quic/ngx_event_quic_ack.c b/src/event/quic/ngx_event_quic_ack.c
> --- a/src/event/quic/ngx_event_quic_ack.c
> +++ b/src/event/quic/ngx_event_quic_ack.c
> @@ -229,6 +229,12 @@ ngx_quic_handle_ack_frame_range(ngx_conn
>
> qc = ngx_quic_get_connection(c);
>
> + if (ctx->level == ssl_encryption_application) {
> + if (ngx_quic_handle_path_mtu_ack(c, qc->path, min, max) != NGX_OK) {
> + return NGX_ERROR;
> + }
> + }
> +
> st->max_pn = NGX_TIMER_INFINITE;
> found = 0;
>
> diff --git a/src/event/quic/ngx_event_quic_connection.h b/src/event/quic/ngx_event_quic_connection.h
> --- a/src/event/quic/ngx_event_quic_connection.h
> +++ b/src/event/quic/ngx_event_quic_connection.h
> @@ -89,14 +89,21 @@ struct ngx_quic_path_s {
> ngx_sockaddr_t sa;
> socklen_t socklen;
> ngx_quic_client_id_t *cid;
> - ngx_msec_t expires;
> - ngx_uint_t tries;
> + ngx_msec_t valid_expires;
> + ngx_msec_t mtu_expires;
> + ngx_uint_t valid_tries;
> + ngx_uint_t mtu_tries;
> + ngx_uint_t mtu_steps;
> ngx_uint_t tag;
> + size_t mtu;
> + size_t mtud;
> + size_t max_mtu;
> off_t sent;
> off_t received;
> u_char challenge1[8];
> u_char challenge2[8];
> uint64_t seqnum;
> + uint64_t mtu_pnum[NGX_QUIC_PATH_RETRIES];
> ngx_str_t addr_text;
> u_char text[NGX_SOCKADDR_STRLEN];
> unsigned validated:1;
> @@ -206,6 +213,8 @@ struct ngx_quic_connection_s {
> uint64_t server_seqnum;
> uint64_t path_seqnum;
>
> + size_t max_mtu;
> +
> ngx_quic_tp_t tp;
> ngx_quic_tp_t ctp;
>
> diff --git a/src/event/quic/ngx_event_quic_migration.c b/src/event/quic/ngx_event_quic_migration.c
> --- a/src/event/quic/ngx_event_quic_migration.c
> +++ b/src/event/quic/ngx_event_quic_migration.c
> @@ -10,6 +10,10 @@
> #include <ngx_event_quic_connection.h>
>
>
> +#define NGX_QUIC_MAX_MTU_STEPS 7
> +#define NGX_QUIC_MTU_PRECISION 4
> +
> +
For the record, here is the patch rebased on top of my changes sent previously
for the first patch (this doesn't include the series for maxdgram).
# HG changeset patch
# User Roman Arutyunyan <arut at nginx.com>
# Date 1679993500 -14400
# Tue Mar 28 12:51:40 2023 +0400
# Branch quic
# Node ID cc5d2e648dd4359d77c28120e6e02f9b5842024e
# Parent 81c439faaa38dc9dd5c3af5e5b1dc0e3c36f6367
QUIC: path MTU discovery.
MTU selection starts by probing the maximum allowed MTU first. After that,
binary search is used to find the path MTU.
Maximum allowed MTU is calculated as the minimum of max_udp_payload for client
and server, and local interface MTU.
diff --git a/auto/unix b/auto/unix
--- a/auto/unix
+++ b/auto/unix
@@ -448,6 +448,54 @@ ngx_feature_test="setsockopt(0, IPPROTO_
. auto/feature
+# IP packet fragmentation flags
+
+ngx_feature="IP_DONTFRAG"
+ngx_feature_name="NGX_HAVE_IP_DONTFRAG"
+ngx_feature_run=no
+ngx_feature_incs="#include <sys/socket.h>
+ #include <netinet/in.h>"
+ngx_feature_path=
+ngx_feature_libs=
+ngx_feature_test="getsockopt(0, IPPROTO_IP, IP_DONTFRAG, NULL, 0)"
+. auto/feature
+
+
+ngx_feature="IPV6_DONTFRAG"
+ngx_feature_name="NGX_HAVE_IPV6_DONTFRAG"
+ngx_feature_run=no
+ngx_feature_incs="#include <sys/socket.h>
+ #include <netinet/in.h>"
+ngx_feature_path=
+ngx_feature_libs=
+ngx_feature_test="getsockopt(0, IPPROTO_IPV6, IPV6_DONTFRAG, NULL, 0)"
+. auto/feature
+
+
+# Linux MTU flags
+
+ngx_feature="IP_PMTUDISC_DO"
+ngx_feature_name="NGX_HAVE_IP_PMTUDISC_DO"
+ngx_feature_run=no
+ngx_feature_incs="#include <sys/socket.h>
+ #include <netinet/in.h>"
+ngx_feature_path=
+ngx_feature_libs=
+ngx_feature_test="getsockopt(0, IPPROTO_IP, IP_PMTUDISC_DO, NULL, 0)"
+. auto/feature
+
+
+ngx_feature="IPV6_PMTUDISC_DO"
+ngx_feature_name="NGX_HAVE_IPV6_PMTUDISC_DO"
+ngx_feature_run=no
+ngx_feature_incs="#include <sys/socket.h>
+ #include <netinet/in.h>"
+ngx_feature_path=
+ngx_feature_libs=
+ngx_feature_test="getsockopt(0, IPPROTO_IPV6, IPV6_PMTUDISC_DO, NULL, 0)"
+. auto/feature
+
+
ngx_feature="TCP_DEFER_ACCEPT"
ngx_feature_name="NGX_HAVE_DEFERRED_ACCEPT"
ngx_feature_run=no
@@ -920,6 +968,19 @@ ngx_feature_test="int i = FIONREAD; prin
. auto/feature
+ngx_feature="ioctl(SIOCGIFMTU)"
+ngx_feature_name="NGX_HAVE_SIOCGIFMTU"
+ngx_feature_run=no
+ngx_feature_incs="#include <sys/ioctl.h>
+ #include <stdio.h>
+ #include <net/if.h>"
+ngx_feature_path=
+ngx_feature_libs=
+ngx_feature_test="int i = SIOCGIFMTU; struct ifreq ifr;
+ ifr.ifr_name[0] = 'e'; printf(\"%d\", i)"
+. auto/feature
+
+
ngx_feature="struct tm.tm_gmtoff"
ngx_feature_name="NGX_HAVE_GMTOFF"
ngx_feature_run=no
@@ -1002,3 +1063,17 @@ ngx_feature_test='struct addrinfo *res;
if (getaddrinfo("localhost", NULL, NULL, &res) != 0) return 1;
freeaddrinfo(res)'
. auto/feature
+
+
+ngx_feature="getifaddrs()"
+ngx_feature_name="NGX_HAVE_GETIFADDRS"
+ngx_feature_run=no
+ngx_feature_incs="#include <sys/types.h>
+ #include <sys/socket.h>
+ #include <ifaddrs.h>"
+ngx_feature_path=
+ngx_feature_libs=
+ngx_feature_test='struct ifaddrs *ifaddr;
+ if (getifaddrs(&ifaddr) != 0) return 1;
+ freeifaddrs(ifaddr)'
+. auto/feature
diff --git a/src/core/ngx_connection.c b/src/core/ngx_connection.c
--- a/src/core/ngx_connection.c
+++ b/src/core/ngx_connection.c
@@ -1010,6 +1010,74 @@ ngx_configure_listening_sockets(ngx_cycl
}
#endif
+
+#if (NGX_HAVE_IP_PMTUDISC_DO)
+
+ if (ls[i].quic && ls[i].sockaddr->sa_family == AF_INET) {
+ value = 1;
+
+ if (setsockopt(ls[i].fd, IPPROTO_IP, IP_PMTUDISC_DO,
+ (const void *) &value, sizeof(int))
+ == -1)
+ {
+ ngx_log_error(NGX_LOG_ALERT, cycle->log, ngx_socket_errno,
+ "setsockopt(IP_PMTUDISC_DO) "
+ "for %V failed, ignored",
+ &ls[i].addr_text);
+ }
+ }
+
+#elif (NGX_HAVE_IP_DONTFRAG)
+
+ if (ls[i].quic && ls[i].sockaddr->sa_family == AF_INET) {
+ value = 1;
+
+ if (setsockopt(ls[i].fd, IPPROTO_IP, IP_DONTFRAG,
+ (const void *) &value, sizeof(int))
+ == -1)
+ {
+ ngx_log_error(NGX_LOG_ALERT, cycle->log, ngx_socket_errno,
+ "setsockopt(IP_DONTFRAG) "
+ "for %V failed, ignored",
+ &ls[i].addr_text);
+ }
+ }
+
+#endif
+
+#if (NGX_HAVE_INET6 && NGX_HAVE_IPV6_PMTUDISC_DO)
+
+ if (ls[i].quic && ls[i].sockaddr->sa_family == AF_INET6) {
+ value = 1;
+
+ if (setsockopt(ls[i].fd, IPPROTO_IPV6, IPV6_PMTUDISC_DO,
+ (const void *) &value, sizeof(int))
+ == -1)
+ {
+ ngx_log_error(NGX_LOG_ALERT, cycle->log, ngx_socket_errno,
+ "setsockopt(IPV6_PMTUDISC_DO) "
+ "for %V failed, ignored",
+ &ls[i].addr_text);
+ }
+ }
+
+#elif (NGX_HAVE_INET6 && NGX_HAVE_IPV6_DONTFRAG)
+
+ if (ls[i].quic && ls[i].sockaddr->sa_family == AF_INET6) {
+ value = 1;
+
+ if (setsockopt(ls[i].fd, IPPROTO_IPV6, IPV6_DONTFRAG,
+ (const void *) &value, sizeof(int))
+ == -1)
+ {
+ ngx_log_error(NGX_LOG_ALERT, cycle->log, ngx_socket_errno,
+ "setsockopt(IPV6_DONTFRAG) "
+ "for %V failed, ignored",
+ &ls[i].addr_text);
+ }
+ }
+
+#endif
}
return;
@@ -1507,6 +1575,10 @@ ngx_connection_error(ngx_connection_t *c
}
#endif
+ if (err == NGX_EMSGSIZE && c->log_error == NGX_ERROR_IGNORE_EMSGSIZE) {
+ return 0;
+ }
+
if (err == 0
|| err == NGX_ECONNRESET
#if (NGX_WIN32)
@@ -1524,6 +1596,7 @@ ngx_connection_error(ngx_connection_t *c
{
switch (c->log_error) {
+ case NGX_ERROR_IGNORE_EMSGSIZE:
case NGX_ERROR_IGNORE_EINVAL:
case NGX_ERROR_IGNORE_ECONNRESET:
case NGX_ERROR_INFO:
diff --git a/src/core/ngx_connection.h b/src/core/ngx_connection.h
--- a/src/core/ngx_connection.h
+++ b/src/core/ngx_connection.h
@@ -97,7 +97,8 @@ typedef enum {
NGX_ERROR_ERR,
NGX_ERROR_INFO,
NGX_ERROR_IGNORE_ECONNRESET,
- NGX_ERROR_IGNORE_EINVAL
+ NGX_ERROR_IGNORE_EINVAL,
+ NGX_ERROR_IGNORE_EMSGSIZE
} ngx_connection_log_error_e;
diff --git a/src/event/quic/ngx_event_quic.c b/src/event/quic/ngx_event_quic.c
--- a/src/event/quic/ngx_event_quic.c
+++ b/src/event/quic/ngx_event_quic.c
@@ -10,8 +10,17 @@
#include <ngx_event_quic_connection.h>
+#define NGX_QUIC_UDP4_MAX_PACKET 65535
+#define NGX_QUIC_UDP4_HEADER_SIZE 28
+
+#define NGX_QUIC_UDP6_MAX_PAYLOAD 65535
+#define NGX_QUIC_UDP6_HEADER_SIZE 48
+
+
static ngx_quic_connection_t *ngx_quic_new_connection(ngx_connection_t *c,
ngx_quic_conf_t *conf, ngx_quic_header_t *pkt);
+static ssize_t ngx_quic_get_local_mtu(ngx_connection_t *c,
+ struct sockaddr *sockaddr);
static ngx_int_t ngx_quic_handle_stateless_reset(ngx_connection_t *c,
ngx_quic_header_t *pkt);
static void ngx_quic_input_handler(ngx_event_t *rev);
@@ -149,11 +158,6 @@ ngx_quic_apply_transport_params(ngx_conn
ngx_log_error(NGX_LOG_INFO, c->log, 0,
"quic maximum packet size is invalid");
return NGX_ERROR;
-
- } else if (ctp->max_udp_payload_size > ngx_quic_max_udp_payload(c)) {
- ctp->max_udp_payload_size = ngx_quic_max_udp_payload(c);
- ngx_log_debug0(NGX_LOG_DEBUG_EVENT, c->log, 0,
- "quic client maximum packet size truncated");
}
if (ctp->active_connection_id_limit < 2) {
@@ -228,6 +232,7 @@ static ngx_quic_connection_t *
ngx_quic_new_connection(ngx_connection_t *c, ngx_quic_conf_t *conf,
ngx_quic_header_t *pkt)
{
+ ssize_t mtu;
ngx_uint_t i;
ngx_quic_tp_t *ctp;
ngx_quic_connection_t *qc;
@@ -297,7 +302,7 @@ ngx_quic_new_connection(ngx_connection_t
ctp = &qc->ctp;
/* defaults to be used before actual client parameters are received */
- ctp->max_udp_payload_size = ngx_quic_max_udp_payload(c);
+ ctp->max_udp_payload_size = NGX_QUIC_MAX_UDP_PAYLOAD_SIZE;
ctp->ack_delay_exponent = NGX_QUIC_DEFAULT_ACK_DELAY_EXPONENT;
ctp->max_ack_delay = NGX_QUIC_DEFAULT_MAX_ACK_DELAY;
ctp->active_connection_id_limit = 2;
@@ -317,6 +322,18 @@ ngx_quic_new_connection(ngx_connection_t
qc->congestion.ssthresh = (size_t) -1;
qc->congestion.recovery_start = ngx_current_msec;
+ qc->max_mtu = ngx_min(qc->tp.max_udp_payload_size,
+ qc->ctp.max_udp_payload_size);
+
+ mtu = ngx_quic_get_local_mtu(c, c->local_sockaddr);
+ if (mtu == NGX_ERROR) {
+ return NULL;
+ }
+
+ if (mtu > 0 && (size_t) mtu < qc->max_mtu) {
+ qc->max_mtu = mtu;
+ }
+
if (pkt->validated && pkt->retried) {
qc->tp.retry_scid.len = pkt->dcid.len;
qc->tp.retry_scid.data = ngx_pstrdup(c->pool, &pkt->dcid);
@@ -347,6 +364,90 @@ ngx_quic_new_connection(ngx_connection_t
}
+static ssize_t
+ngx_quic_get_local_mtu(ngx_connection_t *c, struct sockaddr *sockaddr)
+{
+#if (NGX_HAVE_GETIFADDRS && NGX_HAVE_SIOCGIFMTU)
+
+ size_t mtu;
+ struct ifreq ifr;
+ struct ifaddrs *ifaddrs, *ifa;
+
+ if (sockaddr->sa_family != AF_INET
+#if (NGX_HAVE_INET6)
+ && sockaddr->sa_family != AF_INET6
+#endif
+ )
+ {
+ return NGX_DECLINED;
+ }
+
+ if (getifaddrs(&ifaddrs) == -1) {
+ ngx_log_error(NGX_LOG_INFO, c->log, 0, "getifaddrs() failed");
+ return NGX_ERROR;
+ }
+
+ for (ifa = ifaddrs; ifa; ifa = ifa->ifa_next) {
+ if (ifa->ifa_addr == NULL) {
+ continue;
+ }
+
+ if (ngx_cmp_sockaddr(sockaddr, 0, ifa->ifa_addr, 0, 0) != NGX_OK) {
+ continue;
+ }
+
+ ngx_memzero(&ifr, sizeof(struct ifreq));
+ strcpy(ifr.ifr_name, ifa->ifa_name);
+
+ freeifaddrs(ifaddrs);
+
+ if (ioctl(c->fd, SIOCGIFMTU, &ifr)) {
+ ngx_log_error(NGX_LOG_INFO, c->log, 0, "ioctl(SIOCGIFMTU) failed");
+ return NGX_ERROR;
+ }
+
+ mtu = ifr.ifr_mtu;
+
+ if (sockaddr->sa_family == AF_INET) {
+ if (mtu > NGX_QUIC_UDP4_MAX_PACKET) {
+ mtu = NGX_QUIC_UDP4_MAX_PACKET;
+ }
+
+ if (mtu <= NGX_QUIC_UDP4_HEADER_SIZE) {
+ return NGX_DECLINED;
+ }
+
+ mtu -= NGX_QUIC_UDP4_HEADER_SIZE;
+
+#if (NGX_HAVE_INET6)
+ } else { /* sockaddr->sa_family == AF_INET6 */
+
+ if (mtu <= NGX_QUIC_UDP6_HEADER_SIZE) {
+ return NGX_DECLINED;
+ }
+
+ mtu -= NGX_QUIC_UDP6_HEADER_SIZE;
+
+ if (mtu > NGX_QUIC_UDP6_MAX_PAYLOAD) {
+ mtu = NGX_QUIC_UDP6_MAX_PAYLOAD;
+ }
+#endif
+ }
+
+ ngx_log_debug1(NGX_LOG_DEBUG_EVENT, c->log, 0,
+ "quic local mtu:%uz", mtu);
+
+ return mtu;
+ }
+
+ freeifaddrs(ifaddrs);
+
+#endif
+
+ return NGX_DECLINED;
+}
+
+
static ngx_int_t
ngx_quic_handle_stateless_reset(ngx_connection_t *c, ngx_quic_header_t *pkt)
{
diff --git a/src/event/quic/ngx_event_quic_ack.c b/src/event/quic/ngx_event_quic_ack.c
--- a/src/event/quic/ngx_event_quic_ack.c
+++ b/src/event/quic/ngx_event_quic_ack.c
@@ -229,6 +229,12 @@ ngx_quic_handle_ack_frame_range(ngx_conn
qc = ngx_quic_get_connection(c);
+ if (ctx->level == ssl_encryption_application) {
+ if (ngx_quic_handle_path_mtu_ack(c, qc->path, min, max) != NGX_OK) {
+ return NGX_ERROR;
+ }
+ }
+
st->max_pn = NGX_TIMER_INFINITE;
found = 0;
diff --git a/src/event/quic/ngx_event_quic_connection.h b/src/event/quic/ngx_event_quic_connection.h
--- a/src/event/quic/ngx_event_quic_connection.h
+++ b/src/event/quic/ngx_event_quic_connection.h
@@ -89,14 +89,21 @@ struct ngx_quic_path_s {
ngx_sockaddr_t sa;
socklen_t socklen;
ngx_quic_client_id_t *cid;
- ngx_msec_t expires;
- ngx_uint_t tries;
+ ngx_msec_t valid_expires;
+ ngx_msec_t mtu_expires;
+ ngx_uint_t valid_tries;
+ ngx_uint_t mtu_tries;
+ ngx_uint_t mtu_steps;
ngx_uint_t tag;
+ size_t mtu;
+ size_t mtud;
+ size_t max_mtu;
off_t sent;
off_t received;
u_char challenge1[8];
u_char challenge2[8];
uint64_t seqnum;
+ uint64_t mtu_pnum[NGX_QUIC_PATH_RETRIES];
ngx_str_t addr_text;
u_char text[NGX_SOCKADDR_STRLEN];
unsigned validated:1;
@@ -206,6 +213,8 @@ struct ngx_quic_connection_s {
uint64_t server_seqnum;
uint64_t path_seqnum;
+ size_t max_mtu;
+
ngx_quic_tp_t tp;
ngx_quic_tp_t ctp;
diff --git a/src/event/quic/ngx_event_quic_migration.c b/src/event/quic/ngx_event_quic_migration.c
--- a/src/event/quic/ngx_event_quic_migration.c
+++ b/src/event/quic/ngx_event_quic_migration.c
@@ -10,13 +10,23 @@
#include <ngx_event_quic_connection.h>
+/* ngx_quic_discover_path_mtu() stops once path->mtu_steps reaches this */
+#define NGX_QUIC_MAX_MTU_STEPS 7
+/* ... or once path->max_mtu exceeds path->mtu by at most this many bytes */
+#define NGX_QUIC_MTU_PRECISION 4
+
+
static void ngx_quic_set_connection_path(ngx_connection_t *c,
ngx_quic_path_t *path);
static ngx_int_t ngx_quic_validate_path(ngx_connection_t *c,
ngx_quic_path_t *path);
static ngx_int_t ngx_quic_send_path_challenge(ngx_connection_t *c,
ngx_quic_path_t *path);
+static ngx_int_t ngx_quic_expire_path_mtu(ngx_connection_t *c,
+ ngx_quic_send_ctx_t *ctx, ngx_quic_path_t *path, ngx_msec_int_t *next);
+static ngx_int_t ngx_quic_expire_path(ngx_connection_t *c,
+ ngx_quic_send_ctx_t *ctx, ngx_quic_path_t *path, ngx_msec_int_t *next);
static ngx_quic_path_t *ngx_quic_get_path(ngx_connection_t *c, ngx_uint_t tag);
+static ngx_int_t ngx_quic_send_path_mtu_probe(ngx_connection_t *c,
+ ngx_quic_path_t *path);
ngx_int_t
@@ -169,6 +179,10 @@ valid:
path->validating = 0;
path->limited = 0;
+ if (ngx_quic_discover_path_mtu(c, path) != NGX_OK) {
+ return NGX_ERROR;
+ }
+
return NGX_OK;
}
@@ -207,6 +221,8 @@ ngx_quic_new_path(ngx_connection_t *c,
path->limited = 1;
+ path->mtu = NGX_QUIC_MIN_INITIAL_SIZE;
+
path->seqnum = qc->path_seqnum++;
path->sockaddr = &path->sa.sockaddr;
@@ -496,7 +512,7 @@ ngx_quic_validate_path(ngx_connection_t
"quic initiated validation of path seq:%uL", path->seqnum);
path->validating = 1;
- path->tries = 0;
+ path->valid_tries = 0;
if (RAND_bytes(path->challenge1, 8) != 1) {
return NGX_ERROR;
@@ -513,7 +529,7 @@ ngx_quic_validate_path(ngx_connection_t
ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
pto = ngx_max(ngx_quic_pto(c, ctx), 1000);
- path->expires = ngx_current_msec + pto;
+ path->valid_expires = ngx_current_msec + pto;
if (!qc->path_validation.timer_set) {
ngx_add_timer(&qc->path_validation, pto);
@@ -530,7 +546,7 @@ ngx_quic_send_path_challenge(ngx_connect
ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
"quic path seq:%uL send path_challenge tries:%ui",
- path->seqnum, path->tries);
+ path->seqnum, path->valid_tries);
ngx_memzero(&frame, sizeof(ngx_quic_frame_t));
@@ -566,10 +582,9 @@ ngx_quic_send_path_challenge(ngx_connect
void
ngx_quic_path_validation_handler(ngx_event_t *ev)
{
- ngx_msec_t now;
ngx_queue_t *q;
- ngx_msec_int_t left, next, pto;
- ngx_quic_path_t *path, *bkp;
+ ngx_msec_int_t next;
+ ngx_quic_path_t *path;
ngx_connection_t *c;
ngx_quic_send_ctx_t *ctx;
ngx_quic_connection_t *qc;
@@ -580,7 +595,6 @@ ngx_quic_path_validation_handler(ngx_eve
ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
next = -1;
- now = ngx_current_msec;
q = ngx_queue_head(&qc->paths);
@@ -589,78 +603,12 @@ ngx_quic_path_validation_handler(ngx_eve
path = ngx_queue_data(q, ngx_quic_path_t, queue);
q = ngx_queue_next(q);
- if (!path->validating) {
- continue;
- }
-
- left = path->expires - now;
-
- if (left > 0) {
-
- if (next == -1 || left < next) {
- next = left;
- }
-
- continue;
- }
-
- if (++path->tries < NGX_QUIC_PATH_RETRIES) {
- pto = ngx_max(ngx_quic_pto(c, ctx), 1000) << path->tries;
-
- path->expires = ngx_current_msec + pto;
-
- if (next == -1 || pto < next) {
- next = pto;
- }
-
- /* retransmit */
- (void) ngx_quic_send_path_challenge(c, path);
-
- continue;
+ if (ngx_quic_expire_path_mtu(c, ctx, path, &next) != NGX_OK) {
+ ngx_quic_close_connection(c, NGX_ERROR);
+ return;
}
- ngx_log_debug1(NGX_LOG_DEBUG_EVENT, ev->log, 0,
- "quic path seq:%uL validation failed", path->seqnum);
-
- /* found expired path */
-
- path->validated = 0;
- path->validating = 0;
- path->limited = 1;
-
-
- /* RFC 9000, 9.3.2. On-Path Address Spoofing
- *
- * To protect the connection from failing due to such a spurious
- * migration, an endpoint MUST revert to using the last validated
- * peer address when validation of a new peer address fails.
- */
-
- if (qc->path == path) {
- /* active path validation failed */
-
- bkp = ngx_quic_get_path(c, NGX_QUIC_PATH_BACKUP);
-
- if (bkp == NULL) {
- qc->error = NGX_QUIC_ERR_NO_VIABLE_PATH;
- qc->error_reason = "no viable path";
- ngx_quic_close_connection(c, NGX_ERROR);
- return;
- }
-
- qc->path = bkp;
- qc->path->tag = NGX_QUIC_PATH_ACTIVE;
-
- ngx_quic_set_connection_path(c, qc->path);
-
- ngx_log_error(NGX_LOG_INFO, c->log, 0,
- "quic path seq:%uL addr:%V is restored from backup",
- qc->path->seqnum, &qc->path->addr_text);
-
- ngx_quic_path_dbg(c, "is active", qc->path);
- }
-
- if (ngx_quic_free_path(c, path) != NGX_OK) {
+ if (ngx_quic_expire_path(c, ctx, path, &next) != NGX_OK) {
ngx_quic_close_connection(c, NGX_ERROR);
return;
}
@@ -670,3 +618,293 @@ ngx_quic_path_validation_handler(ngx_eve
ngx_add_timer(&qc->path_validation, next);
}
}
+
+
+/*
+ * Timer-driven part of MTU discovery for one path.
+ *
+ * Does nothing when no probe is outstanding (path->mtud is zero).  If the
+ * probe has not expired yet, only lowers *next to the remaining time.
+ * Otherwise the probe is resent with a doubled timeout until
+ * NGX_QUIC_PATH_RETRIES is reached; after that, or when the send is
+ * declined, the probed size becomes the new upper bound and discovery
+ * proceeds to the next size.
+ */
+static ngx_int_t
+ngx_quic_expire_path_mtu(ngx_connection_t *c, ngx_quic_send_ctx_t *ctx,
+    ngx_quic_path_t *path, ngx_msec_int_t *next)
+{
+    ngx_int_t       rc;
+    ngx_msec_int_t  remaining, timeout;
+
+    if (path->mtud == 0) {
+        /* no probe in flight */
+        return NGX_OK;
+    }
+
+    remaining = path->mtu_expires - ngx_current_msec;
+
+    if (remaining > 0) {
+
+        if (*next == -1 || *next > remaining) {
+            *next = remaining;
+        }
+
+        return NGX_OK;
+    }
+
+    if (++path->mtu_tries >= NGX_QUIC_PATH_RETRIES) {
+        goto failed;
+    }
+
+    timeout = ngx_max(ngx_quic_pto(c, ctx), 1000) << path->mtu_tries;
+
+    path->mtu_expires = ngx_current_msec + timeout;
+
+    if (*next == -1 || *next > timeout) {
+        *next = timeout;
+    }
+
+    rc = ngx_quic_send_path_mtu_probe(c, path);
+
+    if (rc != NGX_DECLINED) {
+        return rc;
+    }
+
+failed:
+
+    ngx_log_debug1(NGX_LOG_DEBUG_EVENT, c->log, 0,
+                   "quic path seq:%uL mtu probe failed", path->seqnum);
+
+    /* the probed size did not get through: use it as the new upper bound */
+
+    path->max_mtu = path->mtud;
+    path->mtud = 0;
+
+    return ngx_quic_discover_path_mtu(c, path);
+}
+
+
+/*
+ * Timer-driven part of path validation for one path.
+ *
+ * Does nothing for paths that are not being validated.  If the current
+ * attempt has not expired yet, only lowers *next to the remaining time.
+ * Otherwise the challenge is resent with a doubled timeout until
+ * NGX_QUIC_PATH_RETRIES is reached; after that the path is freed and,
+ * if it was the active one, the backup path is restored.
+ *
+ * Returns NGX_ERROR if no backup path exists for a failed active path
+ * or if the path cannot be freed, NGX_OK otherwise.
+ */
+static ngx_int_t
+ngx_quic_expire_path(ngx_connection_t *c, ngx_quic_send_ctx_t *ctx,
+    ngx_quic_path_t *path, ngx_msec_int_t *next)
+{
+    ngx_msec_int_t          remaining, timeout;
+    ngx_quic_path_t        *backup;
+    ngx_quic_connection_t  *qc;
+
+    if (!path->validating) {
+        return NGX_OK;
+    }
+
+    remaining = path->valid_expires - ngx_current_msec;
+
+    if (remaining > 0) {
+
+        if (*next == -1 || *next > remaining) {
+            *next = remaining;
+        }
+
+        return NGX_OK;
+    }
+
+    if (++path->valid_tries < NGX_QUIC_PATH_RETRIES) {
+        timeout = ngx_max(ngx_quic_pto(c, ctx), 1000) << path->valid_tries;
+
+        path->valid_expires = ngx_current_msec + timeout;
+
+        if (*next == -1 || *next > timeout) {
+            *next = timeout;
+        }
+
+        /* retransmit; a send failure is deliberately ignored here */
+        (void) ngx_quic_send_path_challenge(c, path);
+
+        return NGX_OK;
+    }
+
+    /* all attempts are used up: the path has expired */
+
+    ngx_log_debug1(NGX_LOG_DEBUG_EVENT, c->log, 0,
+                   "quic path seq:%uL validation failed", path->seqnum);
+
+    path->validated = 0;
+    path->validating = 0;
+    path->limited = 1;
+
+    qc = ngx_quic_get_connection(c);
+
+    /* RFC 9000, 9.3.2.  On-Path Address Spoofing
+     *
+     * To protect the connection from failing due to such a spurious
+     * migration, an endpoint MUST revert to using the last validated
+     * peer address when validation of a new peer address fails.
+     */
+
+    if (path == qc->path) {
+        /* validation of the active path failed */
+
+        backup = ngx_quic_get_path(c, NGX_QUIC_PATH_BACKUP);
+
+        if (backup == NULL) {
+            qc->error = NGX_QUIC_ERR_NO_VIABLE_PATH;
+            qc->error_reason = "no viable path";
+            return NGX_ERROR;
+        }
+
+        qc->path = backup;
+        qc->path->tag = NGX_QUIC_PATH_ACTIVE;
+
+        ngx_quic_set_connection_path(c, qc->path);
+
+        ngx_log_error(NGX_LOG_INFO, c->log, 0,
+                      "quic path seq:%uL addr:%V is restored from backup",
+                      qc->path->seqnum, &qc->path->addr_text);
+
+        ngx_quic_path_dbg(c, "is active", qc->path);
+    }
+
+    return (ngx_quic_free_path(c, path) == NGX_OK) ? NGX_OK : NGX_ERROR;
+}
+
+
+/*
+ * Start or continue MTU discovery on a path.
+ *
+ * The first step probes qc->max_mtu; each following step probes the
+ * midpoint between path->mtu and path->max_mtu.  The search ends after
+ * NGX_QUIC_MAX_MTU_STEPS steps or when path->max_mtu exceeds path->mtu
+ * by no more than NGX_QUIC_MTU_PRECISION bytes.  A probe size rejected
+ * at send time becomes the new upper bound and the next size is tried
+ * immediately.
+ *
+ * Returns NGX_OK if a probe was sent or the search is finished,
+ * NGX_ERROR if sending the probe failed.
+ */
+ngx_int_t
+ngx_quic_discover_path_mtu(ngx_connection_t *c, ngx_quic_path_t *path)
+{
+    ngx_int_t               rc;
+    ngx_uint_t              i;
+    ngx_msec_t              pto;
+    ngx_quic_send_ctx_t    *ctx;
+    ngx_quic_connection_t  *qc;
+
+    qc = ngx_quic_get_connection(c);
+
+    ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
+
+again:
+
+    if (path->mtu_steps == 0) {
+        path->max_mtu = qc->max_mtu;
+        path->mtud = path->max_mtu;
+
+    } else if (path->mtu_steps >= NGX_QUIC_MAX_MTU_STEPS
+               || path->max_mtu <= path->mtu + NGX_QUIC_MTU_PRECISION)
+    {
+        /*
+         * compared without subtraction: both sizes are size_t, and
+         * max_mtu - mtu would wrap around if max_mtu is below mtu,
+         * making the search go on with sizes above max_mtu
+         */
+        return NGX_OK;
+
+    } else {
+        path->mtud = (path->mtu + path->max_mtu) / 2;
+    }
+
+    path->mtu_steps++;
+
+    ngx_log_debug1(NGX_LOG_DEBUG_EVENT, c->log, 0,
+                   "quic initiated mtu discovery of path seq:%uL",
+                   path->seqnum);
+
+    /* forget packet numbers of the probes sent for the previous size */
+
+    for (i = 0; i < NGX_QUIC_PATH_RETRIES; i++) {
+        path->mtu_pnum[i] = NGX_QUIC_UNSET_PN;
+    }
+
+    path->mtu_tries = 0;
+
+    rc = ngx_quic_send_path_mtu_probe(c, path);
+
+    if (rc == NGX_DECLINED) {
+        /* the size was rejected at send time: lower the upper bound */
+        path->max_mtu = path->mtud;
+        path->mtud = 0;
+        goto again;
+    }
+
+    if (rc == NGX_ERROR) {
+        path->mtud = 0;
+        return NGX_ERROR;
+    }
+
+    /* rc == NGX_OK */
+
+    pto = ngx_quic_pto(c, ctx);
+    path->mtu_expires = ngx_current_msec + pto;
+
+    /* probe expiration shares the path validation timer */
+
+    if (!qc->path_validation.timer_set) {
+        ngx_add_timer(&qc->path_validation, pto);
+    }
+
+    return NGX_OK;
+}
+
+
+/*
+ * Send a PING frame in a datagram of path->mtud bytes over the path and
+ * remember its packet number in path->mtu_pnum[path->mtu_tries].
+ *
+ * Returns NGX_OK if the probe was sent, NGX_DECLINED if sending failed
+ * with the write event error flag set (the flag is cleared), NGX_ERROR
+ * on any other send failure.
+ */
+static ngx_int_t
+ngx_quic_send_path_mtu_probe(ngx_connection_t *c, ngx_quic_path_t *path)
+{
+    ngx_int_t               rc;
+    ngx_uint_t              saved_log_error;
+    ngx_quic_frame_t        probe;
+    ngx_quic_send_ctx_t    *ctx;
+    ngx_quic_connection_t  *qc;
+
+    qc = ngx_quic_get_connection(c);
+    ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
+
+    ngx_memzero(&probe, sizeof(ngx_quic_frame_t));
+
+    probe.level = ssl_encryption_application;
+    probe.type = NGX_QUIC_FT_PING;
+
+    /* remembered to match the probe against incoming ACK ranges */
+    path->mtu_pnum[path->mtu_tries] = ctx->pnum;
+
+    ngx_log_debug4(NGX_LOG_DEBUG_EVENT, c->log, 0,
+                   "quic path seq:%uL send mtu probe "
+                   "size:%uz pnum:%uL tries:%ui",
+                   path->seqnum, path->mtud, ctx->pnum, path->mtu_tries);
+
+    /* switch the error logging mode for the duration of the send only */
+
+    saved_log_error = c->log_error;
+    c->log_error = NGX_ERROR_IGNORE_EMSGSIZE;
+
+    rc = ngx_quic_frame_sendto(c, &probe, path->mtud, path);
+
+    c->log_error = saved_log_error;
+
+    if (rc != NGX_ERROR) {
+        return NGX_OK;
+    }
+
+    if (!c->write->error) {
+        return NGX_ERROR;
+    }
+
+    c->write->error = 0;
+
+    ngx_log_debug1(NGX_LOG_DEBUG_EVENT, c->log, 0,
+                   "quic rejected mtu probe of path seq:%uL",
+                   path->seqnum);
+
+    return NGX_DECLINED;
+}
+
+
+/*
+ * Check whether the acknowledged packet number range [min, max] covers
+ * one of the outstanding MTU probes of the path.
+ *
+ * If it does, the probed size becomes path->mtu and discovery proceeds
+ * to the next size; the result of ngx_quic_discover_path_mtu() is
+ * returned.  Otherwise returns NGX_OK.
+ */
+ngx_int_t
+ngx_quic_handle_path_mtu_ack(ngx_connection_t *c, ngx_quic_path_t *path,
+    uint64_t min, uint64_t max)
+{
+    uint64_t    pn;
+    ngx_uint_t  n;
+
+    if (path->mtud == 0) {
+        /* no probe in flight */
+        return NGX_OK;
+    }
+
+    /* slots are scanned in order up to the first unset one */
+
+    for (n = 0;
+         n < NGX_QUIC_PATH_RETRIES && path->mtu_pnum[n] != NGX_QUIC_UNSET_PN;
+         n++)
+    {
+        pn = path->mtu_pnum[n];
+
+        if (pn >= min && pn <= max) {
+
+            path->mtu = path->mtud;
+            path->mtud = 0;
+
+            ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
+                           "quic path seq:%uL mtu ack size:%uz",
+                           path->seqnum, path->mtu);
+
+            return ngx_quic_discover_path_mtu(c, path);
+        }
+    }
+
+    return NGX_OK;
+}
diff --git a/src/event/quic/ngx_event_quic_migration.h b/src/event/quic/ngx_event_quic_migration.h
--- a/src/event/quic/ngx_event_quic_migration.h
+++ b/src/event/quic/ngx_event_quic_migration.h
@@ -39,4 +39,9 @@ ngx_int_t ngx_quic_handle_migration(ngx_
void ngx_quic_path_validation_handler(ngx_event_t *ev);
+ngx_int_t ngx_quic_discover_path_mtu(ngx_connection_t *c,
+ ngx_quic_path_t *path);
+ngx_int_t ngx_quic_handle_path_mtu_ack(ngx_connection_t *c,
+ ngx_quic_path_t *path, uint64_t min, uint64_t max);
+
#endif /* _NGX_EVENT_QUIC_MIGRATION_H_INCLUDED_ */
diff --git a/src/event/quic/ngx_event_quic_output.c b/src/event/quic/ngx_event_quic_output.c
--- a/src/event/quic/ngx_event_quic_output.c
+++ b/src/event/quic/ngx_event_quic_output.c
@@ -10,9 +10,6 @@
#include <ngx_event_quic_connection.h>
-#define NGX_QUIC_MAX_UDP_PAYLOAD_OUT 1252
-#define NGX_QUIC_MAX_UDP_PAYLOAD_OUT6 1232
-
#define NGX_QUIC_MAX_UDP_SEGMENT_BUF 65487 /* 65K - IPv6 header */
#define NGX_QUIC_MAX_SEGMENTS 64 /* UDP_MAX_SEGMENTS */
@@ -61,21 +58,6 @@ static size_t ngx_quic_path_limit(ngx_co
size_t size);
-size_t
-ngx_quic_max_udp_payload(ngx_connection_t *c)
-{
- /* TODO: path MTU discovery */
-
-#if (NGX_HAVE_INET6)
- if (c->sockaddr->sa_family == AF_INET6) {
- return NGX_QUIC_MAX_UDP_PAYLOAD_OUT6;
- }
-#endif
-
- return NGX_QUIC_MAX_UDP_PAYLOAD_OUT;
-}
-
-
ngx_int_t
ngx_quic_output(ngx_connection_t *c)
{
@@ -142,10 +124,7 @@ ngx_quic_create_datagrams(ngx_connection
p = dst;
- len = ngx_min(qc->ctp.max_udp_payload_size,
- NGX_QUIC_MAX_UDP_PAYLOAD_SIZE);
-
- len = ngx_quic_path_limit(c, path, len);
+ len = ngx_quic_path_limit(c, path, path->mtu);
pad = ngx_quic_get_padding_level(c);
@@ -271,17 +250,19 @@ ngx_quic_allow_segmentation(ngx_connecti
{
size_t bytes, len;
ngx_queue_t *q;
+ ngx_quic_path_t *path;
ngx_quic_frame_t *f;
ngx_quic_send_ctx_t *ctx;
ngx_quic_connection_t *qc;
qc = ngx_quic_get_connection(c);
+ path = qc->path;
if (!qc->conf->gso_enabled) {
return 0;
}
- if (qc->path->limited) {
+ if (path->limited) {
/* don't even try to be faster on non-validated paths */
return 0;
}
@@ -299,9 +280,7 @@ ngx_quic_allow_segmentation(ngx_connecti
ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
bytes = 0;
-
- len = ngx_min(qc->ctp.max_udp_payload_size,
- NGX_QUIC_MAX_UDP_SEGMENT_BUF);
+ len = path->mtu;
for (q = ngx_queue_head(&ctx->frames);
q != ngx_queue_sentinel(&ctx->frames);
@@ -345,8 +324,7 @@ ngx_quic_create_segments(ngx_connection_
return NGX_ERROR;
}
- segsize = ngx_min(qc->ctp.max_udp_payload_size,
- NGX_QUIC_MAX_UDP_SEGMENT_BUF);
+ segsize = ngx_min(path->mtu, NGX_QUIC_MAX_UDP_SEGMENT_BUF);
p = dst;
end = dst + sizeof(dst);
diff --git a/src/event/quic/ngx_event_quic_output.h b/src/event/quic/ngx_event_quic_output.h
--- a/src/event/quic/ngx_event_quic_output.h
+++ b/src/event/quic/ngx_event_quic_output.h
@@ -12,8 +12,6 @@
#include <ngx_core.h>
-size_t ngx_quic_max_udp_payload(ngx_connection_t *c);
-
ngx_int_t ngx_quic_output(ngx_connection_t *c);
ngx_int_t ngx_quic_negotiate_version(ngx_connection_t *c,
diff --git a/src/event/quic/ngx_event_quic_ssl.c b/src/event/quic/ngx_event_quic_ssl.c
--- a/src/event/quic/ngx_event_quic_ssl.c
+++ b/src/event/quic/ngx_event_quic_ssl.c
@@ -499,6 +499,10 @@ ngx_quic_crypto_input(ngx_connection_t *
return NGX_ERROR;
}
+ if (ngx_quic_discover_path_mtu(c, qc->path) != NGX_OK) {
+ return NGX_ERROR;
+ }
+
if (ngx_quic_init_streams(c) != NGX_OK) {
return NGX_ERROR;
}
diff --git a/src/os/unix/ngx_darwin_config.h b/src/os/unix/ngx_darwin_config.h
--- a/src/os/unix/ngx_darwin_config.h
+++ b/src/os/unix/ngx_darwin_config.h
@@ -47,6 +47,8 @@
#include <arpa/inet.h>
#include <netdb.h>
#include <sys/un.h>
+#include <net/if.h>
+#include <ifaddrs.h>
#include <sys/sysctl.h>
#include <xlocale.h>
diff --git a/src/os/unix/ngx_errno.h b/src/os/unix/ngx_errno.h
--- a/src/os/unix/ngx_errno.h
+++ b/src/os/unix/ngx_errno.h
@@ -54,6 +54,7 @@ typedef int ngx_err_t;
#define NGX_ENOMOREFILES 0
#define NGX_ELOOP ELOOP
#define NGX_EBADF EBADF
+#define NGX_EMSGSIZE EMSGSIZE
#if (NGX_HAVE_OPENAT)
#define NGX_EMLINK EMLINK
diff --git a/src/os/unix/ngx_freebsd_config.h b/src/os/unix/ngx_freebsd_config.h
--- a/src/os/unix/ngx_freebsd_config.h
+++ b/src/os/unix/ngx_freebsd_config.h
@@ -48,6 +48,9 @@
#include <libutil.h> /* setproctitle() before 4.1 */
#include <osreldate.h>
#include <sys/sysctl.h>
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <ifaddrs.h>
#include <dlfcn.h>
diff --git a/src/os/unix/ngx_linux_config.h b/src/os/unix/ngx_linux_config.h
--- a/src/os/unix/ngx_linux_config.h
+++ b/src/os/unix/ngx_linux_config.h
@@ -54,6 +54,8 @@
#include <sys/ioctl.h>
#include <crypt.h>
#include <sys/utsname.h> /* uname() */
+#include <net/if.h>
+#include <ifaddrs.h>
#include <dlfcn.h>
diff --git a/src/os/unix/ngx_posix_config.h b/src/os/unix/ngx_posix_config.h
--- a/src/os/unix/ngx_posix_config.h
+++ b/src/os/unix/ngx_posix_config.h
@@ -140,6 +140,17 @@ typedef struct aiocb ngx_aiocb_t;
#endif
+#if (NGX_HAVE_SIOCGIFMTU)
+#include <sys/ioctl.h>
+#include <net/if.h>
+#endif
+
+
+#if (NGX_HAVE_GETIFADDRS)
+#include <ifaddrs.h>
+#endif
+
+
#define NGX_LISTEN_BACKLOG 511
#define ngx_debug_init()
diff --git a/src/os/unix/ngx_solaris_config.h b/src/os/unix/ngx_solaris_config.h
--- a/src/os/unix/ngx_solaris_config.h
+++ b/src/os/unix/ngx_solaris_config.h
@@ -88,6 +88,17 @@
#endif
+#if (NGX_HAVE_SIOCGIFMTU)
+#include <sys/ioctl.h>
+#include <net/if.h>
+#endif
+
+
+#if (NGX_HAVE_GETIFADDRS)
+#include <ifaddrs.h>
+#endif
+
+
#define NGX_LISTEN_BACKLOG 511
--
Sergey Kandaurov
More information about the nginx-devel
mailing list