[PATCH 4 of 4] QUIC: path MTU discovery

Roman Arutyunyan arut at nginx.com
Mon Aug 14 13:52:36 UTC 2023


Hi,

On Thu, Aug 10, 2023 at 06:18:02PM +0400, Sergey Kandaurov wrote:
> 
> > On 10 Aug 2023, at 14:21, Roman Arutyunyan <arut at nginx.com> wrote:
> > 
> > On Tue, Aug 08, 2023 at 01:49:46PM +0400, Sergey Kandaurov wrote:
> > Hi,
> > 
> >> 
> >>> On 31 Jul 2023, at 21:34, Roman Arutyunyan <arut at nginx.com> wrote:
> >>> 
> >>> Hi,
> >>> 
> >>> On Fri, Jul 28, 2023 at 07:51:22PM +0400, Sergey Kandaurov wrote:
> >>>> 
> >>>>> On 6 Jul 2023, at 17:57, Roman Arutyunyan <arut at nginx.com> wrote:
> >>>>> 
> >>>>> # HG changeset patch
> >>>>> # User Roman Arutyunyan <arut at nginx.com>
> >>>>> # Date 1688481216 -14400
> >>>>> #      Tue Jul 04 18:33:36 2023 +0400
> >>>>> # Node ID 1a49fd5d2013b6c8d50263499e6ced440927e949
> >>>>> # Parent  a1ea543d009311765144351b2557c00c8e6445bf
> >>>>> QUIC: path MTU discovery.
> >>>>> 
> >>>>> MTU selection starts by stepping up until the first failure.  Then binary
> >>>>> search is used to find the path MTU.
> >>>>> 
> >>>>> diff --git a/src/core/ngx_connection.c b/src/core/ngx_connection.c
> >>>>> --- a/src/core/ngx_connection.c
> >>>>> +++ b/src/core/ngx_connection.c
> >>>>> @@ -1583,6 +1583,10 @@ ngx_connection_error(ngx_connection_t *c
> >>>>>   }
> >>>>> #endif
> >>>>> 
> >>>>> +    if (err == NGX_EMSGSIZE && c->log_error == NGX_ERROR_IGNORE_EMSGSIZE) {
> >>>>> +        return 0;
> >>>>> +    }
> >>>>> +
> >>>>>   if (err == 0
> >>>>>       || err == NGX_ECONNRESET
> >>>>> #if (NGX_WIN32)
> >>>>> @@ -1600,6 +1604,7 @@ ngx_connection_error(ngx_connection_t *c
> >>>>>   {
> >>>>>       switch (c->log_error) {
> >>>>> 
> >>>>> +        case NGX_ERROR_IGNORE_EMSGSIZE:
> >>>>>       case NGX_ERROR_IGNORE_EINVAL:
> >>>>>       case NGX_ERROR_IGNORE_ECONNRESET:
> >>>>>       case NGX_ERROR_INFO:
> >>>>> diff --git a/src/core/ngx_connection.h b/src/core/ngx_connection.h
> >>>>> --- a/src/core/ngx_connection.h
> >>>>> +++ b/src/core/ngx_connection.h
> >>>>> @@ -97,7 +97,8 @@ typedef enum {
> >>>>>   NGX_ERROR_ERR,
> >>>>>   NGX_ERROR_INFO,
> >>>>>   NGX_ERROR_IGNORE_ECONNRESET,
> >>>>> -    NGX_ERROR_IGNORE_EINVAL
> >>>>> +    NGX_ERROR_IGNORE_EINVAL,
> >>>>> +    NGX_ERROR_IGNORE_EMSGSIZE
> >>>>> } ngx_connection_log_error_e;
> >>>>> 
> >>>>> 
> >>>>> diff --git a/src/event/quic/ngx_event_quic.c b/src/event/quic/ngx_event_quic.c
> >>>>> --- a/src/event/quic/ngx_event_quic.c
> >>>>> +++ b/src/event/quic/ngx_event_quic.c
> >>>>> @@ -149,11 +149,6 @@ ngx_quic_apply_transport_params(ngx_conn
> >>>>>       ngx_log_error(NGX_LOG_INFO, c->log, 0,
> >>>>>                     "quic maximum packet size is invalid");
> >>>>>       return NGX_ERROR;
> >>>>> -
> >>>>> -    } else if (ctp->max_udp_payload_size > ngx_quic_max_udp_payload(c)) {
> >>>>> -        ctp->max_udp_payload_size = ngx_quic_max_udp_payload(c);
> >>>>> -        ngx_log_debug0(NGX_LOG_DEBUG_EVENT, c->log, 0,
> >>>>> -                       "quic client maximum packet size truncated");
> >>>>>   }
> >>>>> 
> >>>>>   if (ctp->active_connection_id_limit < 2) {
> >>>>> @@ -286,7 +281,7 @@ ngx_quic_new_connection(ngx_connection_t
> >>>>> 
> >>>>>   qc->path_validation.log = c->log;
> >>>>>   qc->path_validation.data = c;
> >>>>> -    qc->path_validation.handler = ngx_quic_path_validation_handler;
> >>>>> +    qc->path_validation.handler = ngx_quic_path_handler;
> >>>>> 
> >>>>>   qc->conf = conf;
> >>>>> 
> >>>>> @@ -297,7 +292,7 @@ ngx_quic_new_connection(ngx_connection_t
> >>>>>   ctp = &qc->ctp;
> >>>>> 
> >>>>>   /* defaults to be used before actual client parameters are received */
> >>>>> -    ctp->max_udp_payload_size = ngx_quic_max_udp_payload(c);
> >>>>> +    ctp->max_udp_payload_size = NGX_QUIC_MAX_UDP_PAYLOAD_SIZE;
> >>>>>   ctp->ack_delay_exponent = NGX_QUIC_DEFAULT_ACK_DELAY_EXPONENT;
> >>>>>   ctp->max_ack_delay = NGX_QUIC_DEFAULT_MAX_ACK_DELAY;
> >>>>>   ctp->active_connection_id_limit = 2;
> >>>>> diff --git a/src/event/quic/ngx_event_quic_ack.c b/src/event/quic/ngx_event_quic_ack.c
> >>>>> --- a/src/event/quic/ngx_event_quic_ack.c
> >>>>> +++ b/src/event/quic/ngx_event_quic_ack.c
> >>>>> @@ -229,6 +229,12 @@ ngx_quic_handle_ack_frame_range(ngx_conn
> >>>>> 
> >>>>>   qc = ngx_quic_get_connection(c);
> >>>>> 
> >>>>> +    if (ctx->level == ssl_encryption_application) {
> >>>>> +        if (ngx_quic_handle_path_mtu_ack(c, qc->path, min, max) != NGX_OK) {
> >>>>> +            return NGX_ERROR;
> >>>>> +        }
> >>>>> +    }
> >>>>> +
> >>>>>   st->max_pn = NGX_TIMER_INFINITE;
> >>>>>   found = 0;
> >>>>> 
> >>>>> diff --git a/src/event/quic/ngx_event_quic_connection.h b/src/event/quic/ngx_event_quic_connection.h
> >>>>> --- a/src/event/quic/ngx_event_quic_connection.h
> >>>>> +++ b/src/event/quic/ngx_event_quic_connection.h
> >>>>> @@ -66,6 +66,14 @@ typedef struct ngx_quic_keys_s        ng
> >>>>> #define ngx_quic_get_socket(c)               ((ngx_quic_socket_t *)((c)->udp))
> >>>>> 
> >>>>> 
> >>>>> +typedef enum {
> >>>>> +    NGX_QUIC_PATH_IDLE = 0,
> >>>>> +    NGX_QUIC_PATH_VALIDATING,
> >>>>> +    NGX_QUIC_PATH_WAITING,
> >>>>> +    NGX_QUIC_PATH_DISCOVERING_MTU
> >>>> 
> >>>> Nitpicking on too long name.
> >>>> What about NGX_QUIC_PATH_MTUD or something.
> >>> 
> >>> OK, renamed.
> >>> 
> >>>>> +} ngx_quic_path_state_e;
> >>>>> +
> >>>>> +
> >>>>> struct ngx_quic_client_id_s {
> >>>>>   ngx_queue_t                       queue;
> >>>>>   uint64_t                          seqnum;
> >>>>> @@ -89,18 +97,22 @@ struct ngx_quic_path_s {
> >>>>>   ngx_sockaddr_t                    sa;
> >>>>>   socklen_t                         socklen;
> >>>>>   ngx_quic_client_id_t             *cid;
> >>>>> +    ngx_quic_path_state_e             state;
> >>>>>   ngx_msec_t                        expires;
> >>>>>   ngx_uint_t                        tries;
> >>>>>   ngx_uint_t                        tag;
> >>>>> +    size_t                            mtu;
> >>>>> +    size_t                            mtud;
> >>>>> +    size_t                            max_mtu;
> >>>>>   off_t                             sent;
> >>>>>   off_t                             received;
> >>>>>   u_char                            challenge1[8];
> >>>>>   u_char                            challenge2[8];
> >>>>>   uint64_t                          seqnum;
> >>>>> +    uint64_t                          mtu_pnum[NGX_QUIC_PATH_RETRIES];
> >>>>>   ngx_str_t                         addr_text;
> >>>>>   u_char                            text[NGX_SOCKADDR_STRLEN];
> >>>>> -    unsigned                          validated:1;
> >>>>> -    unsigned                          validating:1;
> >>>>> +    ngx_uint_t                        validated; /* unsigned validated:1; */
> >>>>> };
> >>>>> 
> >>>>> 
> >>>>> diff --git a/src/event/quic/ngx_event_quic_migration.c b/src/event/quic/ngx_event_quic_migration.c
> >>>>> --- a/src/event/quic/ngx_event_quic_migration.c
> >>>>> +++ b/src/event/quic/ngx_event_quic_migration.c
> >>>>> @@ -10,6 +10,11 @@
> >>>>> #include <ngx_event_quic_connection.h>
> >>>>> 
> >>>>> 
> >>>>> +#define NGX_QUIC_PATH_MTU_DELAY       1000
> >>>> 
> >>>> The value looks too much,
> >>>> it would delay each subsequent probe (which is not retry).
> >>> 
> >>> Probably the right value should be determined in real-life scenarios.
> >> 
> >> I'm quite skeptical about delayed probes, it has little to no benefit
> >> for short connections with bulky transfers, as implemented, especially
> >> with short MTU steps.  OTOH, immediate probes can lead to congestion
> >> window starvation resulting in request processing slow down.
> >> 
> >> For the record,
> >> https://www.ietf.org/proceedings/62/pmtud.html provides a discussion
> >> about PMTUD, in particular, on search strategy and when to probe.
> >> For the latter, it seems to articulate an opposite view, i.e. to send
> >> probes ASAP.  Also, it raises a question how short lived connections
> >> could benefit from PMTUD, e.g. by caching path MTU, which seems to
> >> correspond to a shared PLPMTU state in RFC 8899.
> >> 
> >>> Let's decrease the value by a factor of 10 for now.
> >> 
> >> For a simple implementation, 100ms is a good enough trade-off, IMHO.
> >> 
> >>> 
> >>>>> +#define NGX_QUIC_PATH_MTU_STEP        256
> >>>>> +#define NGX_QUIC_PATH_MTU_PRECISION   16
> >>>>> +
> >>>>> +
> >>>>> static void ngx_quic_set_connection_path(ngx_connection_t *c,
> >>>>>   ngx_quic_path_t *path);
> >>>>> static ngx_int_t ngx_quic_validate_path(ngx_connection_t *c,
> >>>>> @@ -17,7 +22,15 @@ static ngx_int_t ngx_quic_validate_path(
> >>>>> static ngx_int_t ngx_quic_send_path_challenge(ngx_connection_t *c,
> >>>>>   ngx_quic_path_t *path);
> >>>>> static void ngx_quic_set_path_timer(ngx_connection_t *c);
> >>>>> +static ngx_int_t ngx_quic_expire_path_validation(ngx_connection_t *c,
> >>>>> +    ngx_quic_path_t *path);
> >>>>> +static ngx_int_t ngx_quic_expire_path_mtu_delay(ngx_connection_t *c,
> >>>>> +    ngx_quic_path_t *path);
> >>>>> +static ngx_int_t ngx_quic_expire_path_mtu_discovery(ngx_connection_t *c,
> >>>>> +    ngx_quic_path_t *path);
> >>>>> static ngx_quic_path_t *ngx_quic_get_path(ngx_connection_t *c, ngx_uint_t tag);
> >>>>> +static ngx_int_t ngx_quic_send_path_mtu_probe(ngx_connection_t *c,
> >>>>> +    ngx_quic_path_t *path);
> >>>>> 
> >>>>> 
> >>>>> ngx_int_t
> >>>>> @@ -97,7 +110,7 @@ ngx_quic_handle_path_response_frame(ngx_
> >>>>>   {
> >>>>>       path = ngx_queue_data(q, ngx_quic_path_t, queue);
> >>>>> 
> >>>>> -        if (!path->validating) {
> >>>>> +        if (path->state != NGX_QUIC_PATH_VALIDATING) {
> >>>>>           continue;
> >>>>>       }
> >>>>> 
> >>>>> @@ -136,6 +149,9 @@ valid:
> >>>>>       {
> >>>>>           /* address did not change */
> >>>>>           rst = 0;
> >>>>> +
> >>>>> +            path->mtu = prev->mtu;
> >>>>> +            path->max_mtu = prev->max_mtu;
> >>>>>       }
> >>>>>   }
> >>>>> 
> >>>>> @@ -167,9 +183,8 @@ valid:
> >>>>>   ngx_quic_path_dbg(c, "is validated", path);
> >>>>> 
> >>>>>   path->validated = 1;
> >>>>> -    path->validating = 0;
> >>>>> 
> >>>>> -    ngx_quic_set_path_timer(c);
> >>>>> +    ngx_quic_discover_path_mtu(c, path);
> >>>>> 
> >>>>>   return NGX_OK;
> >>>>> }
> >>>>> @@ -217,6 +232,8 @@ ngx_quic_new_path(ngx_connection_t *c,
> >>>>>   path->addr_text.len = ngx_sock_ntop(sockaddr, socklen, path->text,
> >>>>>                                       NGX_SOCKADDR_STRLEN, 1);
> >>>>> 
> >>>>> +    path->mtu = NGX_QUIC_MIN_INITIAL_SIZE;
> >>>>> +
> >>>>>   ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
> >>>>>                  "quic path seq:%uL created addr:%V",
> >>>>>                  path->seqnum, &path->addr_text);
> >>>>> @@ -464,7 +481,7 @@ ngx_quic_handle_migration(ngx_connection
> >>>>> 
> >>>>>   ngx_quic_set_connection_path(c, next);
> >>>>> 
> >>>>> -    if (!next->validated && !next->validating) {
> >>>>> +    if (!next->validated && next->state != NGX_QUIC_PATH_VALIDATING) {
> >>>>>       if (ngx_quic_validate_path(c, next) != NGX_OK) {
> >>>>>           return NGX_ERROR;
> >>>>>       }
> >>>>> @@ -492,7 +509,6 @@ ngx_quic_validate_path(ngx_connection_t 
> >>>>>   ngx_log_debug1(NGX_LOG_DEBUG_EVENT, c->log, 0,
> >>>>>                  "quic initiated validation of path seq:%uL", path->seqnum);
> >>>>> 
> >>>>> -    path->validating = 1;
> >>>>>   path->tries = 0;
> >>>>> 
> >>>>>   if (RAND_bytes(path->challenge1, 8) != 1) {
> >>>>> @@ -511,6 +527,7 @@ ngx_quic_validate_path(ngx_connection_t 
> >>>>>   pto = ngx_max(ngx_quic_pto(c, ctx), 1000);
> >>>>> 
> >>>>>   path->expires = ngx_current_msec + pto;
> >>>>> +    path->state = NGX_QUIC_PATH_VALIDATING;
> >>>>> 
> >>>>>   ngx_quic_set_path_timer(c);
> >>>>> 
> >>>>> @@ -558,6 +575,42 @@ ngx_quic_send_path_challenge(ngx_connect
> >>>>> }
> >>>>> 
> >>>>> 
> >>>>> +void
> >>>>> +ngx_quic_discover_path_mtu(ngx_connection_t *c, ngx_quic_path_t *path)
> >>>>> +{
> >>>>> +    ngx_quic_connection_t  *qc;
> >>>>> +
> >>>>> +    qc = ngx_quic_get_connection(c);
> >>>>> +
> >>>>> +    if (path->max_mtu) {
> >>>>> +        if (path->max_mtu - path->mtu <= NGX_QUIC_PATH_MTU_PRECISION) {
> >>>>> +            path->state = NGX_QUIC_PATH_IDLE;
> >>>>> +            ngx_quic_set_path_timer(c);
> >>>>> +            return;
> >>>>> +        }
> >>>>> +
> >>>>> +        path->mtud = (path->mtu + path->max_mtu) / 2;
> >>>>> +
> >>>>> +    } else {
> >>>>> +        path->mtud = path->mtu + NGX_QUIC_PATH_MTU_STEP;
> >>>> 
> >>>> I like this approach, compared to multiplying probe steps.
> >>>> This better fits to the pattern to keep around a well-known
> >>>> initial value, which is typically 1500 bytes.
> >>>> Another one could be to let user specify its own initial
> >>>> value for cases when he knows well its network environment.
> >>>> 
> >>>>> +
> >>>>> +        if (path->mtud >= qc->ctp.max_udp_payload_size) {
> >>>>> +            path->mtud = qc->ctp.max_udp_payload_size;
> >>>>> +            path->max_mtu = qc->ctp.max_udp_payload_size;
> >>>>> +        }
> >>>>> +    }
> >>>>> +
> >>>>> +    path->state = NGX_QUIC_PATH_WAITING;
> >>>>> +    path->expires = ngx_current_msec + NGX_QUIC_PATH_MTU_DELAY;
> >>>>> +
> >>>>> +    ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
> >>>>> +                   "quic path seq:%uL schedule mtu:%uz",
> >>>>> +                   path->seqnum, path->mtud);
> >>>>> +
> >>>>> +    ngx_quic_set_path_timer(c);
> >>>>> +}
> >>>>> +
> >>>>> +
> >>>>> static void
> >>>>> ngx_quic_set_path_timer(ngx_connection_t *c)
> >>>>> {
> >>>>> @@ -578,7 +631,7 @@ ngx_quic_set_path_timer(ngx_connection_t
> >>>>>   {
> >>>>>       path = ngx_queue_data(q, ngx_quic_path_t, queue);
> >>>>> 
> >>>>> -        if (!path->validating) {
> >>>>> +        if (path->state == NGX_QUIC_PATH_IDLE) {
> >>>>>           continue;
> >>>>>       }
> >>>>> 
> >>>>> @@ -600,22 +653,18 @@ ngx_quic_set_path_timer(ngx_connection_t
> >>>>> 
> >>>>> 
> >>>>> void
> >>>>> -ngx_quic_path_validation_handler(ngx_event_t *ev)
> >>>>> +ngx_quic_path_handler(ngx_event_t *ev)
> >>>>> {
> >>>>>   ngx_msec_t              now;
> >>>>>   ngx_queue_t            *q;
> >>>>> -    ngx_msec_int_t          left, next, pto;
> >>>>> -    ngx_quic_path_t        *path, *bkp;
> >>>>> +    ngx_msec_int_t          left;
> >>>>> +    ngx_quic_path_t        *path;
> >>>>>   ngx_connection_t       *c;
> >>>>> -    ngx_quic_send_ctx_t    *ctx;
> >>>>>   ngx_quic_connection_t  *qc;
> >>>>> 
> >>>>>   c = ev->data;
> >>>>>   qc = ngx_quic_get_connection(c);
> >>>>> 
> >>>>> -    ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
> >>>>> -
> >>>>> -    next = -1;
> >>>>>   now = ngx_current_msec;
> >>>>> 
> >>>>>   q = ngx_queue_head(&qc->paths);
> >>>>> @@ -625,83 +674,274 @@ ngx_quic_path_validation_handler(ngx_eve
> >>>>>       path = ngx_queue_data(q, ngx_quic_path_t, queue);
> >>>>>       q = ngx_queue_next(q);
> >>>>> 
> >>>>> -        if (!path->validating) {
> >>>>> +        if (path->state == NGX_QUIC_PATH_IDLE) {
> >>>>>           continue;
> >>>>>       }
> >>>>> 
> >>>>>       left = path->expires - now;
> >>>>> 
> >>>>>       if (left > 0) {
> >>>>> -
> >>>>> -            if (next == -1 || left < next) {
> >>>>> -                next = left;
> >>>>> -            }
> >>>>> -
> >>>>> -            continue;
> >>>>> -        }
> >>>>> -
> >>>>> -        if (++path->tries < NGX_QUIC_PATH_RETRIES) {
> >>>>> -            pto = ngx_max(ngx_quic_pto(c, ctx), 1000) << path->tries;
> >>>>> -
> >>>>> -            path->expires = ngx_current_msec + pto;
> >>>>> -
> >>>>> -            if (next == -1 || pto < next) {
> >>>>> -                next = pto;
> >>>>> -            }
> >>>>> -
> >>>>> -            /* retransmit */
> >>>>> -            (void) ngx_quic_send_path_challenge(c, path);
> >>>>> -
> >>>>>           continue;
> >>>>>       }
> >>>>> 
> >>>>> -        ngx_log_debug1(NGX_LOG_DEBUG_EVENT, ev->log, 0,
> >>>>> -                       "quic path seq:%uL validation failed", path->seqnum);
> >>>>> +        switch (path->state) {
> >>>>> +        case NGX_QUIC_PATH_VALIDATING:
> >>>>> +             if (ngx_quic_expire_path_validation(c, path) != NGX_OK) {
> >>>>> +                goto failed;
> >>>>> +            }
> >>>>> +
> >>>>> +            break;
> >>>>> +
> >>>>> +        case NGX_QUIC_PATH_WAITING:
> >>>>> +            if (ngx_quic_expire_path_mtu_delay(c, path) != NGX_OK) {
> >>>>> +                goto failed;
> >>>>> +            }
> >>>>> +
> >>>>> +            break;
> >>>>> +
> >>>>> +        case NGX_QUIC_PATH_DISCOVERING_MTU:
> >>>>> +            if (ngx_quic_expire_path_mtu_discovery(c, path) != NGX_OK) {
> >>>>> +                goto failed;
> >>>>> +             }
> >>>>> +
> >>>>> +            break;
> >>>>> +
> >>>> 
> >>>> indentation in several places here
> >>> 
> >>> Fixed, thanks.
> >>> 
> >>>>> +        default:
> >>>>> +            break;
> >>>>> +         }
> >>>>> +     }
> >>>>> +
> >>>>> +    ngx_quic_set_path_timer(c);
> >>>>> +
> >>>>> +    return;
> >>>>> 
> >>>>> -        /* found expired path */
> >>>>> +failed:
> >>>>> +
> >>>>> +    ngx_quic_close_connection(c, NGX_ERROR);
> >>>>> +}
> >>>>> +
> >>>>> +
> >>>>> +static ngx_int_t
> >>>>> +ngx_quic_expire_path_validation(ngx_connection_t *c, ngx_quic_path_t *path)
> >>>>> +{
> >>>>> +    ngx_msec_int_t          pto;
> >>>>> +    ngx_quic_path_t        *bkp;
> >>>>> +    ngx_quic_send_ctx_t    *ctx;
> >>>>> +    ngx_quic_connection_t  *qc;
> >>>>> 
> >>>>> -        path->validated = 0;
> >>>>> -        path->validating = 0;
> >>>>> +    qc = ngx_quic_get_connection(c);
> >>>>> +    ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
> >>>>> +
> >>>>> +    if (++path->tries < NGX_QUIC_PATH_RETRIES) {
> >>>>> +        pto = ngx_max(ngx_quic_pto(c, ctx), 1000) << path->tries;
> >>>>> +        path->expires = ngx_current_msec + pto;
> >>>>> +
> >>>>> +        (void) ngx_quic_send_path_challenge(c, path);
> >>>>> +
> >>>>> +        return NGX_OK;
> >>>>> +    }
> >>>>> +
> >>>>> +    ngx_log_debug1(NGX_LOG_DEBUG_EVENT, c->log, 0,
> >>>>> +                   "quic path seq:%uL validation failed", path->seqnum);
> >>>>> +
> >>>>> +    /* found expired path */
> >>>>> +
> >>>>> +    path->validated = 0;
> >>>>> 
> >>>>> 
> >>>>> -        /* RFC 9000, 9.3.2.  On-Path Address Spoofing
> >>>>> -         *
> >>>>> -         * To protect the connection from failing due to such a spurious
> >>>>> -         * migration, an endpoint MUST revert to using the last validated
> >>>>> -         * peer address when validation of a new peer address fails.
> >>>>> -         */
> >>>>> -
> >>>>> -        if (qc->path == path) {
> >>>>> -            /* active path validation failed */
> >>>>> -
> >>>>> -            bkp = ngx_quic_get_path(c, NGX_QUIC_PATH_BACKUP);
> >>>>> +    /* RFC 9000, 9.3.2.  On-Path Address Spoofing
> >>>>> +     *
> >>>>> +     * To protect the connection from failing due to such a spurious
> >>>>> +     * migration, an endpoint MUST revert to using the last validated
> >>>>> +     * peer address when validation of a new peer address fails.
> >>>>> +     */
> >>>>> 
> >>>>> -            if (bkp == NULL) {
> >>>>> -                qc->error = NGX_QUIC_ERR_NO_VIABLE_PATH;
> >>>>> -                qc->error_reason = "no viable path";
> >>>>> -                ngx_quic_close_connection(c, NGX_ERROR);
> >>>>> -                return;
> >>>>> -            }
> >>>>> +    if (qc->path == path) {
> >>>>> +        /* active path validation failed */
> >>>>> +
> >>>>> +        bkp = ngx_quic_get_path(c, NGX_QUIC_PATH_BACKUP);
> >>>>> 
> >>>>> -            qc->path = bkp;
> >>>>> -            qc->path->tag = NGX_QUIC_PATH_ACTIVE;
> >>>>> -
> >>>>> -            ngx_quic_set_connection_path(c, qc->path);
> >>>>> -
> >>>>> -            ngx_log_error(NGX_LOG_INFO, c->log, 0,
> >>>>> -                          "quic path seq:%uL addr:%V is restored from backup",
> >>>>> -                          qc->path->seqnum, &qc->path->addr_text);
> >>>>> -
> >>>>> -            ngx_quic_path_dbg(c, "is active", qc->path);
> >>>>> +        if (bkp == NULL) {
> >>>>> +            qc->error = NGX_QUIC_ERR_NO_VIABLE_PATH;
> >>>>> +            qc->error_reason = "no viable path";
> >>>>> +            return NGX_ERROR;
> >>>>>       }
> >>>>> 
> >>>>> -        if (ngx_quic_free_path(c, path) != NGX_OK) {
> >>>>> -            ngx_quic_close_connection(c, NGX_ERROR);
> >>>>> -            return;
> >>>>> +        qc->path = bkp;
> >>>>> +        qc->path->tag = NGX_QUIC_PATH_ACTIVE;
> >>>>> +
> >>>>> +        ngx_quic_set_connection_path(c, qc->path);
> >>>>> +
> >>>>> +        ngx_log_error(NGX_LOG_INFO, c->log, 0,
> >>>>> +                      "quic path seq:%uL addr:%V is restored from backup",
> >>>>> +                      qc->path->seqnum, &qc->path->addr_text);
> >>>>> +
> >>>>> +        ngx_quic_path_dbg(c, "is active", qc->path);
> >>>>> +    }
> >>>>> +
> >>>>> +    return ngx_quic_free_path(c, path);
> >>>>> +}
> >>>>> +
> >>>>> +
> >>>>> +static ngx_int_t
> >>>>> +ngx_quic_expire_path_mtu_delay(ngx_connection_t *c, ngx_quic_path_t *path)
> >>>>> +{
> >>>>> +    ngx_int_t               rc;
> >>>>> +    ngx_uint_t              i;
> >>>>> +    ngx_msec_t              pto;
> >>>>> +    ngx_quic_send_ctx_t    *ctx;
> >>>>> +    ngx_quic_connection_t  *qc;
> >>>>> +
> >>>>> +    qc = ngx_quic_get_connection(c);
> >>>>> +    ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
> >>>>> +
> >>>>> +    path->tries = 0;
> >>>>> +
> >>>>> +    for ( ;; ) {
> >>>>> +
> >>>>> +        for (i = 0; i < NGX_QUIC_PATH_RETRIES; i++) {
> >>>>> +            path->mtu_pnum[i] = NGX_QUIC_UNSET_PN;
> >>>>> +        }
> >>>>> +
> >>>>> +        rc = ngx_quic_send_path_mtu_probe(c, path);
> >>>>> +
> >>>>> +        if (rc == NGX_ERROR) {
> >>>>> +            return NGX_ERROR;
> >>>>> +        }
> >>>>> +
> >>>>> +        if (rc == NGX_OK) {
> >>>>> +            pto = ngx_quic_pto(c, ctx);
> >>>>> +            path->expires = ngx_current_msec + pto;
> >>>>> +            path->state = NGX_QUIC_PATH_DISCOVERING_MTU;
> >>>>> +            return NGX_OK;
> >>>>> +        }
> >>>>> +
> >>>>> +        /* rc == NGX_DECLINED */
> >>>>> +
> >>>>> +        path->max_mtu = path->mtud;
> >>>>> +
> >>>>> +        if (path->max_mtu - path->mtu <= NGX_QUIC_PATH_MTU_PRECISION) {
> >>>>> +            path->state = NGX_QUIC_PATH_IDLE;
> >>>>> +            return NGX_OK;
> >>>>> +        }
> >>>>> +
> >>>>> +        path->mtud = (path->mtu + path->max_mtu) / 2;
> >>>>> +    }
> >>>>> +}
> >>>>> +
> >>>>> +
> >>>>> +static ngx_int_t
> >>>>> +ngx_quic_expire_path_mtu_discovery(ngx_connection_t *c, ngx_quic_path_t *path)
> >>>>> +{
> >>>>> +    ngx_int_t               rc;
> >>>>> +    ngx_msec_int_t          pto;
> >>>>> +    ngx_quic_send_ctx_t    *ctx;
> >>>>> +    ngx_quic_connection_t  *qc;
> >>>>> +
> >>>>> +    qc = ngx_quic_get_connection(c);
> >>>>> +    ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
> >>>>> +
> >>>>> +    if (++path->tries < NGX_QUIC_PATH_RETRIES) {
> >>>>> +        pto = ngx_quic_pto(c, ctx) << path->tries;
> >>>>> +        path->expires = ngx_current_msec + pto;
> >>>>> +
> >>>> 
> >>>> Setting path->expires there seems useless.
> >>>> It don't seem to be used in ngx_quic_send_path_mtu_probe(),
> >>>> and ngx_quic_discover_path_mtu() used to reset path->expires.
> >>> 
> >>> If ngx_quic_send_path_mtu_probe() below returns NGX_OK, the path should stay
> >>> in the same state, but expiration time should be advanced.  To make it
> >>> more obvious, I changed this code.
> >> 
> >> Updated code makes it clear, thanks.
> >> 
> >>> 
> >>>>> +        rc = ngx_quic_send_path_mtu_probe(c, path);
> >>>>> +        if (rc != NGX_DECLINED) {
> >>>>> +            return rc;
> >>>>>       }
> >>>>>   }
> >>>>> 
> >>>>> -    if (next != -1) {
> >>>>> -        ngx_add_timer(&qc->path_validation, next);
> >>>>> +    ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
> >>>>> +                   "quic path seq:%uL expired mtu:%uz",
> >>>>> +                   path->seqnum, path->mtud);
> >>>>> +
> >>>>> +    path->max_mtu = path->mtud;
> >>>>> +
> >>>>> +    ngx_quic_discover_path_mtu(c, path);
> >>>>> +
> >>>>> +    return NGX_OK;
> >>>>> +}
> >>>>> +
> >>>>> +
> >>>>> +static ngx_int_t
> >>>>> +ngx_quic_send_path_mtu_probe(ngx_connection_t *c, ngx_quic_path_t *path)
> >>>>> +{
> >>>>> +    ngx_int_t               rc;
> >>>>> +    ngx_uint_t              log_error;
> >>>>> +    ngx_quic_frame_t        frame;
> >>>>> +    ngx_quic_send_ctx_t    *ctx;
> >>>>> +    ngx_quic_connection_t  *qc;
> >>>>> +
> >>>>> +    ngx_memzero(&frame, sizeof(ngx_quic_frame_t));
> >>>>> +
> >>>>> +    frame.level = ssl_encryption_application;
> >>>>> +    frame.type = NGX_QUIC_FT_PING;
> >>>>> +
> >>>>> +    qc = ngx_quic_get_connection(c);
> >>>>> +    ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
> >>>>> +    path->mtu_pnum[path->tries] = ctx->pnum;
> >>>>> +
> >>>>> +    ngx_log_debug4(NGX_LOG_DEBUG_EVENT, c->log, 0,
> >>>>> +                   "quic path seq:%uL send probe "
> >>>>> +                   "mtu:%uz pnum:%uL tries:%ui",
> >>>>> +                   path->seqnum, path->mtud, ctx->pnum, path->tries);
> >>>>> +
> >>>>> +    log_error = c->log_error;
> >>>>> +    c->log_error = NGX_ERROR_IGNORE_EMSGSIZE;
> >>>>> +
> >>>>> +    rc = ngx_quic_frame_sendto(c, &frame, path->mtud, path);
> >>>>> +    c->log_error = log_error;
> >>>>> +
> >>>>> +    if (rc == NGX_ERROR) {
> >>>>> +        if (c->write->error) {
> >>>>> +            c->write->error = 0;
> >>>>> +
> >>>>> +            ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
> >>>>> +                           "quic path seq:%uL rejected mtu:%uz",
> >>>>> +                           path->seqnum, path->mtud);
> >>>>> +
> >>>>> +            return NGX_DECLINED;
> >>>> 
> >>>> The error handling looks awkward to decline on write error.
> >>>> ngx_sendmsg() sets write error on fatal sendmsg errors,
> >>>> and keeps it unset NGX_EAGAIN and NGX_EINTR, which
> >>>> are typically non-fatal transient errors.
> >>>> I understand this is done mostly to ignore EMSGSIZE,
> >>>> maybe it makes sense to ignore the rest sendmsg() errors.
> >>> 
> >>> The problem is, the error doesn't make it to this point.  All be have here is
> >>> NGX_ERROR and c->write->error.  I think, for PMTUD purposes we can safely
> >>> ignore the errors.  It's hinghly likely, the error will repeat while doing
> >>> connection I/O and we'll shortly close the connection.
> >> 
> >> Ok, I re-read ngx_sendmsg() to clarify the following:
> >> - NGX_EINTR is handled internally and never returned up to here;
> >> - NGX_EAGAIN (NGX_AGAIN) seems to be returned, at least on Linux,
> >>  when send buffer is low for the MTU probe, converted to NGX_OK;
> >> - FTR, looking at sosend_dgram() on FreeBSD, sendmsg seems to
> >>  return EMSGSIZE there, instead, on low socket buffer space;
> >> - other sendmsg errors are returned as NGX_ERROR, and
> >>  c->write->error is set; this is converted to NGX_DECLINED;
> >> - non-sendmsg NGX_ERRORs appeared in ngx_quic_frame_sendto()
> >>  are passed as is without conversion.
> >> 
> >> So, the error handling as implemented to decline on any sendmsg
> >> "fatal" errors actually makes sense, if we'd like to avoid rewriting
> >> ngx_sendmsg() to further distinguish sendmsg errors, notably EMSGSIZE.
> >> The downside is that connection may ignore actually fatal errors,
> >> though the error will likely repeat on sending non-probing frames.
> >> 
> >> NGX_AGAIN conversion to NGX_OK is questionable, because this
> >> makes a probe considered successfully sent while it is not.
> >> I wonder though what we can do about that other than to simply
> >> ignore, similar to other ngx_quic_frame_sendto() callers,
> >> because next time on a probe retry sendmsg can return success.
> > 
> > Ignoring certain send error for probes is not a big deal since timeout
> > expiration will lead to the same result eventually.
> > 
> >>>>> +        }
> >>>>> +
> >>>>> +        return NGX_ERROR;
> >>>>>   }
> >>>>> +
> >>>>> +    return NGX_OK;
> >>>>> }
> >>>>> +
> >>>>> +
> >>>>> +ngx_int_t
> >>>>> +ngx_quic_handle_path_mtu_ack(ngx_connection_t *c, ngx_quic_path_t *path,
> >>>>> +    uint64_t min, uint64_t max)
> >>>>> +{
> >>>>> +    uint64_t    pnum;
> >>>>> +    ngx_uint_t  i;
> >>>>> +
> >>>>> +    if (path->state != NGX_QUIC_PATH_DISCOVERING_MTU) {
> >>>>> +        return NGX_OK;
> >>>>> +    }
> >>>>> +
> >>>>> +    for (i = 0; i < NGX_QUIC_PATH_RETRIES; i++) {
> >>>>> +        pnum = path->mtu_pnum[i];
> >>>>> +
> >>>>> +        if (pnum == NGX_QUIC_UNSET_PN) {
> >>>>> +            break;
> >>>>> +        }
> >>>>> +
> >>>>> +        if (pnum < min || pnum > max) {
> >>>>> +            continue;
> >>>>> +        }
> >>>>> +
> >>>>> +        path->mtu = path->mtud;
> >>>>> +
> >>>>> +        ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
> >>>>> +                       "quic path seq:%uL ack mtu:%uz",
> >>>>> +                       path->seqnum, path->mtu);
> >>>>> +
> >>>>> +        ngx_quic_discover_path_mtu(c, path);
> >>>>> +
> >>>>> +        break;
> >>>> 
> >>>> What if two probes appeared ACKed in one range.
> >>>> Though that seems to be safe to break there, as two
> >>>> probes barely have different probe sizes, rather retries.
> >>> 
> >>> No matter how many probes are acked, the next step is still the same.
> >>> So there's no reason to find all of them.
> >>> 
> >> 
> >> Ok, let's leave it as is.
> >> 
> >>>>> +    }
> >>>>> +
> >>>>> +    return NGX_OK;
> >>>>> +}
> >>>>> diff --git a/src/event/quic/ngx_event_quic_migration.h b/src/event/quic/ngx_event_quic_migration.h
> >>>>> --- a/src/event/quic/ngx_event_quic_migration.h
> >>>>> +++ b/src/event/quic/ngx_event_quic_migration.h
> >>>>> @@ -18,10 +18,10 @@
> >>>>> #define NGX_QUIC_PATH_BACKUP    2
> >>>>> 
> >>>>> #define ngx_quic_path_dbg(c, msg, path)                                       \
> >>>>> -    ngx_log_debug6(NGX_LOG_DEBUG_EVENT, c->log, 0,                            \
> >>>>> -                   "quic path seq:%uL %s sent:%O recvd:%O state:%s%s",        \
> >>>>> +    ngx_log_debug7(NGX_LOG_DEBUG_EVENT, c->log, 0,                            \
> >>>>> +                   "quic path seq:%uL %s tx:%O rx:%O valid:%d st:%d mtu:%uz", \
> >>>>>                  path->seqnum, msg, path->sent, path->received,             \
> >>>>> -                   path->validated ? "V": "N", path->validating ? "R": "");
> >>>>> +                   path->validated, path->state, path->mtu);
> >>>>> 
> >>>>> ngx_int_t ngx_quic_handle_path_challenge_frame(ngx_connection_t *c,
> >>>>>   ngx_quic_header_t *pkt, ngx_quic_path_challenge_frame_t *f);
> >>>>> @@ -36,6 +36,10 @@ ngx_int_t ngx_quic_set_path(ngx_connecti
> >>>>> ngx_int_t ngx_quic_handle_migration(ngx_connection_t *c,
> >>>>>   ngx_quic_header_t *pkt);
> >>>>> 
> >>>>> -void ngx_quic_path_validation_handler(ngx_event_t *ev);
> >>>>> +void ngx_quic_path_handler(ngx_event_t *ev);
> >>>>> +
> >>>>> +void ngx_quic_discover_path_mtu(ngx_connection_t *c, ngx_quic_path_t *path);
> >>>>> +ngx_int_t ngx_quic_handle_path_mtu_ack(ngx_connection_t *c,
> >>>>> +    ngx_quic_path_t *path, uint64_t min, uint64_t max);
> >>>> 
> >>>> Nitpicking:
> >>>> maybe just ngx_quic_handle_path_mtu() ?
> >>>> This would match ngx_quic_discover_path_mtu() naming.
> >>> 
> >>> OK, renamed.
> >>> 
> >>>>> 
> >>>>> #endif /* _NGX_EVENT_QUIC_MIGRATION_H_INCLUDED_ */
> >>>>> diff --git a/src/event/quic/ngx_event_quic_output.c b/src/event/quic/ngx_event_quic_output.c
> >>>>> --- a/src/event/quic/ngx_event_quic_output.c
> >>>>> +++ b/src/event/quic/ngx_event_quic_output.c
> >>>>> @@ -10,9 +10,6 @@
> >>>>> #include <ngx_event_quic_connection.h>
> >>>>> 
> >>>>> 
> >>>>> -#define NGX_QUIC_MAX_UDP_PAYLOAD_OUT   1252
> >>>>> -#define NGX_QUIC_MAX_UDP_PAYLOAD_OUT6  1232
> >>>>> -
> >>>>> #define NGX_QUIC_MAX_UDP_SEGMENT_BUF  65487 /* 65K - IPv6 header */
> >>>>> #define NGX_QUIC_MAX_SEGMENTS            64 /* UDP_MAX_SEGMENTS */
> >>>>> 
> >>>>> @@ -61,21 +58,6 @@ static size_t ngx_quic_path_limit(ngx_co
> >>>>>   size_t size);
> >>>>> 
> >>>>> 
> >>>>> -size_t
> >>>>> -ngx_quic_max_udp_payload(ngx_connection_t *c)
> >>>>> -{
> >>>>> -    /* TODO: path MTU discovery */
> >>>>> -
> >>>>> -#if (NGX_HAVE_INET6)
> >>>>> -    if (c->sockaddr->sa_family == AF_INET6) {
> >>>>> -        return NGX_QUIC_MAX_UDP_PAYLOAD_OUT6;
> >>>>> -    }
> >>>>> -#endif
> >>>>> -
> >>>>> -    return NGX_QUIC_MAX_UDP_PAYLOAD_OUT;
> >>>>> -}
> >>>>> -
> >>>>> -
> >>>>> ngx_int_t
> >>>>> ngx_quic_output(ngx_connection_t *c)
> >>>>> {
> >>>>> @@ -142,10 +124,7 @@ ngx_quic_create_datagrams(ngx_connection
> >>>>> 
> >>>>>       p = dst;
> >>>>> 
> >>>>> -        len = ngx_min(qc->ctp.max_udp_payload_size,
> >>>>> -                      NGX_QUIC_MAX_UDP_PAYLOAD_SIZE);
> >>>>> -
> >>>>> -        len = ngx_quic_path_limit(c, path, len);
> >>>>> +        len = ngx_quic_path_limit(c, path, path->mtu);
> >>>>> 
> >>>>>       pad = ngx_quic_get_padding_level(c);
> >>>>> 
> >>>>> @@ -299,9 +278,7 @@ ngx_quic_allow_segmentation(ngx_connecti
> >>>>>   ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
> >>>>> 
> >>>>>   bytes = 0;
> >>>>> -
> >>>>> -    len = ngx_min(qc->ctp.max_udp_payload_size,
> >>>>> -                  NGX_QUIC_MAX_UDP_SEGMENT_BUF);
> >>>>> +    len = ngx_min(qc->path->mtu, NGX_QUIC_MAX_UDP_SEGMENT_BUF);
> >>>>> 
> >>>>>   for (q = ngx_queue_head(&ctx->frames);
> >>>>>        q != ngx_queue_sentinel(&ctx->frames);
> >>>>> @@ -345,8 +322,7 @@ ngx_quic_create_segments(ngx_connection_
> >>>>>       return NGX_ERROR;
> >>>>>   }
> >>>>> 
> >>>>> -    segsize = ngx_min(qc->ctp.max_udp_payload_size,
> >>>>> -                      NGX_QUIC_MAX_UDP_SEGMENT_BUF);
> >>>>> +    segsize = ngx_min(path->mtu, NGX_QUIC_MAX_UDP_SEGMENT_BUF);
> >>>>>   p = dst;
> >>>>>   end = dst + sizeof(dst);
> >>>>> 
> >>>>> diff --git a/src/event/quic/ngx_event_quic_output.h b/src/event/quic/ngx_event_quic_output.h
> >>>>> --- a/src/event/quic/ngx_event_quic_output.h
> >>>>> +++ b/src/event/quic/ngx_event_quic_output.h
> >>>>> @@ -12,8 +12,6 @@
> >>>>> #include <ngx_core.h>
> >>>>> 
> >>>>> 
> >>>>> -size_t ngx_quic_max_udp_payload(ngx_connection_t *c);
> >>>>> -
> >>>>> ngx_int_t ngx_quic_output(ngx_connection_t *c);
> >>>>> 
> >>>>> ngx_int_t ngx_quic_negotiate_version(ngx_connection_t *c,
> >>>>> diff --git a/src/event/quic/ngx_event_quic_ssl.c b/src/event/quic/ngx_event_quic_ssl.c
> >>>>> --- a/src/event/quic/ngx_event_quic_ssl.c
> >>>>> +++ b/src/event/quic/ngx_event_quic_ssl.c
> >>>>> @@ -494,6 +494,8 @@ ngx_quic_crypto_input(ngx_connection_t *
> >>>>>    */
> >>>>>   ngx_quic_discard_ctx(c, ssl_encryption_handshake);
> >>>>> 
> >>>>> +    ngx_quic_discover_path_mtu(c, qc->path);
> >>>>> +
> >>>>>   /* start accepting clients on negotiated number of server ids */
> >>>>>   if (ngx_quic_create_sockets(c) != NGX_OK) {
> >>>>>       return NGX_ERROR;
> >>>>> diff --git a/src/os/unix/ngx_errno.h b/src/os/unix/ngx_errno.h
> >>>>> --- a/src/os/unix/ngx_errno.h
> >>>>> +++ b/src/os/unix/ngx_errno.h
> >>>>> @@ -54,6 +54,7 @@ typedef int               ngx_err_t;
> >>>>> #define NGX_ENOMOREFILES  0
> >>>>> #define NGX_ELOOP         ELOOP
> >>>>> #define NGX_EBADF         EBADF
> >>>>> +#define NGX_EMSGSIZE      EMSGSIZE
> >>>>> 
> >>>>> #if (NGX_HAVE_OPENAT)
> >>>>> #define NGX_EMLINK        EMLINK
> >> 
> >> This will break on win32, an NGX_EMSGSIZE definition needs to be
> >> provided there, which is WSAEMSGSIZE.
> > 
> > Thanks, fixed.
> > 
> >>> 
> >>> The diff is attached.
> >>> 
> >> 
> >> The diff looks good.
> > 
> > Also, with step-based mtu step updates, QUIC flood detection is triggered with
> > the h3_keepalive.t on interfaces with large mtu.  I think, doubling mtu
> > is a safer choice.
> 
> While optimizing for theoretical maximum is not something to accent,
> it is certainly expected to work well on various links.  Below are some
> numbers to consider, taken on various PMTU and interval/search strategy.
> 
> = 20 streams, 1mb response each, pmtu=65536 (sampled 3 times)
> 
> pn/ack  probes  last    total bytes
> 
> == 1000ms + 256
> 1       21      6.320   79.472
> 2       18      5.808   65.376
> 3       17      5.552   59.568
> 4       15      2.400   33.984
> 
> == 100ms + 256
> 1       263     65.500  8.962.971
> 2       256     65.500  8.708.555
> 3       268     65.500  9.188.363
> 
> == 100ms * 2
> 1       17-24   65.513    833.607-1.286.679
> 2       23-24   65.513  1.190.607-1.255.710
> 3       20-23   65.513    996.252-1.198.820
> 4       18      65.513    872.007
> 
> It is clearly seen from the above:
> - "1000ms + 256" (baseline) does not converge after 20 MB
> - "100ms + 256" converges but hits flood detection limit,
>   which is 8 MB of probes if no stream traffic in the connection.
> - "100ms * 2" results in ~ 1 MB of total probe traffic
> 
> This explains how h3_keepalive.t is failed.
> 
> 
> Below "100ms * 2" is also seen with better avg payload size.
> Compared with pto and ack-only packets cleared from raw data,
> seen increased with "100ms * 2" if client ack's with frequency
> less than on every second packet, which messed up avg stats.
> 
> = 20 streams, 1mb response each, pmtu=65536 (sampled 3 times)
> 
> pn/ack  pn      stat
> 
> == 1000ms + 256
> 1       N=8245  min=3  max=6033  m=2437 avg=2552  std=1750
> 2       N=8481  min=3  max=5733  m=2296 avg=2486  std=1453
> 3       N=8703  min=3  max=5517  m=2189 avg=2424  std=1361
> 4       N=11237 min=3 max=2349  m=2349 avg=1879   std=770
> 
> == 100ms + 256
> 1       N=3757  min=3 max=18573 m=3494 avg=5440   std=5226
> 2       N=3696  min=3 max=17037 m=4493 avg=5686   std=4475
> 3       N=3745  min=3 max=16525 m=4493 avg=5657   std=4308
> 
> == 100ms * 2
> 1       N=1956  min=3  max=65465 m=1165 avg=10757 std=23394
> 2       N=1150  min=3  max=65465 m=2320 avg=18264 std=26491
> 3       N=1411  min=3  max=65465 m=1165 avg=14879 std=25476
> 4       N=1691  min=3  max=65465 m=548  avg=12419 std=23854
> == after clearing pto and ack-only
> 1*      N=1910  min=9  max=65465 m=1165 avg=11014 std=23612
> 2*      N=1095  min=10 max=65465 m=2365 avg=19181 std=26823
> 3*      N=1117  min=12 max=65465 m=2193 avg=18798 std=27320
> 4*      N=1145  min=9  max=65465 m=2365 avg=18333 std=27049
> 
> As seen from the above, "100ms * 2" represents worse median,
> even with pto/ack-only cleared, which may be a sign of poor
> packing to maximum packet size, resulting in short packets.
> 
> 
> On a typical PMTU of up to 1500, the difference is leveled
> (captured with pn/ack=2)
> 
> = 20 streams, 1mb response each, pmtu=1500 (sampled 3 times)
> 
> x 1000_256
> + 100_256
> * 100_x2
>     N           Min           Max        Median           Avg        Stddev
> x 16443             3          1437          1437     1283.7392     414.27811
> + 16377             3          1437          1437     1288.8831     412.92124
> No difference proven at 95.0% confidence
> * 16398             3          1436          1436     1287.2667     413.79561
> No difference proven at 95.0% confidence
> 
> pn/ack  probes  last    total bytes
> == 1000ms + 256
> 2       6       1472    9232
> == 100ms + 256
> 2       6       1472    9232
> == 100ms * 2
> 2       8       1471    12889
> 
> > 
> > Also, while looking in migration test logs, I thought it might be a good idea to
> > only handle validation/mtu events for the currently active path.
> > 
> 
> Can you elaborate?

I had another patch applied while testing pmtud and had the migration test
failed.  I think, we can in future simplify path event handling by only handin
the active path.  But we have to be careful.

> > Including 3 patch updates:
> > 
> > - the last one + win32 fix + format fix in ngx_quic_path_dbg()
> > - x2 mtu

Merged these two updates.  Final patch attached.

> > - single-path event handling

Let's postpone this one for now.

> > --
> > Roman Arutyunyan
> > <quic-pmtud-fix.txt><quic-pmtud-scale-x2.txt><quic-pmtud-fix-2.txt>_______________________________________________
> > nginx-devel mailing list
> > nginx-devel at nginx.org
> > https://mailman.nginx.org/mailman/listinfo/nginx-devel
> 
> -- 
> Sergey Kandaurov
> _______________________________________________
> nginx-devel mailing list
> nginx-devel at nginx.org
> https://mailman.nginx.org/mailman/listinfo/nginx-devel

--
Roman Arutyunyan
-------------- next part --------------
# HG changeset patch
# User Roman Arutyunyan <arut at nginx.com>
# Date 1691990487 -14400
#      Mon Aug 14 09:21:27 2023 +0400
# Node ID 58afcd72446ff33811e773f1cabb7866a92a09a0
# Parent  f3412ec3b6d146f091acee2c8cf44e39ed0fbe26
QUIC: path MTU discovery.

MTU selection starts by doubling the initial MTU until the first failure.
Then binary search is used to find the path MTU.

diff --git a/src/core/ngx_connection.c b/src/core/ngx_connection.c
--- a/src/core/ngx_connection.c
+++ b/src/core/ngx_connection.c
@@ -1583,6 +1583,10 @@ ngx_connection_error(ngx_connection_t *c
     }
 #endif
 
+    if (err == NGX_EMSGSIZE && c->log_error == NGX_ERROR_IGNORE_EMSGSIZE) {
+        return 0;
+    }
+
     if (err == 0
         || err == NGX_ECONNRESET
 #if (NGX_WIN32)
@@ -1600,6 +1604,7 @@ ngx_connection_error(ngx_connection_t *c
     {
         switch (c->log_error) {
 
+        case NGX_ERROR_IGNORE_EMSGSIZE:
         case NGX_ERROR_IGNORE_EINVAL:
         case NGX_ERROR_IGNORE_ECONNRESET:
         case NGX_ERROR_INFO:
diff --git a/src/core/ngx_connection.h b/src/core/ngx_connection.h
--- a/src/core/ngx_connection.h
+++ b/src/core/ngx_connection.h
@@ -97,7 +97,8 @@ typedef enum {
     NGX_ERROR_ERR,
     NGX_ERROR_INFO,
     NGX_ERROR_IGNORE_ECONNRESET,
-    NGX_ERROR_IGNORE_EINVAL
+    NGX_ERROR_IGNORE_EINVAL,
+    NGX_ERROR_IGNORE_EMSGSIZE
 } ngx_connection_log_error_e;
 
 
diff --git a/src/event/quic/ngx_event_quic.c b/src/event/quic/ngx_event_quic.c
--- a/src/event/quic/ngx_event_quic.c
+++ b/src/event/quic/ngx_event_quic.c
@@ -149,11 +149,6 @@ ngx_quic_apply_transport_params(ngx_conn
         ngx_log_error(NGX_LOG_INFO, c->log, 0,
                       "quic maximum packet size is invalid");
         return NGX_ERROR;
-
-    } else if (ctp->max_udp_payload_size > ngx_quic_max_udp_payload(c)) {
-        ctp->max_udp_payload_size = ngx_quic_max_udp_payload(c);
-        ngx_log_debug0(NGX_LOG_DEBUG_EVENT, c->log, 0,
-                       "quic client maximum packet size truncated");
     }
 
     if (ctp->active_connection_id_limit < 2) {
@@ -286,7 +281,7 @@ ngx_quic_new_connection(ngx_connection_t
 
     qc->path_validation.log = c->log;
     qc->path_validation.data = c;
-    qc->path_validation.handler = ngx_quic_path_validation_handler;
+    qc->path_validation.handler = ngx_quic_path_handler;
 
     qc->conf = conf;
 
@@ -297,7 +292,7 @@ ngx_quic_new_connection(ngx_connection_t
     ctp = &qc->ctp;
 
     /* defaults to be used before actual client parameters are received */
-    ctp->max_udp_payload_size = ngx_quic_max_udp_payload(c);
+    ctp->max_udp_payload_size = NGX_QUIC_MAX_UDP_PAYLOAD_SIZE;
     ctp->ack_delay_exponent = NGX_QUIC_DEFAULT_ACK_DELAY_EXPONENT;
     ctp->max_ack_delay = NGX_QUIC_DEFAULT_MAX_ACK_DELAY;
     ctp->active_connection_id_limit = 2;
diff --git a/src/event/quic/ngx_event_quic_ack.c b/src/event/quic/ngx_event_quic_ack.c
--- a/src/event/quic/ngx_event_quic_ack.c
+++ b/src/event/quic/ngx_event_quic_ack.c
@@ -229,6 +229,12 @@ ngx_quic_handle_ack_frame_range(ngx_conn
 
     qc = ngx_quic_get_connection(c);
 
+    if (ctx->level == ssl_encryption_application) {
+        if (ngx_quic_handle_path_mtu(c, qc->path, min, max) != NGX_OK) {
+            return NGX_ERROR;
+        }
+    }
+
     st->max_pn = NGX_TIMER_INFINITE;
     found = 0;
 
diff --git a/src/event/quic/ngx_event_quic_connection.h b/src/event/quic/ngx_event_quic_connection.h
--- a/src/event/quic/ngx_event_quic_connection.h
+++ b/src/event/quic/ngx_event_quic_connection.h
@@ -66,6 +66,14 @@ typedef struct ngx_quic_keys_s        ng
 #define ngx_quic_get_socket(c)               ((ngx_quic_socket_t *)((c)->udp))
 
 
+typedef enum {
+    NGX_QUIC_PATH_IDLE = 0,
+    NGX_QUIC_PATH_VALIDATING,
+    NGX_QUIC_PATH_WAITING,
+    NGX_QUIC_PATH_MTUD
+} ngx_quic_path_state_e;
+
+
 struct ngx_quic_client_id_s {
     ngx_queue_t                       queue;
     uint64_t                          seqnum;
@@ -89,18 +97,22 @@ struct ngx_quic_path_s {
     ngx_sockaddr_t                    sa;
     socklen_t                         socklen;
     ngx_quic_client_id_t             *cid;
+    ngx_quic_path_state_e             state;
     ngx_msec_t                        expires;
     ngx_uint_t                        tries;
     ngx_uint_t                        tag;
+    size_t                            mtu;
+    size_t                            mtud;
+    size_t                            max_mtu;
     off_t                             sent;
     off_t                             received;
     u_char                            challenge1[8];
     u_char                            challenge2[8];
     uint64_t                          seqnum;
+    uint64_t                          mtu_pnum[NGX_QUIC_PATH_RETRIES];
     ngx_str_t                         addr_text;
     u_char                            text[NGX_SOCKADDR_STRLEN];
-    unsigned                          validated:1;
-    unsigned                          validating:1;
+    ngx_uint_t                        validated; /* unsigned validated:1; */
 };
 
 
diff --git a/src/event/quic/ngx_event_quic_migration.c b/src/event/quic/ngx_event_quic_migration.c
--- a/src/event/quic/ngx_event_quic_migration.c
+++ b/src/event/quic/ngx_event_quic_migration.c
@@ -10,6 +10,10 @@
 #include <ngx_event_quic_connection.h>
 
 
+#define NGX_QUIC_PATH_MTU_DELAY       100
+#define NGX_QUIC_PATH_MTU_PRECISION   16
+
+
 static void ngx_quic_set_connection_path(ngx_connection_t *c,
     ngx_quic_path_t *path);
 static ngx_int_t ngx_quic_validate_path(ngx_connection_t *c,
@@ -17,7 +21,15 @@ static ngx_int_t ngx_quic_validate_path(
 static ngx_int_t ngx_quic_send_path_challenge(ngx_connection_t *c,
     ngx_quic_path_t *path);
 static void ngx_quic_set_path_timer(ngx_connection_t *c);
+static ngx_int_t ngx_quic_expire_path_validation(ngx_connection_t *c,
+    ngx_quic_path_t *path);
+static ngx_int_t ngx_quic_expire_path_mtu_delay(ngx_connection_t *c,
+    ngx_quic_path_t *path);
+static ngx_int_t ngx_quic_expire_path_mtu_discovery(ngx_connection_t *c,
+    ngx_quic_path_t *path);
 static ngx_quic_path_t *ngx_quic_get_path(ngx_connection_t *c, ngx_uint_t tag);
+static ngx_int_t ngx_quic_send_path_mtu_probe(ngx_connection_t *c,
+    ngx_quic_path_t *path);
 
 
 ngx_int_t
@@ -97,7 +109,7 @@ ngx_quic_handle_path_response_frame(ngx_
     {
         path = ngx_queue_data(q, ngx_quic_path_t, queue);
 
-        if (!path->validating) {
+        if (path->state != NGX_QUIC_PATH_VALIDATING) {
             continue;
         }
 
@@ -136,6 +148,9 @@ valid:
         {
             /* address did not change */
             rst = 0;
+
+            path->mtu = prev->mtu;
+            path->max_mtu = prev->max_mtu;
         }
     }
 
@@ -167,9 +182,8 @@ valid:
     ngx_quic_path_dbg(c, "is validated", path);
 
     path->validated = 1;
-    path->validating = 0;
 
-    ngx_quic_set_path_timer(c);
+    ngx_quic_discover_path_mtu(c, path);
 
     return NGX_OK;
 }
@@ -217,6 +231,8 @@ ngx_quic_new_path(ngx_connection_t *c,
     path->addr_text.len = ngx_sock_ntop(sockaddr, socklen, path->text,
                                         NGX_SOCKADDR_STRLEN, 1);
 
+    path->mtu = NGX_QUIC_MIN_INITIAL_SIZE;
+
     ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
                    "quic path seq:%uL created addr:%V",
                    path->seqnum, &path->addr_text);
@@ -464,7 +480,7 @@ ngx_quic_handle_migration(ngx_connection
 
     ngx_quic_set_connection_path(c, next);
 
-    if (!next->validated && !next->validating) {
+    if (!next->validated && next->state != NGX_QUIC_PATH_VALIDATING) {
         if (ngx_quic_validate_path(c, next) != NGX_OK) {
             return NGX_ERROR;
         }
@@ -492,7 +508,6 @@ ngx_quic_validate_path(ngx_connection_t 
     ngx_log_debug1(NGX_LOG_DEBUG_EVENT, c->log, 0,
                    "quic initiated validation of path seq:%uL", path->seqnum);
 
-    path->validating = 1;
     path->tries = 0;
 
     if (RAND_bytes(path->challenge1, 8) != 1) {
@@ -511,6 +526,7 @@ ngx_quic_validate_path(ngx_connection_t 
     pto = ngx_max(ngx_quic_pto(c, ctx), 1000);
 
     path->expires = ngx_current_msec + pto;
+    path->state = NGX_QUIC_PATH_VALIDATING;
 
     ngx_quic_set_path_timer(c);
 
@@ -558,6 +574,42 @@ ngx_quic_send_path_challenge(ngx_connect
 }
 
 
+void
+ngx_quic_discover_path_mtu(ngx_connection_t *c, ngx_quic_path_t *path)
+{
+    ngx_quic_connection_t  *qc;
+
+    qc = ngx_quic_get_connection(c);
+
+    if (path->max_mtu) {
+        if (path->max_mtu - path->mtu <= NGX_QUIC_PATH_MTU_PRECISION) {
+            path->state = NGX_QUIC_PATH_IDLE;
+            ngx_quic_set_path_timer(c);
+            return;
+        }
+
+        path->mtud = (path->mtu + path->max_mtu) / 2;
+
+    } else {
+        path->mtud = path->mtu * 2;
+
+        if (path->mtud >= qc->ctp.max_udp_payload_size) {
+            path->mtud = qc->ctp.max_udp_payload_size;
+            path->max_mtu = qc->ctp.max_udp_payload_size;
+        }
+    }
+
+    path->state = NGX_QUIC_PATH_WAITING;
+    path->expires = ngx_current_msec + NGX_QUIC_PATH_MTU_DELAY;
+
+    ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
+                   "quic path seq:%uL schedule mtu:%uz",
+                   path->seqnum, path->mtud);
+
+    ngx_quic_set_path_timer(c);
+}
+
+
 static void
 ngx_quic_set_path_timer(ngx_connection_t *c)
 {
@@ -578,7 +630,7 @@ ngx_quic_set_path_timer(ngx_connection_t
     {
         path = ngx_queue_data(q, ngx_quic_path_t, queue);
 
-        if (!path->validating) {
+        if (path->state == NGX_QUIC_PATH_IDLE) {
             continue;
         }
 
@@ -600,22 +652,18 @@ ngx_quic_set_path_timer(ngx_connection_t
 
 
 void
-ngx_quic_path_validation_handler(ngx_event_t *ev)
+ngx_quic_path_handler(ngx_event_t *ev)
 {
     ngx_msec_t              now;
     ngx_queue_t            *q;
-    ngx_msec_int_t          left, next, pto;
-    ngx_quic_path_t        *path, *bkp;
+    ngx_msec_int_t          left;
+    ngx_quic_path_t        *path;
     ngx_connection_t       *c;
-    ngx_quic_send_ctx_t    *ctx;
     ngx_quic_connection_t  *qc;
 
     c = ev->data;
     qc = ngx_quic_get_connection(c);
 
-    ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
-
-    next = -1;
     now = ngx_current_msec;
 
     q = ngx_queue_head(&qc->paths);
@@ -625,83 +673,280 @@ ngx_quic_path_validation_handler(ngx_eve
         path = ngx_queue_data(q, ngx_quic_path_t, queue);
         q = ngx_queue_next(q);
 
-        if (!path->validating) {
+        if (path->state == NGX_QUIC_PATH_IDLE) {
             continue;
         }
 
         left = path->expires - now;
 
         if (left > 0) {
-
-            if (next == -1 || left < next) {
-                next = left;
-            }
-
-            continue;
-        }
-
-        if (++path->tries < NGX_QUIC_PATH_RETRIES) {
-            pto = ngx_max(ngx_quic_pto(c, ctx), 1000) << path->tries;
-
-            path->expires = ngx_current_msec + pto;
-
-            if (next == -1 || pto < next) {
-                next = pto;
-            }
-
-            /* retransmit */
-            (void) ngx_quic_send_path_challenge(c, path);
-
             continue;
         }
 
-        ngx_log_debug1(NGX_LOG_DEBUG_EVENT, ev->log, 0,
-                       "quic path seq:%uL validation failed", path->seqnum);
-
-        /* found expired path */
-
-        path->validated = 0;
-        path->validating = 0;
-
-
-        /* RFC 9000, 9.3.2.  On-Path Address Spoofing
-         *
-         * To protect the connection from failing due to such a spurious
-         * migration, an endpoint MUST revert to using the last validated
-         * peer address when validation of a new peer address fails.
-         */
-
-        if (qc->path == path) {
-            /* active path validation failed */
-
-            bkp = ngx_quic_get_path(c, NGX_QUIC_PATH_BACKUP);
-
-            if (bkp == NULL) {
-                qc->error = NGX_QUIC_ERR_NO_VIABLE_PATH;
-                qc->error_reason = "no viable path";
-                ngx_quic_close_connection(c, NGX_ERROR);
-                return;
+        switch (path->state) {
+        case NGX_QUIC_PATH_VALIDATING:
+            if (ngx_quic_expire_path_validation(c, path) != NGX_OK) {
+                goto failed;
             }
 
-            qc->path = bkp;
-            qc->path->tag = NGX_QUIC_PATH_ACTIVE;
+            break;
 
-            ngx_quic_set_connection_path(c, qc->path);
+        case NGX_QUIC_PATH_WAITING:
+            if (ngx_quic_expire_path_mtu_delay(c, path) != NGX_OK) {
+                goto failed;
+            }
+
+            break;
 
-            ngx_log_error(NGX_LOG_INFO, c->log, 0,
-                          "quic path seq:%uL addr:%V is restored from backup",
-                          qc->path->seqnum, &qc->path->addr_text);
+        case NGX_QUIC_PATH_MTUD:
+            if (ngx_quic_expire_path_mtu_discovery(c, path) != NGX_OK) {
+                goto failed;
+            }
 
-            ngx_quic_path_dbg(c, "is active", qc->path);
-        }
+            break;
 
-        if (ngx_quic_free_path(c, path) != NGX_OK) {
-            ngx_quic_close_connection(c, NGX_ERROR);
-            return;
+        default:
+            break;
         }
     }
 
-    if (next != -1) {
-        ngx_add_timer(&qc->path_validation, next);
+    ngx_quic_set_path_timer(c);
+
+    return;
+
+failed:
+
+    ngx_quic_close_connection(c, NGX_ERROR);
+}
+
+
+static ngx_int_t
+ngx_quic_expire_path_validation(ngx_connection_t *c, ngx_quic_path_t *path)
+{
+    ngx_msec_int_t          pto;
+    ngx_quic_path_t        *bkp;
+    ngx_quic_send_ctx_t    *ctx;
+    ngx_quic_connection_t  *qc;
+
+    qc = ngx_quic_get_connection(c);
+    ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
+
+    if (++path->tries < NGX_QUIC_PATH_RETRIES) {
+        pto = ngx_max(ngx_quic_pto(c, ctx), 1000) << path->tries;
+        path->expires = ngx_current_msec + pto;
+
+        (void) ngx_quic_send_path_challenge(c, path);
+
+        return NGX_OK;
+    }
+
+    ngx_log_debug1(NGX_LOG_DEBUG_EVENT, c->log, 0,
+                   "quic path seq:%uL validation failed", path->seqnum);
+
+    /* found expired path */
+
+    path->validated = 0;
+
+
+    /* RFC 9000, 9.3.2.  On-Path Address Spoofing
+     *
+     * To protect the connection from failing due to such a spurious
+     * migration, an endpoint MUST revert to using the last validated
+     * peer address when validation of a new peer address fails.
+     */
+
+    if (qc->path == path) {
+        /* active path validation failed */
+
+        bkp = ngx_quic_get_path(c, NGX_QUIC_PATH_BACKUP);
+
+        if (bkp == NULL) {
+            qc->error = NGX_QUIC_ERR_NO_VIABLE_PATH;
+            qc->error_reason = "no viable path";
+            return NGX_ERROR;
+        }
+
+        qc->path = bkp;
+        qc->path->tag = NGX_QUIC_PATH_ACTIVE;
+
+        ngx_quic_set_connection_path(c, qc->path);
+
+        ngx_log_error(NGX_LOG_INFO, c->log, 0,
+                      "quic path seq:%uL addr:%V is restored from backup",
+                      qc->path->seqnum, &qc->path->addr_text);
+
+        ngx_quic_path_dbg(c, "is active", qc->path);
+    }
+
+    return ngx_quic_free_path(c, path);
+}
+
+
+static ngx_int_t
+ngx_quic_expire_path_mtu_delay(ngx_connection_t *c, ngx_quic_path_t *path)
+{
+    ngx_int_t               rc;
+    ngx_uint_t              i;
+    ngx_msec_t              pto;
+    ngx_quic_send_ctx_t    *ctx;
+    ngx_quic_connection_t  *qc;
+
+    qc = ngx_quic_get_connection(c);
+    ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
+
+    path->tries = 0;
+
+    for ( ;; ) {
+
+        for (i = 0; i < NGX_QUIC_PATH_RETRIES; i++) {
+            path->mtu_pnum[i] = NGX_QUIC_UNSET_PN;
+        }
+
+        rc = ngx_quic_send_path_mtu_probe(c, path);
+
+        if (rc == NGX_ERROR) {
+            return NGX_ERROR;
+        }
+
+        if (rc == NGX_OK) {
+            pto = ngx_quic_pto(c, ctx);
+            path->expires = ngx_current_msec + pto;
+            path->state = NGX_QUIC_PATH_MTUD;
+            return NGX_OK;
+        }
+
+        /* rc == NGX_DECLINED */
+
+        path->max_mtu = path->mtud;
+
+        if (path->max_mtu - path->mtu <= NGX_QUIC_PATH_MTU_PRECISION) {
+            path->state = NGX_QUIC_PATH_IDLE;
+            return NGX_OK;
+        }
+
+        path->mtud = (path->mtu + path->max_mtu) / 2;
     }
 }
+
+
+static ngx_int_t
+ngx_quic_expire_path_mtu_discovery(ngx_connection_t *c, ngx_quic_path_t *path)
+{
+    ngx_int_t               rc;
+    ngx_msec_int_t          pto;
+    ngx_quic_send_ctx_t    *ctx;
+    ngx_quic_connection_t  *qc;
+
+    qc = ngx_quic_get_connection(c);
+    ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
+
+    if (++path->tries < NGX_QUIC_PATH_RETRIES) {
+        rc = ngx_quic_send_path_mtu_probe(c, path);
+
+        if (rc == NGX_ERROR) {
+            return NGX_ERROR;
+        }
+
+        if (rc == NGX_OK) {
+            pto = ngx_quic_pto(c, ctx) << path->tries;
+            path->expires = ngx_current_msec + pto;
+            return NGX_OK;
+        }
+
+        /* rc == NGX_DECLINED */
+    }
+
+    ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
+                   "quic path seq:%uL expired mtu:%uz",
+                   path->seqnum, path->mtud);
+
+    path->max_mtu = path->mtud;
+
+    ngx_quic_discover_path_mtu(c, path);
+
+    return NGX_OK;
+}
+
+
+static ngx_int_t
+ngx_quic_send_path_mtu_probe(ngx_connection_t *c, ngx_quic_path_t *path)
+{
+    ngx_int_t               rc;
+    ngx_uint_t              log_error;
+    ngx_quic_frame_t        frame;
+    ngx_quic_send_ctx_t    *ctx;
+    ngx_quic_connection_t  *qc;
+
+    ngx_memzero(&frame, sizeof(ngx_quic_frame_t));
+
+    frame.level = ssl_encryption_application;
+    frame.type = NGX_QUIC_FT_PING;
+
+    qc = ngx_quic_get_connection(c);
+    ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
+    path->mtu_pnum[path->tries] = ctx->pnum;
+
+    ngx_log_debug4(NGX_LOG_DEBUG_EVENT, c->log, 0,
+                   "quic path seq:%uL send probe "
+                   "mtu:%uz pnum:%uL tries:%ui",
+                   path->seqnum, path->mtud, ctx->pnum, path->tries);
+
+    log_error = c->log_error;
+    c->log_error = NGX_ERROR_IGNORE_EMSGSIZE;
+
+    rc = ngx_quic_frame_sendto(c, &frame, path->mtud, path);
+    c->log_error = log_error;
+
+    if (rc == NGX_ERROR) {
+        if (c->write->error) {
+            c->write->error = 0;
+
+            ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
+                           "quic path seq:%uL rejected mtu:%uz",
+                           path->seqnum, path->mtud);
+
+            return NGX_DECLINED;
+        }
+
+        return NGX_ERROR;
+    }
+
+    return NGX_OK;
+}
+
+
+ngx_int_t
+ngx_quic_handle_path_mtu(ngx_connection_t *c, ngx_quic_path_t *path,
+    uint64_t min, uint64_t max)
+{
+    uint64_t    pnum;
+    ngx_uint_t  i;
+
+    if (path->state != NGX_QUIC_PATH_MTUD) {
+        return NGX_OK;
+    }
+
+    for (i = 0; i < NGX_QUIC_PATH_RETRIES; i++) {
+        pnum = path->mtu_pnum[i];
+
+        if (pnum == NGX_QUIC_UNSET_PN) {
+            break;
+        }
+
+        if (pnum < min || pnum > max) {
+            continue;
+        }
+
+        path->mtu = path->mtud;
+
+        ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
+                       "quic path seq:%uL ack mtu:%uz",
+                       path->seqnum, path->mtu);
+
+        ngx_quic_discover_path_mtu(c, path);
+
+        break;
+    }
+
+    return NGX_OK;
+}
diff --git a/src/event/quic/ngx_event_quic_migration.h b/src/event/quic/ngx_event_quic_migration.h
--- a/src/event/quic/ngx_event_quic_migration.h
+++ b/src/event/quic/ngx_event_quic_migration.h
@@ -18,10 +18,10 @@
 #define NGX_QUIC_PATH_BACKUP    2
 
 #define ngx_quic_path_dbg(c, msg, path)                                       \
-    ngx_log_debug6(NGX_LOG_DEBUG_EVENT, c->log, 0,                            \
-                   "quic path seq:%uL %s sent:%O recvd:%O state:%s%s",        \
+    ngx_log_debug7(NGX_LOG_DEBUG_EVENT, c->log, 0,                            \
+                   "quic path seq:%uL %s tx:%O rx:%O valid:%ui st:%d mtu:%uz",\
                    path->seqnum, msg, path->sent, path->received,             \
-                   path->validated ? "V": "N", path->validating ? "R": "");
+                   path->validated, path->state, path->mtu);
 
 ngx_int_t ngx_quic_handle_path_challenge_frame(ngx_connection_t *c,
     ngx_quic_header_t *pkt, ngx_quic_path_challenge_frame_t *f);
@@ -36,6 +36,10 @@ ngx_int_t ngx_quic_set_path(ngx_connecti
 ngx_int_t ngx_quic_handle_migration(ngx_connection_t *c,
     ngx_quic_header_t *pkt);
 
-void ngx_quic_path_validation_handler(ngx_event_t *ev);
+void ngx_quic_path_handler(ngx_event_t *ev);
+
+void ngx_quic_discover_path_mtu(ngx_connection_t *c, ngx_quic_path_t *path);
+ngx_int_t ngx_quic_handle_path_mtu(ngx_connection_t *c,
+    ngx_quic_path_t *path, uint64_t min, uint64_t max);
 
 #endif /* _NGX_EVENT_QUIC_MIGRATION_H_INCLUDED_ */
diff --git a/src/event/quic/ngx_event_quic_output.c b/src/event/quic/ngx_event_quic_output.c
--- a/src/event/quic/ngx_event_quic_output.c
+++ b/src/event/quic/ngx_event_quic_output.c
@@ -10,9 +10,6 @@
 #include <ngx_event_quic_connection.h>
 
 
-#define NGX_QUIC_MAX_UDP_PAYLOAD_OUT   1252
-#define NGX_QUIC_MAX_UDP_PAYLOAD_OUT6  1232
-
 #define NGX_QUIC_MAX_UDP_SEGMENT_BUF  65487 /* 65K - IPv6 header */
 #define NGX_QUIC_MAX_SEGMENTS            64 /* UDP_MAX_SEGMENTS */
 
@@ -61,21 +58,6 @@ static size_t ngx_quic_path_limit(ngx_co
     size_t size);
 
 
-size_t
-ngx_quic_max_udp_payload(ngx_connection_t *c)
-{
-    /* TODO: path MTU discovery */
-
-#if (NGX_HAVE_INET6)
-    if (c->sockaddr->sa_family == AF_INET6) {
-        return NGX_QUIC_MAX_UDP_PAYLOAD_OUT6;
-    }
-#endif
-
-    return NGX_QUIC_MAX_UDP_PAYLOAD_OUT;
-}
-
-
 ngx_int_t
 ngx_quic_output(ngx_connection_t *c)
 {
@@ -142,10 +124,7 @@ ngx_quic_create_datagrams(ngx_connection
 
         p = dst;
 
-        len = ngx_min(qc->ctp.max_udp_payload_size,
-                      NGX_QUIC_MAX_UDP_PAYLOAD_SIZE);
-
-        len = ngx_quic_path_limit(c, path, len);
+        len = ngx_quic_path_limit(c, path, path->mtu);
 
         pad = ngx_quic_get_padding_level(c);
 
@@ -299,9 +278,7 @@ ngx_quic_allow_segmentation(ngx_connecti
     ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
 
     bytes = 0;
-
-    len = ngx_min(qc->ctp.max_udp_payload_size,
-                  NGX_QUIC_MAX_UDP_SEGMENT_BUF);
+    len = ngx_min(qc->path->mtu, NGX_QUIC_MAX_UDP_SEGMENT_BUF);
 
     for (q = ngx_queue_head(&ctx->frames);
          q != ngx_queue_sentinel(&ctx->frames);
@@ -345,8 +322,7 @@ ngx_quic_create_segments(ngx_connection_
         return NGX_ERROR;
     }
 
-    segsize = ngx_min(qc->ctp.max_udp_payload_size,
-                      NGX_QUIC_MAX_UDP_SEGMENT_BUF);
+    segsize = ngx_min(path->mtu, NGX_QUIC_MAX_UDP_SEGMENT_BUF);
     p = dst;
     end = dst + sizeof(dst);
 
diff --git a/src/event/quic/ngx_event_quic_output.h b/src/event/quic/ngx_event_quic_output.h
--- a/src/event/quic/ngx_event_quic_output.h
+++ b/src/event/quic/ngx_event_quic_output.h
@@ -12,8 +12,6 @@
 #include <ngx_core.h>
 
 
-size_t ngx_quic_max_udp_payload(ngx_connection_t *c);
-
 ngx_int_t ngx_quic_output(ngx_connection_t *c);
 
 ngx_int_t ngx_quic_negotiate_version(ngx_connection_t *c,
diff --git a/src/event/quic/ngx_event_quic_ssl.c b/src/event/quic/ngx_event_quic_ssl.c
--- a/src/event/quic/ngx_event_quic_ssl.c
+++ b/src/event/quic/ngx_event_quic_ssl.c
@@ -494,6 +494,8 @@ ngx_quic_crypto_input(ngx_connection_t *
      */
     ngx_quic_discard_ctx(c, ssl_encryption_handshake);
 
+    ngx_quic_discover_path_mtu(c, qc->path);
+
     /* start accepting clients on negotiated number of server ids */
     if (ngx_quic_create_sockets(c) != NGX_OK) {
         return NGX_ERROR;
diff --git a/src/os/unix/ngx_errno.h b/src/os/unix/ngx_errno.h
--- a/src/os/unix/ngx_errno.h
+++ b/src/os/unix/ngx_errno.h
@@ -54,6 +54,7 @@ typedef int               ngx_err_t;
 #define NGX_ENOMOREFILES  0
 #define NGX_ELOOP         ELOOP
 #define NGX_EBADF         EBADF
+#define NGX_EMSGSIZE      EMSGSIZE
 
 #if (NGX_HAVE_OPENAT)
 #define NGX_EMLINK        EMLINK
diff --git a/src/os/win32/ngx_errno.h b/src/os/win32/ngx_errno.h
--- a/src/os/win32/ngx_errno.h
+++ b/src/os/win32/ngx_errno.h
@@ -57,6 +57,7 @@ typedef DWORD                      ngx_e
 #define NGX_EILSEQ                 ERROR_NO_UNICODE_TRANSLATION
 #define NGX_ELOOP                  0
 #define NGX_EBADF                  WSAEBADF
+#define NGX_EMSGSIZE               WSAEMSGSIZE
 
 #define NGX_EALREADY               WSAEALREADY
 #define NGX_EINVAL                 WSAEINVAL


More information about the nginx-devel mailing list