[PATCH 4 of 4] QUIC: path MTU discovery
Sergey Kandaurov
pluknet at nginx.com
Thu Aug 10 14:18:02 UTC 2023
> On 10 Aug 2023, at 14:21, Roman Arutyunyan <arut at nginx.com> wrote:
>
> On Tue, Aug 08, 2023 at 01:49:46PM +0400, Sergey Kandaurov wrote:
> Hi,
>
>>
>>> On 31 Jul 2023, at 21:34, Roman Arutyunyan <arut at nginx.com> wrote:
>>>
>>> Hi,
>>>
>>> On Fri, Jul 28, 2023 at 07:51:22PM +0400, Sergey Kandaurov wrote:
>>>>
>>>>> On 6 Jul 2023, at 17:57, Roman Arutyunyan <arut at nginx.com> wrote:
>>>>>
>>>>> # HG changeset patch
>>>>> # User Roman Arutyunyan <arut at nginx.com>
>>>>> # Date 1688481216 -14400
>>>>> # Tue Jul 04 18:33:36 2023 +0400
>>>>> # Node ID 1a49fd5d2013b6c8d50263499e6ced440927e949
>>>>> # Parent a1ea543d009311765144351b2557c00c8e6445bf
>>>>> QUIC: path MTU discovery.
>>>>>
>>>>> MTU selection starts by stepping up until the first failure. Then binary
>>>>> search is used to find the path MTU.
>>>>>
>>>>> diff --git a/src/core/ngx_connection.c b/src/core/ngx_connection.c
>>>>> --- a/src/core/ngx_connection.c
>>>>> +++ b/src/core/ngx_connection.c
>>>>> @@ -1583,6 +1583,10 @@ ngx_connection_error(ngx_connection_t *c
>>>>> }
>>>>> #endif
>>>>>
>>>>> + if (err == NGX_EMSGSIZE && c->log_error == NGX_ERROR_IGNORE_EMSGSIZE) {
>>>>> + return 0;
>>>>> + }
>>>>> +
>>>>> if (err == 0
>>>>> || err == NGX_ECONNRESET
>>>>> #if (NGX_WIN32)
>>>>> @@ -1600,6 +1604,7 @@ ngx_connection_error(ngx_connection_t *c
>>>>> {
>>>>> switch (c->log_error) {
>>>>>
>>>>> + case NGX_ERROR_IGNORE_EMSGSIZE:
>>>>> case NGX_ERROR_IGNORE_EINVAL:
>>>>> case NGX_ERROR_IGNORE_ECONNRESET:
>>>>> case NGX_ERROR_INFO:
>>>>> diff --git a/src/core/ngx_connection.h b/src/core/ngx_connection.h
>>>>> --- a/src/core/ngx_connection.h
>>>>> +++ b/src/core/ngx_connection.h
>>>>> @@ -97,7 +97,8 @@ typedef enum {
>>>>> NGX_ERROR_ERR,
>>>>> NGX_ERROR_INFO,
>>>>> NGX_ERROR_IGNORE_ECONNRESET,
>>>>> - NGX_ERROR_IGNORE_EINVAL
>>>>> + NGX_ERROR_IGNORE_EINVAL,
>>>>> + NGX_ERROR_IGNORE_EMSGSIZE
>>>>> } ngx_connection_log_error_e;
>>>>>
>>>>>
>>>>> diff --git a/src/event/quic/ngx_event_quic.c b/src/event/quic/ngx_event_quic.c
>>>>> --- a/src/event/quic/ngx_event_quic.c
>>>>> +++ b/src/event/quic/ngx_event_quic.c
>>>>> @@ -149,11 +149,6 @@ ngx_quic_apply_transport_params(ngx_conn
>>>>> ngx_log_error(NGX_LOG_INFO, c->log, 0,
>>>>> "quic maximum packet size is invalid");
>>>>> return NGX_ERROR;
>>>>> -
>>>>> - } else if (ctp->max_udp_payload_size > ngx_quic_max_udp_payload(c)) {
>>>>> - ctp->max_udp_payload_size = ngx_quic_max_udp_payload(c);
>>>>> - ngx_log_debug0(NGX_LOG_DEBUG_EVENT, c->log, 0,
>>>>> - "quic client maximum packet size truncated");
>>>>> }
>>>>>
>>>>> if (ctp->active_connection_id_limit < 2) {
>>>>> @@ -286,7 +281,7 @@ ngx_quic_new_connection(ngx_connection_t
>>>>>
>>>>> qc->path_validation.log = c->log;
>>>>> qc->path_validation.data = c;
>>>>> - qc->path_validation.handler = ngx_quic_path_validation_handler;
>>>>> + qc->path_validation.handler = ngx_quic_path_handler;
>>>>>
>>>>> qc->conf = conf;
>>>>>
>>>>> @@ -297,7 +292,7 @@ ngx_quic_new_connection(ngx_connection_t
>>>>> ctp = &qc->ctp;
>>>>>
>>>>> /* defaults to be used before actual client parameters are received */
>>>>> - ctp->max_udp_payload_size = ngx_quic_max_udp_payload(c);
>>>>> + ctp->max_udp_payload_size = NGX_QUIC_MAX_UDP_PAYLOAD_SIZE;
>>>>> ctp->ack_delay_exponent = NGX_QUIC_DEFAULT_ACK_DELAY_EXPONENT;
>>>>> ctp->max_ack_delay = NGX_QUIC_DEFAULT_MAX_ACK_DELAY;
>>>>> ctp->active_connection_id_limit = 2;
>>>>> diff --git a/src/event/quic/ngx_event_quic_ack.c b/src/event/quic/ngx_event_quic_ack.c
>>>>> --- a/src/event/quic/ngx_event_quic_ack.c
>>>>> +++ b/src/event/quic/ngx_event_quic_ack.c
>>>>> @@ -229,6 +229,12 @@ ngx_quic_handle_ack_frame_range(ngx_conn
>>>>>
>>>>> qc = ngx_quic_get_connection(c);
>>>>>
>>>>> + if (ctx->level == ssl_encryption_application) {
>>>>> + if (ngx_quic_handle_path_mtu_ack(c, qc->path, min, max) != NGX_OK) {
>>>>> + return NGX_ERROR;
>>>>> + }
>>>>> + }
>>>>> +
>>>>> st->max_pn = NGX_TIMER_INFINITE;
>>>>> found = 0;
>>>>>
>>>>> diff --git a/src/event/quic/ngx_event_quic_connection.h b/src/event/quic/ngx_event_quic_connection.h
>>>>> --- a/src/event/quic/ngx_event_quic_connection.h
>>>>> +++ b/src/event/quic/ngx_event_quic_connection.h
>>>>> @@ -66,6 +66,14 @@ typedef struct ngx_quic_keys_s ng
>>>>> #define ngx_quic_get_socket(c) ((ngx_quic_socket_t *)((c)->udp))
>>>>>
>>>>>
>>>>> +typedef enum {
>>>>> + NGX_QUIC_PATH_IDLE = 0,
>>>>> + NGX_QUIC_PATH_VALIDATING,
>>>>> + NGX_QUIC_PATH_WAITING,
>>>>> + NGX_QUIC_PATH_DISCOVERING_MTU
>>>>
>>>> Nitpicking on too long name.
>>>> What about NGX_QUIC_PATH_MTUD or something.
>>>
>>> OK, renamed.
>>>
>>>>> +} ngx_quic_path_state_e;
>>>>> +
>>>>> +
>>>>> struct ngx_quic_client_id_s {
>>>>> ngx_queue_t queue;
>>>>> uint64_t seqnum;
>>>>> @@ -89,18 +97,22 @@ struct ngx_quic_path_s {
>>>>> ngx_sockaddr_t sa;
>>>>> socklen_t socklen;
>>>>> ngx_quic_client_id_t *cid;
>>>>> + ngx_quic_path_state_e state;
>>>>> ngx_msec_t expires;
>>>>> ngx_uint_t tries;
>>>>> ngx_uint_t tag;
>>>>> + size_t mtu;
>>>>> + size_t mtud;
>>>>> + size_t max_mtu;
>>>>> off_t sent;
>>>>> off_t received;
>>>>> u_char challenge1[8];
>>>>> u_char challenge2[8];
>>>>> uint64_t seqnum;
>>>>> + uint64_t mtu_pnum[NGX_QUIC_PATH_RETRIES];
>>>>> ngx_str_t addr_text;
>>>>> u_char text[NGX_SOCKADDR_STRLEN];
>>>>> - unsigned validated:1;
>>>>> - unsigned validating:1;
>>>>> + ngx_uint_t validated; /* unsigned validated:1; */
>>>>> };
>>>>>
>>>>>
>>>>> diff --git a/src/event/quic/ngx_event_quic_migration.c b/src/event/quic/ngx_event_quic_migration.c
>>>>> --- a/src/event/quic/ngx_event_quic_migration.c
>>>>> +++ b/src/event/quic/ngx_event_quic_migration.c
>>>>> @@ -10,6 +10,11 @@
>>>>> #include <ngx_event_quic_connection.h>
>>>>>
>>>>>
>>>>> +#define NGX_QUIC_PATH_MTU_DELAY 1000
>>>>
>>>> The value looks too much,
>>>> it would delay each subsequent probe (which is not retry).
>>>
>>> Probably the right value should be determined in real-life scenarios.
>>
>> I'm quite skeptical about delayed probes, it has little to no benefit
>> for short connections with bulky transfers, as implemented, especially
>> with short MTU steps. OTOH, immediate probes can lead to congestion
>> window starvation resulting in request processing slow down.
>>
>> For the record,
>> https://www.ietf.org/proceedings/62/pmtud.html provides a discussion
>> about PMTUD, in particular, on search strategy and when to probe.
>> For the latter, it seems to articulate an opposite view, i.e. to send
>> probes ASAP. Also, it raises a question how short lived connections
>> could benefit from PMTUD, e.g. by caching path MTU, which seems to
>> correspond to a shared PLPMTU state in RFC 8899.
>>
>>> Let's decrease the value by a factor of 10 for now.
>>
>> For a simple implementation, 100ms is a good enough trade-off, IMHO.
>>
>>>
>>>>> +#define NGX_QUIC_PATH_MTU_STEP 256
>>>>> +#define NGX_QUIC_PATH_MTU_PRECISION 16
>>>>> +
>>>>> +
>>>>> static void ngx_quic_set_connection_path(ngx_connection_t *c,
>>>>> ngx_quic_path_t *path);
>>>>> static ngx_int_t ngx_quic_validate_path(ngx_connection_t *c,
>>>>> @@ -17,7 +22,15 @@ static ngx_int_t ngx_quic_validate_path(
>>>>> static ngx_int_t ngx_quic_send_path_challenge(ngx_connection_t *c,
>>>>> ngx_quic_path_t *path);
>>>>> static void ngx_quic_set_path_timer(ngx_connection_t *c);
>>>>> +static ngx_int_t ngx_quic_expire_path_validation(ngx_connection_t *c,
>>>>> + ngx_quic_path_t *path);
>>>>> +static ngx_int_t ngx_quic_expire_path_mtu_delay(ngx_connection_t *c,
>>>>> + ngx_quic_path_t *path);
>>>>> +static ngx_int_t ngx_quic_expire_path_mtu_discovery(ngx_connection_t *c,
>>>>> + ngx_quic_path_t *path);
>>>>> static ngx_quic_path_t *ngx_quic_get_path(ngx_connection_t *c, ngx_uint_t tag);
>>>>> +static ngx_int_t ngx_quic_send_path_mtu_probe(ngx_connection_t *c,
>>>>> + ngx_quic_path_t *path);
>>>>>
>>>>>
>>>>> ngx_int_t
>>>>> @@ -97,7 +110,7 @@ ngx_quic_handle_path_response_frame(ngx_
>>>>> {
>>>>> path = ngx_queue_data(q, ngx_quic_path_t, queue);
>>>>>
>>>>> - if (!path->validating) {
>>>>> + if (path->state != NGX_QUIC_PATH_VALIDATING) {
>>>>> continue;
>>>>> }
>>>>>
>>>>> @@ -136,6 +149,9 @@ valid:
>>>>> {
>>>>> /* address did not change */
>>>>> rst = 0;
>>>>> +
>>>>> + path->mtu = prev->mtu;
>>>>> + path->max_mtu = prev->max_mtu;
>>>>> }
>>>>> }
>>>>>
>>>>> @@ -167,9 +183,8 @@ valid:
>>>>> ngx_quic_path_dbg(c, "is validated", path);
>>>>>
>>>>> path->validated = 1;
>>>>> - path->validating = 0;
>>>>>
>>>>> - ngx_quic_set_path_timer(c);
>>>>> + ngx_quic_discover_path_mtu(c, path);
>>>>>
>>>>> return NGX_OK;
>>>>> }
>>>>> @@ -217,6 +232,8 @@ ngx_quic_new_path(ngx_connection_t *c,
>>>>> path->addr_text.len = ngx_sock_ntop(sockaddr, socklen, path->text,
>>>>> NGX_SOCKADDR_STRLEN, 1);
>>>>>
>>>>> + path->mtu = NGX_QUIC_MIN_INITIAL_SIZE;
>>>>> +
>>>>> ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
>>>>> "quic path seq:%uL created addr:%V",
>>>>> path->seqnum, &path->addr_text);
>>>>> @@ -464,7 +481,7 @@ ngx_quic_handle_migration(ngx_connection
>>>>>
>>>>> ngx_quic_set_connection_path(c, next);
>>>>>
>>>>> - if (!next->validated && !next->validating) {
>>>>> + if (!next->validated && next->state != NGX_QUIC_PATH_VALIDATING) {
>>>>> if (ngx_quic_validate_path(c, next) != NGX_OK) {
>>>>> return NGX_ERROR;
>>>>> }
>>>>> @@ -492,7 +509,6 @@ ngx_quic_validate_path(ngx_connection_t
>>>>> ngx_log_debug1(NGX_LOG_DEBUG_EVENT, c->log, 0,
>>>>> "quic initiated validation of path seq:%uL", path->seqnum);
>>>>>
>>>>> - path->validating = 1;
>>>>> path->tries = 0;
>>>>>
>>>>> if (RAND_bytes(path->challenge1, 8) != 1) {
>>>>> @@ -511,6 +527,7 @@ ngx_quic_validate_path(ngx_connection_t
>>>>> pto = ngx_max(ngx_quic_pto(c, ctx), 1000);
>>>>>
>>>>> path->expires = ngx_current_msec + pto;
>>>>> + path->state = NGX_QUIC_PATH_VALIDATING;
>>>>>
>>>>> ngx_quic_set_path_timer(c);
>>>>>
>>>>> @@ -558,6 +575,42 @@ ngx_quic_send_path_challenge(ngx_connect
>>>>> }
>>>>>
>>>>>
>>>>> +void
>>>>> +ngx_quic_discover_path_mtu(ngx_connection_t *c, ngx_quic_path_t *path)
>>>>> +{
>>>>> + ngx_quic_connection_t *qc;
>>>>> +
>>>>> + qc = ngx_quic_get_connection(c);
>>>>> +
>>>>> + if (path->max_mtu) {
>>>>> + if (path->max_mtu - path->mtu <= NGX_QUIC_PATH_MTU_PRECISION) {
>>>>> + path->state = NGX_QUIC_PATH_IDLE;
>>>>> + ngx_quic_set_path_timer(c);
>>>>> + return;
>>>>> + }
>>>>> +
>>>>> + path->mtud = (path->mtu + path->max_mtu) / 2;
>>>>> +
>>>>> + } else {
>>>>> + path->mtud = path->mtu + NGX_QUIC_PATH_MTU_STEP;
>>>>
>>>> I like this approach, compared to multiplying probe steps.
>>>> This better fits to the pattern to keep around a well-known
>>>> initial value, which is typically 1500 bytes.
>>>> Another one could be to let user specify its own initial
>>>> value for cases when he knows well its network environment.
>>>>
>>>>> +
>>>>> + if (path->mtud >= qc->ctp.max_udp_payload_size) {
>>>>> + path->mtud = qc->ctp.max_udp_payload_size;
>>>>> + path->max_mtu = qc->ctp.max_udp_payload_size;
>>>>> + }
>>>>> + }
>>>>> +
>>>>> + path->state = NGX_QUIC_PATH_WAITING;
>>>>> + path->expires = ngx_current_msec + NGX_QUIC_PATH_MTU_DELAY;
>>>>> +
>>>>> + ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
>>>>> + "quic path seq:%uL schedule mtu:%uz",
>>>>> + path->seqnum, path->mtud);
>>>>> +
>>>>> + ngx_quic_set_path_timer(c);
>>>>> +}
>>>>> +
>>>>> +
>>>>> static void
>>>>> ngx_quic_set_path_timer(ngx_connection_t *c)
>>>>> {
>>>>> @@ -578,7 +631,7 @@ ngx_quic_set_path_timer(ngx_connection_t
>>>>> {
>>>>> path = ngx_queue_data(q, ngx_quic_path_t, queue);
>>>>>
>>>>> - if (!path->validating) {
>>>>> + if (path->state == NGX_QUIC_PATH_IDLE) {
>>>>> continue;
>>>>> }
>>>>>
>>>>> @@ -600,22 +653,18 @@ ngx_quic_set_path_timer(ngx_connection_t
>>>>>
>>>>>
>>>>> void
>>>>> -ngx_quic_path_validation_handler(ngx_event_t *ev)
>>>>> +ngx_quic_path_handler(ngx_event_t *ev)
>>>>> {
>>>>> ngx_msec_t now;
>>>>> ngx_queue_t *q;
>>>>> - ngx_msec_int_t left, next, pto;
>>>>> - ngx_quic_path_t *path, *bkp;
>>>>> + ngx_msec_int_t left;
>>>>> + ngx_quic_path_t *path;
>>>>> ngx_connection_t *c;
>>>>> - ngx_quic_send_ctx_t *ctx;
>>>>> ngx_quic_connection_t *qc;
>>>>>
>>>>> c = ev->data;
>>>>> qc = ngx_quic_get_connection(c);
>>>>>
>>>>> - ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
>>>>> -
>>>>> - next = -1;
>>>>> now = ngx_current_msec;
>>>>>
>>>>> q = ngx_queue_head(&qc->paths);
>>>>> @@ -625,83 +674,274 @@ ngx_quic_path_validation_handler(ngx_eve
>>>>> path = ngx_queue_data(q, ngx_quic_path_t, queue);
>>>>> q = ngx_queue_next(q);
>>>>>
>>>>> - if (!path->validating) {
>>>>> + if (path->state == NGX_QUIC_PATH_IDLE) {
>>>>> continue;
>>>>> }
>>>>>
>>>>> left = path->expires - now;
>>>>>
>>>>> if (left > 0) {
>>>>> -
>>>>> - if (next == -1 || left < next) {
>>>>> - next = left;
>>>>> - }
>>>>> -
>>>>> - continue;
>>>>> - }
>>>>> -
>>>>> - if (++path->tries < NGX_QUIC_PATH_RETRIES) {
>>>>> - pto = ngx_max(ngx_quic_pto(c, ctx), 1000) << path->tries;
>>>>> -
>>>>> - path->expires = ngx_current_msec + pto;
>>>>> -
>>>>> - if (next == -1 || pto < next) {
>>>>> - next = pto;
>>>>> - }
>>>>> -
>>>>> - /* retransmit */
>>>>> - (void) ngx_quic_send_path_challenge(c, path);
>>>>> -
>>>>> continue;
>>>>> }
>>>>>
>>>>> - ngx_log_debug1(NGX_LOG_DEBUG_EVENT, ev->log, 0,
>>>>> - "quic path seq:%uL validation failed", path->seqnum);
>>>>> + switch (path->state) {
>>>>> + case NGX_QUIC_PATH_VALIDATING:
>>>>> + if (ngx_quic_expire_path_validation(c, path) != NGX_OK) {
>>>>> + goto failed;
>>>>> + }
>>>>> +
>>>>> + break;
>>>>> +
>>>>> + case NGX_QUIC_PATH_WAITING:
>>>>> + if (ngx_quic_expire_path_mtu_delay(c, path) != NGX_OK) {
>>>>> + goto failed;
>>>>> + }
>>>>> +
>>>>> + break;
>>>>> +
>>>>> + case NGX_QUIC_PATH_DISCOVERING_MTU:
>>>>> + if (ngx_quic_expire_path_mtu_discovery(c, path) != NGX_OK) {
>>>>> + goto failed;
>>>>> + }
>>>>> +
>>>>> + break;
>>>>> +
>>>>
>>>> indentation in several places here
>>>
>>> Fixed, thanks.
>>>
>>>>> + default:
>>>>> + break;
>>>>> + }
>>>>> + }
>>>>> +
>>>>> + ngx_quic_set_path_timer(c);
>>>>> +
>>>>> + return;
>>>>>
>>>>> - /* found expired path */
>>>>> +failed:
>>>>> +
>>>>> + ngx_quic_close_connection(c, NGX_ERROR);
>>>>> +}
>>>>> +
>>>>> +
>>>>> +static ngx_int_t
>>>>> +ngx_quic_expire_path_validation(ngx_connection_t *c, ngx_quic_path_t *path)
>>>>> +{
>>>>> + ngx_msec_int_t pto;
>>>>> + ngx_quic_path_t *bkp;
>>>>> + ngx_quic_send_ctx_t *ctx;
>>>>> + ngx_quic_connection_t *qc;
>>>>>
>>>>> - path->validated = 0;
>>>>> - path->validating = 0;
>>>>> + qc = ngx_quic_get_connection(c);
>>>>> + ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
>>>>> +
>>>>> + if (++path->tries < NGX_QUIC_PATH_RETRIES) {
>>>>> + pto = ngx_max(ngx_quic_pto(c, ctx), 1000) << path->tries;
>>>>> + path->expires = ngx_current_msec + pto;
>>>>> +
>>>>> + (void) ngx_quic_send_path_challenge(c, path);
>>>>> +
>>>>> + return NGX_OK;
>>>>> + }
>>>>> +
>>>>> + ngx_log_debug1(NGX_LOG_DEBUG_EVENT, c->log, 0,
>>>>> + "quic path seq:%uL validation failed", path->seqnum);
>>>>> +
>>>>> + /* found expired path */
>>>>> +
>>>>> + path->validated = 0;
>>>>>
>>>>>
>>>>> - /* RFC 9000, 9.3.2. On-Path Address Spoofing
>>>>> - *
>>>>> - * To protect the connection from failing due to such a spurious
>>>>> - * migration, an endpoint MUST revert to using the last validated
>>>>> - * peer address when validation of a new peer address fails.
>>>>> - */
>>>>> -
>>>>> - if (qc->path == path) {
>>>>> - /* active path validation failed */
>>>>> -
>>>>> - bkp = ngx_quic_get_path(c, NGX_QUIC_PATH_BACKUP);
>>>>> + /* RFC 9000, 9.3.2. On-Path Address Spoofing
>>>>> + *
>>>>> + * To protect the connection from failing due to such a spurious
>>>>> + * migration, an endpoint MUST revert to using the last validated
>>>>> + * peer address when validation of a new peer address fails.
>>>>> + */
>>>>>
>>>>> - if (bkp == NULL) {
>>>>> - qc->error = NGX_QUIC_ERR_NO_VIABLE_PATH;
>>>>> - qc->error_reason = "no viable path";
>>>>> - ngx_quic_close_connection(c, NGX_ERROR);
>>>>> - return;
>>>>> - }
>>>>> + if (qc->path == path) {
>>>>> + /* active path validation failed */
>>>>> +
>>>>> + bkp = ngx_quic_get_path(c, NGX_QUIC_PATH_BACKUP);
>>>>>
>>>>> - qc->path = bkp;
>>>>> - qc->path->tag = NGX_QUIC_PATH_ACTIVE;
>>>>> -
>>>>> - ngx_quic_set_connection_path(c, qc->path);
>>>>> -
>>>>> - ngx_log_error(NGX_LOG_INFO, c->log, 0,
>>>>> - "quic path seq:%uL addr:%V is restored from backup",
>>>>> - qc->path->seqnum, &qc->path->addr_text);
>>>>> -
>>>>> - ngx_quic_path_dbg(c, "is active", qc->path);
>>>>> + if (bkp == NULL) {
>>>>> + qc->error = NGX_QUIC_ERR_NO_VIABLE_PATH;
>>>>> + qc->error_reason = "no viable path";
>>>>> + return NGX_ERROR;
>>>>> }
>>>>>
>>>>> - if (ngx_quic_free_path(c, path) != NGX_OK) {
>>>>> - ngx_quic_close_connection(c, NGX_ERROR);
>>>>> - return;
>>>>> + qc->path = bkp;
>>>>> + qc->path->tag = NGX_QUIC_PATH_ACTIVE;
>>>>> +
>>>>> + ngx_quic_set_connection_path(c, qc->path);
>>>>> +
>>>>> + ngx_log_error(NGX_LOG_INFO, c->log, 0,
>>>>> + "quic path seq:%uL addr:%V is restored from backup",
>>>>> + qc->path->seqnum, &qc->path->addr_text);
>>>>> +
>>>>> + ngx_quic_path_dbg(c, "is active", qc->path);
>>>>> + }
>>>>> +
>>>>> + return ngx_quic_free_path(c, path);
>>>>> +}
>>>>> +
>>>>> +
>>>>> +static ngx_int_t
>>>>> +ngx_quic_expire_path_mtu_delay(ngx_connection_t *c, ngx_quic_path_t *path)
>>>>> +{
>>>>> + ngx_int_t rc;
>>>>> + ngx_uint_t i;
>>>>> + ngx_msec_t pto;
>>>>> + ngx_quic_send_ctx_t *ctx;
>>>>> + ngx_quic_connection_t *qc;
>>>>> +
>>>>> + qc = ngx_quic_get_connection(c);
>>>>> + ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
>>>>> +
>>>>> + path->tries = 0;
>>>>> +
>>>>> + for ( ;; ) {
>>>>> +
>>>>> + for (i = 0; i < NGX_QUIC_PATH_RETRIES; i++) {
>>>>> + path->mtu_pnum[i] = NGX_QUIC_UNSET_PN;
>>>>> + }
>>>>> +
>>>>> + rc = ngx_quic_send_path_mtu_probe(c, path);
>>>>> +
>>>>> + if (rc == NGX_ERROR) {
>>>>> + return NGX_ERROR;
>>>>> + }
>>>>> +
>>>>> + if (rc == NGX_OK) {
>>>>> + pto = ngx_quic_pto(c, ctx);
>>>>> + path->expires = ngx_current_msec + pto;
>>>>> + path->state = NGX_QUIC_PATH_DISCOVERING_MTU;
>>>>> + return NGX_OK;
>>>>> + }
>>>>> +
>>>>> + /* rc == NGX_DECLINED */
>>>>> +
>>>>> + path->max_mtu = path->mtud;
>>>>> +
>>>>> + if (path->max_mtu - path->mtu <= NGX_QUIC_PATH_MTU_PRECISION) {
>>>>> + path->state = NGX_QUIC_PATH_IDLE;
>>>>> + return NGX_OK;
>>>>> + }
>>>>> +
>>>>> + path->mtud = (path->mtu + path->max_mtu) / 2;
>>>>> + }
>>>>> +}
>>>>> +
>>>>> +
>>>>> +static ngx_int_t
>>>>> +ngx_quic_expire_path_mtu_discovery(ngx_connection_t *c, ngx_quic_path_t *path)
>>>>> +{
>>>>> + ngx_int_t rc;
>>>>> + ngx_msec_int_t pto;
>>>>> + ngx_quic_send_ctx_t *ctx;
>>>>> + ngx_quic_connection_t *qc;
>>>>> +
>>>>> + qc = ngx_quic_get_connection(c);
>>>>> + ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
>>>>> +
>>>>> + if (++path->tries < NGX_QUIC_PATH_RETRIES) {
>>>>> + pto = ngx_quic_pto(c, ctx) << path->tries;
>>>>> + path->expires = ngx_current_msec + pto;
>>>>> +
>>>>
>>>> Setting path->expires there seems useless.
>>>> It don't seem to be used in ngx_quic_send_path_mtu_probe(),
>>>> and ngx_quic_discover_path_mtu() used to reset path->expires.
>>>
>>> If ngx_quic_send_path_mtu_probe() below returns NGX_OK, the path should stay
>>> in the same state, but expiration time should be advanced. To make it
>>> more obvious, I changed this code.
>>
>> Updated code makes it clear, thanks.
>>
>>>
>>>>> + rc = ngx_quic_send_path_mtu_probe(c, path);
>>>>> + if (rc != NGX_DECLINED) {
>>>>> + return rc;
>>>>> }
>>>>> }
>>>>>
>>>>> - if (next != -1) {
>>>>> - ngx_add_timer(&qc->path_validation, next);
>>>>> + ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
>>>>> + "quic path seq:%uL expired mtu:%uz",
>>>>> + path->seqnum, path->mtud);
>>>>> +
>>>>> + path->max_mtu = path->mtud;
>>>>> +
>>>>> + ngx_quic_discover_path_mtu(c, path);
>>>>> +
>>>>> + return NGX_OK;
>>>>> +}
>>>>> +
>>>>> +
>>>>> +static ngx_int_t
>>>>> +ngx_quic_send_path_mtu_probe(ngx_connection_t *c, ngx_quic_path_t *path)
>>>>> +{
>>>>> + ngx_int_t rc;
>>>>> + ngx_uint_t log_error;
>>>>> + ngx_quic_frame_t frame;
>>>>> + ngx_quic_send_ctx_t *ctx;
>>>>> + ngx_quic_connection_t *qc;
>>>>> +
>>>>> + ngx_memzero(&frame, sizeof(ngx_quic_frame_t));
>>>>> +
>>>>> + frame.level = ssl_encryption_application;
>>>>> + frame.type = NGX_QUIC_FT_PING;
>>>>> +
>>>>> + qc = ngx_quic_get_connection(c);
>>>>> + ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
>>>>> + path->mtu_pnum[path->tries] = ctx->pnum;
>>>>> +
>>>>> + ngx_log_debug4(NGX_LOG_DEBUG_EVENT, c->log, 0,
>>>>> + "quic path seq:%uL send probe "
>>>>> + "mtu:%uz pnum:%uL tries:%ui",
>>>>> + path->seqnum, path->mtud, ctx->pnum, path->tries);
>>>>> +
>>>>> + log_error = c->log_error;
>>>>> + c->log_error = NGX_ERROR_IGNORE_EMSGSIZE;
>>>>> +
>>>>> + rc = ngx_quic_frame_sendto(c, &frame, path->mtud, path);
>>>>> + c->log_error = log_error;
>>>>> +
>>>>> + if (rc == NGX_ERROR) {
>>>>> + if (c->write->error) {
>>>>> + c->write->error = 0;
>>>>> +
>>>>> + ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
>>>>> + "quic path seq:%uL rejected mtu:%uz",
>>>>> + path->seqnum, path->mtud);
>>>>> +
>>>>> + return NGX_DECLINED;
>>>>
>>>> The error handling looks awkward to decline on write error.
>>>> ngx_sendmsg() sets write error on fatal sendmsg errors,
>>>> and keeps it unset NGX_EAGAIN and NGX_EINTR, which
>>>> are typically non-fatal transient errors.
>>>> I understand this is done mostly to ignore EMSGSIZE,
>>>> maybe it makes sense to ignore the rest sendmsg() errors.
>>>
>>> The problem is, the error doesn't make it to this point. All be have here is
>>> NGX_ERROR and c->write->error. I think, for PMTUD purposes we can safely
>>> ignore the errors. It's hinghly likely, the error will repeat while doing
>>> connection I/O and we'll shortly close the connection.
>>
>> Ok, I re-read ngx_sendmsg() to clarify the following:
>> - NGX_EINTR is handled internally and never returned up to here;
>> - NGX_EAGAIN (NGX_AGAIN) seems to be returned, at least on Linux,
>> when send buffer is low for the MTU probe, converted to NGX_OK;
>> - FTR, looking at sosend_dgram() on FreeBSD, sendmsg seems to
>> return EMSGSIZE there, instead, on low socket buffer space;
>> - other sendmsg errors are returned as NGX_ERROR, and
>> c->write->error is set; this is converted to NGX_DECLINED;
>> - non-sendmsg NGX_ERRORs appeared in ngx_quic_frame_sendto()
>> are passed as is without conversion.
>>
>> So, the error handling as implemented to decline on any sendmsg
>> "fatal" errors actually makes sense, if we'd like to avoid rewriting
>> ngx_sendmsg() to further distinguish sendmsg errors, notably EMSGSIZE.
>> The downside is that connection may ignore actually fatal errors,
>> though the error will likely repeat on sending non-probing frames.
>>
>> NGX_AGAIN conversion to NGX_OK is questionable, because this
>> makes a probe considered successfully sent while it is not.
>> I wonder though what we can do about that other than to simply
>> ignore, similar to other ngx_quic_frame_sendto() callers,
>> because next time on a probe retry sendmsg can return success.
>
> Ignoring certain send error for probes is not a big deal since timeout
> expiration will lead to the same result eventually.
>
>>>>> + }
>>>>> +
>>>>> + return NGX_ERROR;
>>>>> }
>>>>> +
>>>>> + return NGX_OK;
>>>>> }
>>>>> +
>>>>> +
>>>>> +ngx_int_t
>>>>> +ngx_quic_handle_path_mtu_ack(ngx_connection_t *c, ngx_quic_path_t *path,
>>>>> + uint64_t min, uint64_t max)
>>>>> +{
>>>>> + uint64_t pnum;
>>>>> + ngx_uint_t i;
>>>>> +
>>>>> + if (path->state != NGX_QUIC_PATH_DISCOVERING_MTU) {
>>>>> + return NGX_OK;
>>>>> + }
>>>>> +
>>>>> + for (i = 0; i < NGX_QUIC_PATH_RETRIES; i++) {
>>>>> + pnum = path->mtu_pnum[i];
>>>>> +
>>>>> + if (pnum == NGX_QUIC_UNSET_PN) {
>>>>> + break;
>>>>> + }
>>>>> +
>>>>> + if (pnum < min || pnum > max) {
>>>>> + continue;
>>>>> + }
>>>>> +
>>>>> + path->mtu = path->mtud;
>>>>> +
>>>>> + ngx_log_debug2(NGX_LOG_DEBUG_EVENT, c->log, 0,
>>>>> + "quic path seq:%uL ack mtu:%uz",
>>>>> + path->seqnum, path->mtu);
>>>>> +
>>>>> + ngx_quic_discover_path_mtu(c, path);
>>>>> +
>>>>> + break;
>>>>
>>>> What if two probes appeared ACKed in one range.
>>>> Though that seems to be safe to break there, as two
>>>> probes barely have different probe sizes, rather retries.
>>>
>>> No matter how many probes are acked, the next step is still the same.
>>> So there's no reason to find all of them.
>>>
>>
>> Ok, let's leave it as is.
>>
>>>>> + }
>>>>> +
>>>>> + return NGX_OK;
>>>>> +}
>>>>> diff --git a/src/event/quic/ngx_event_quic_migration.h b/src/event/quic/ngx_event_quic_migration.h
>>>>> --- a/src/event/quic/ngx_event_quic_migration.h
>>>>> +++ b/src/event/quic/ngx_event_quic_migration.h
>>>>> @@ -18,10 +18,10 @@
>>>>> #define NGX_QUIC_PATH_BACKUP 2
>>>>>
>>>>> #define ngx_quic_path_dbg(c, msg, path) \
>>>>> - ngx_log_debug6(NGX_LOG_DEBUG_EVENT, c->log, 0, \
>>>>> - "quic path seq:%uL %s sent:%O recvd:%O state:%s%s", \
>>>>> + ngx_log_debug7(NGX_LOG_DEBUG_EVENT, c->log, 0, \
>>>>> + "quic path seq:%uL %s tx:%O rx:%O valid:%d st:%d mtu:%uz", \
>>>>> path->seqnum, msg, path->sent, path->received, \
>>>>> - path->validated ? "V": "N", path->validating ? "R": "");
>>>>> + path->validated, path->state, path->mtu);
>>>>>
>>>>> ngx_int_t ngx_quic_handle_path_challenge_frame(ngx_connection_t *c,
>>>>> ngx_quic_header_t *pkt, ngx_quic_path_challenge_frame_t *f);
>>>>> @@ -36,6 +36,10 @@ ngx_int_t ngx_quic_set_path(ngx_connecti
>>>>> ngx_int_t ngx_quic_handle_migration(ngx_connection_t *c,
>>>>> ngx_quic_header_t *pkt);
>>>>>
>>>>> -void ngx_quic_path_validation_handler(ngx_event_t *ev);
>>>>> +void ngx_quic_path_handler(ngx_event_t *ev);
>>>>> +
>>>>> +void ngx_quic_discover_path_mtu(ngx_connection_t *c, ngx_quic_path_t *path);
>>>>> +ngx_int_t ngx_quic_handle_path_mtu_ack(ngx_connection_t *c,
>>>>> + ngx_quic_path_t *path, uint64_t min, uint64_t max);
>>>>
>>>> Nitpicking:
>>>> maybe just ngx_quic_handle_path_mtu() ?
>>>> This would match ngx_quic_discover_path_mtu() naming.
>>>
>>> OK, renamed.
>>>
>>>>>
>>>>> #endif /* _NGX_EVENT_QUIC_MIGRATION_H_INCLUDED_ */
>>>>> diff --git a/src/event/quic/ngx_event_quic_output.c b/src/event/quic/ngx_event_quic_output.c
>>>>> --- a/src/event/quic/ngx_event_quic_output.c
>>>>> +++ b/src/event/quic/ngx_event_quic_output.c
>>>>> @@ -10,9 +10,6 @@
>>>>> #include <ngx_event_quic_connection.h>
>>>>>
>>>>>
>>>>> -#define NGX_QUIC_MAX_UDP_PAYLOAD_OUT 1252
>>>>> -#define NGX_QUIC_MAX_UDP_PAYLOAD_OUT6 1232
>>>>> -
>>>>> #define NGX_QUIC_MAX_UDP_SEGMENT_BUF 65487 /* 65K - IPv6 header */
>>>>> #define NGX_QUIC_MAX_SEGMENTS 64 /* UDP_MAX_SEGMENTS */
>>>>>
>>>>> @@ -61,21 +58,6 @@ static size_t ngx_quic_path_limit(ngx_co
>>>>> size_t size);
>>>>>
>>>>>
>>>>> -size_t
>>>>> -ngx_quic_max_udp_payload(ngx_connection_t *c)
>>>>> -{
>>>>> - /* TODO: path MTU discovery */
>>>>> -
>>>>> -#if (NGX_HAVE_INET6)
>>>>> - if (c->sockaddr->sa_family == AF_INET6) {
>>>>> - return NGX_QUIC_MAX_UDP_PAYLOAD_OUT6;
>>>>> - }
>>>>> -#endif
>>>>> -
>>>>> - return NGX_QUIC_MAX_UDP_PAYLOAD_OUT;
>>>>> -}
>>>>> -
>>>>> -
>>>>> ngx_int_t
>>>>> ngx_quic_output(ngx_connection_t *c)
>>>>> {
>>>>> @@ -142,10 +124,7 @@ ngx_quic_create_datagrams(ngx_connection
>>>>>
>>>>> p = dst;
>>>>>
>>>>> - len = ngx_min(qc->ctp.max_udp_payload_size,
>>>>> - NGX_QUIC_MAX_UDP_PAYLOAD_SIZE);
>>>>> -
>>>>> - len = ngx_quic_path_limit(c, path, len);
>>>>> + len = ngx_quic_path_limit(c, path, path->mtu);
>>>>>
>>>>> pad = ngx_quic_get_padding_level(c);
>>>>>
>>>>> @@ -299,9 +278,7 @@ ngx_quic_allow_segmentation(ngx_connecti
>>>>> ctx = ngx_quic_get_send_ctx(qc, ssl_encryption_application);
>>>>>
>>>>> bytes = 0;
>>>>> -
>>>>> - len = ngx_min(qc->ctp.max_udp_payload_size,
>>>>> - NGX_QUIC_MAX_UDP_SEGMENT_BUF);
>>>>> + len = ngx_min(qc->path->mtu, NGX_QUIC_MAX_UDP_SEGMENT_BUF);
>>>>>
>>>>> for (q = ngx_queue_head(&ctx->frames);
>>>>> q != ngx_queue_sentinel(&ctx->frames);
>>>>> @@ -345,8 +322,7 @@ ngx_quic_create_segments(ngx_connection_
>>>>> return NGX_ERROR;
>>>>> }
>>>>>
>>>>> - segsize = ngx_min(qc->ctp.max_udp_payload_size,
>>>>> - NGX_QUIC_MAX_UDP_SEGMENT_BUF);
>>>>> + segsize = ngx_min(path->mtu, NGX_QUIC_MAX_UDP_SEGMENT_BUF);
>>>>> p = dst;
>>>>> end = dst + sizeof(dst);
>>>>>
>>>>> diff --git a/src/event/quic/ngx_event_quic_output.h b/src/event/quic/ngx_event_quic_output.h
>>>>> --- a/src/event/quic/ngx_event_quic_output.h
>>>>> +++ b/src/event/quic/ngx_event_quic_output.h
>>>>> @@ -12,8 +12,6 @@
>>>>> #include <ngx_core.h>
>>>>>
>>>>>
>>>>> -size_t ngx_quic_max_udp_payload(ngx_connection_t *c);
>>>>> -
>>>>> ngx_int_t ngx_quic_output(ngx_connection_t *c);
>>>>>
>>>>> ngx_int_t ngx_quic_negotiate_version(ngx_connection_t *c,
>>>>> diff --git a/src/event/quic/ngx_event_quic_ssl.c b/src/event/quic/ngx_event_quic_ssl.c
>>>>> --- a/src/event/quic/ngx_event_quic_ssl.c
>>>>> +++ b/src/event/quic/ngx_event_quic_ssl.c
>>>>> @@ -494,6 +494,8 @@ ngx_quic_crypto_input(ngx_connection_t *
>>>>> */
>>>>> ngx_quic_discard_ctx(c, ssl_encryption_handshake);
>>>>>
>>>>> + ngx_quic_discover_path_mtu(c, qc->path);
>>>>> +
>>>>> /* start accepting clients on negotiated number of server ids */
>>>>> if (ngx_quic_create_sockets(c) != NGX_OK) {
>>>>> return NGX_ERROR;
>>>>> diff --git a/src/os/unix/ngx_errno.h b/src/os/unix/ngx_errno.h
>>>>> --- a/src/os/unix/ngx_errno.h
>>>>> +++ b/src/os/unix/ngx_errno.h
>>>>> @@ -54,6 +54,7 @@ typedef int ngx_err_t;
>>>>> #define NGX_ENOMOREFILES 0
>>>>> #define NGX_ELOOP ELOOP
>>>>> #define NGX_EBADF EBADF
>>>>> +#define NGX_EMSGSIZE EMSGSIZE
>>>>>
>>>>> #if (NGX_HAVE_OPENAT)
>>>>> #define NGX_EMLINK EMLINK
>>
>> This will break on win32, an NGX_EMSGSIZE definition needs to be
>> provided there, which is WSAEMSGSIZE.
>
> Thanks, fixed.
>
>>>
>>> The diff is attached.
>>>
>>
>> The diff looks good.
>
> Also, with step-based mtu step updates, QUIC flood detection is triggered with
> the h3_keepalive.t on interfaces with large mtu. I think, doubling mtu
> is a safer choice.
While optimizing for theoretical maximum is not something to accent,
it is certainly expected to work well on various links. Below are some
numbers to consider, taken on various PMTU and interval/search strategy.
= 20 streams, 1mb response each, pmtu=65536 (sampled 3 times)
pn/ack probes last total bytes
== 1000ms + 256
1 21 6.320 79.472
2 18 5.808 65.376
3 17 5.552 59.568
4 15 2.400 33.984
== 100ms + 256
1 263 65.500 8.962.971
2 256 65.500 8.708.555
3 268 65.500 9.188.363
== 100ms * 2
1 17-24 65.513 833.607-1.286.679
2 23-24 65.513 1.190.607-1.255.710
3 20-23 65.513 996.252-1.198.820
4 18 65.513 872.007
It is clearly seen from the above:
- "1000ms + 256" (baseline) does not converge after 20 MB
- "100ms + 256" converges but hits flood detection limit,
which is 8 MB of probes if no stream traffic in the connection.
- "100ms * 2" results in ~ 1 MB of total probe traffic
This explains how h3_keepalive.t is failed.
Below "100ms * 2" is also seen with better avg payload size.
Compared with pto and ack-only packets cleared from raw data,
seen increased with "100ms * 2" if client ack's with frequency
less than on every second packet, which messed up avg stats.
= 20 streams, 1mb response each, pmtu=65536 (sampled 3 times)
pn/ack pn stat
== 1000ms + 256
1 N=8245 min=3 max=6033 m=2437 avg=2552 std=1750
2 N=8481 min=3 max=5733 m=2296 avg=2486 std=1453
3 N=8703 min=3 max=5517 m=2189 avg=2424 std=1361
4 N=11237 min=3 max=2349 m=2349 avg=1879 std=770
== 100ms + 256
1 N=3757 min=3 max=18573 m=3494 avg=5440 std=5226
2 N=3696 min=3 max=17037 m=4493 avg=5686 std=4475
3 N=3745 min=3 max=16525 m=4493 avg=5657 std=4308
== 100ms * 2
1 N=1956 min=3 max=65465 m=1165 avg=10757 std=23394
2 N=1150 min=3 max=65465 m=2320 avg=18264 std=26491
3 N=1411 min=3 max=65465 m=1165 avg=14879 std=25476
4 N=1691 min=3 max=65465 m=548 avg=12419 std=23854
== after clearing pto and ack-only
1* N=1910 min=9 max=65465 m=1165 avg=11014 std=23612
2* N=1095 min=10 max=65465 m=2365 avg=19181 std=26823
3* N=1117 min=12 max=65465 m=2193 avg=18798 std=27320
4* N=1145 min=9 max=65465 m=2365 avg=18333 std=27049
As seen from the above, "100ms * 2" represents worse median,
even with pto/ack-only cleared, which may be a sign of poor
packing to maximum packet size, resulting in short packets.
On a typical PMTU of up to 1500, the difference is leveled
(captured with pn/ack=2)
= 20 streams, 1mb response each, pmtu=1500 (sampled 3 times)
x 1000_256
+ 100_256
* 100_x2
N Min Max Median Avg Stddev
x 16443 3 1437 1437 1283.7392 414.27811
+ 16377 3 1437 1437 1288.8831 412.92124
No difference proven at 95.0% confidence
* 16398 3 1436 1436 1287.2667 413.79561
No difference proven at 95.0% confidence
pn/ack probes last total bytes
== 1000ms + 256
2 6 1472 9232
== 100ms + 256
2 6 1472 9232
== 100ms * 2
2 8 1471 12889
>
> Also, while looking in migration test logs, I thought it might be a good idea to
> only handle validation/mtu events for the currently active path.
>
Can you elaborate?
> Including 3 patch updates:
>
> - the last one + win32 fix + format fix in ngx_quic_path_dbg()
> - x2 mtu
> - single-path event handling
>
> --
> Roman Arutyunyan
> <quic-pmtud-fix.txt><quic-pmtud-scale-x2.txt><quic-pmtud-fix-2.txt>_______________________________________________
> nginx-devel mailing list
> nginx-devel at nginx.org
> https://mailman.nginx.org/mailman/listinfo/nginx-devel
--
Sergey Kandaurov
More information about the nginx-devel
mailing list