[BUG?]fail_timeout/max_fails: code doesn't do what doc says
Dmitry Popov
dp at highloadlab.com
Tue May 21 12:16:05 UTC 2013
On Mon, 20 May 2013 21:11:03 +0400
Maxim Dounin <mdounin at mdounin.ru> wrote:
> Patch:
>
> diff --git a/src/http/modules/ngx_http_upstream_least_conn_module.c b/src/http/modules/ngx_http_upstream_least_conn_module.c
> --- a/src/http/modules/ngx_http_upstream_least_conn_module.c
> +++ b/src/http/modules/ngx_http_upstream_least_conn_module.c
> @@ -282,7 +282,10 @@ ngx_http_upstream_get_least_conn_peer(ng
> }
>
> best->current_weight -= total;
> - best->checked = now;
> +
> + if (now - best->checked > best->fail_timeout) {
> + best->checked = now;
> + }
>
> pc->sockaddr = best->sockaddr;
> pc->socklen = best->socklen;
> diff --git a/src/http/ngx_http_upstream_round_robin.c b/src/http/ngx_http_upstream_round_robin.c
> --- a/src/http/ngx_http_upstream_round_robin.c
> +++ b/src/http/ngx_http_upstream_round_robin.c
> @@ -523,7 +523,10 @@ ngx_http_upstream_get_peer(ngx_http_upst
> rrp->tried[n] |= m;
>
> best->current_weight -= total;
> - best->checked = now;
> +
> + if (now - best->checked > best->fail_timeout) {
> + best->checked = now;
> + }
>
> return best;
> }
Still a bug exists because of this code:
if (state & NGX_PEER_FAILED) {
/* ... */
peer->fails++;
peer->accessed = now;
peer->checked = now;
/* .... */
} else {
if (peer->accessed < peer->checked) {
peer->fails = 0;
}
}
Consider upstream which fails at least once each second (actually it's enough
to fail each fail_timeout seconds). This upstream will be disabled as soon as it
will fail max_fails times no matter what fail_timeout is.
I propose the following solution:
let's have two bits per peer:
1) avail - set if this peer is available for usage (fails < max_fails)
2) avail_check - set if this peer was disabled, but its fail_timeout is expired and
we sent one request to check if it's alive
and two timestamps:
3) first_fail - we'll count number of fails only inside
[first_fail, first_fail + fail_timeout) interval
4) unavail_from - peer is unavailable inside
[unavail_from, unavail_from + fail_timeout) interval
first_fail will only be used if avail == 1, unavail_from only if avail == 0,
so we may use union here.
Simplified code:
get:
if (!avail
&& (now < unavail_from + fail_timeout || avail_check))
{
continue;
}
if (!avail)
avail_check =1;
free:
if (avail && now >= first_fail + fail_timeout)
fails = 0;
if (request_failed) {
if (fails == 0)
first_fail = now;
fails++;
if (fails >= max_fails) {
avail = 0;
unavail_from = now;
}
avail_check = 0;
} else {
if (avail_check) {
fails = 0;
avail = 1;
avail_check = 0;
}
}
Also I think it's a good idea to split fail_timeout to fail_timeout and
disable_time:
we'll count fails during fail_timeout and disable upstream for disable_time.
patch with bugfix:
Upstreams bugfix: fail_timeout could be ignored
Upstream could be disabled no matter what fail_timeout is (if requests fail
each fail_timeout seconds).
Signed-off-by: Dmitry Popov <dp at highloadlab.com>
diff -ur old/src/http/ngx_http_upstream_round_robin.h new/src/http/ngx_http_upstream_round_robin.h
--- old/src/http/ngx_http_upstream_round_robin.h 2013-05-21 13:22:09.478903537 +0400
+++ new/src/http/ngx_http_upstream_round_robin.h 2013-05-21 15:38:05.488236353 +0400
@@ -24,13 +24,17 @@
ngx_int_t weight;
ngx_uint_t fails;
- time_t accessed;
- time_t checked;
+ union {
+ time_t first_fail;
+ time_t unavail_from;
+ };
ngx_uint_t max_PCRE: retain input pattern for all regular expressions.
Previously, input pattern was kept only for regular expressions
with named captures, which resulted in error log entries without
input pattern for PCRE errors that occured while processing
regular expressions without them.
Signed-off-by: Piotr Sikora <piotr at cloudflare.com>
fails;
time_t fail_timeout;
- ngx_uint_t down; /* unsigned down:1; */
+ unsigned avail:1;
+ unsigned avail_check:1;
+ unsigned down:1;
#if (NGX_HTTP_SSL)
ngx_ssl_session_t *ssl_session; /* local to a process */
diff -ur old/src/http/modules/ngx_http_upstream_ip_hash_module.c new/src/http/modules/ngx_http_upstream_ip_hash_module.c
--- old/src/http/modules/ngx_http_upstream_ip_hash_module.c 2013-05-21 13:22:09.475570203 +0400
+++ new/src/http/modules/ngx_http_upstream_ip_hash_module.c 2013-05-21 15:44:22.671564821 +0400
@@ -208,12 +208,14 @@
if (!peer->down) {
- if (peer->max_fails == 0 || peer->fails < peer->max_fails) {
+ if (peer->avail) {
break;
}
- if (now - peer->checked > peer->fail_timeout) {
- peer->checked = now;
+ if (now - peer->unavail_from >= peer->fail_timeout
+ && !peer->avail_check)
+ {
+ peer->avail_check = 1;
break;
}
}
diff -ur old/src/http/modules/ngx_http_upstream_least_conn_module.c new/src/http/modules/ngx_http_upstream_least_conn_module.c
--- old/src/http/modules/ngx_http_upstream_least_conn_module.c 2013-05-21 13:22:09.472236869 +0400
+++ new/src/http/modules/ngx_http_upstream_least_conn_module.c 2013-05-21 15:45:35.254897219 +0400
@@ -203,9 +203,9 @@
continue;
}
- if (peer->max_fails
- && peer->fails >= peer->max_fails
- && now - peer->checked <= peer->fail_timeout)
+ if (!peer->avail
+ && (now - peer->unavail_from < peer->fail_timeout
+ || peer->avail_check))
{
continue;
}
@@ -260,9 +260,9 @@
continue;
}
- if (peer->max_fails
- && peer->fails >= peer->max_fails
- && now - peer->checked <= peer->fail_timeout)
+ if (!peer->avail
+ && (now - peer->unavail_from < peer->fail_timeout
+ || peer->avail_check))
{
continue;
}
@@ -282,7 +282,9 @@
}
best->current_weight -= total;
- best->checked = now;
+ if (!best->avail) {
+ best->avail_check = 1;
+ }
pc->sockaddr = best->sockaddr;
pc->socklen = best->socklen;
@@ -331,6 +333,9 @@
for (i = 0; i < peers->number; i++) {
peers->peer[i].fails = 0;
+ peers->peer[i].avail = 1;
+ peers->peer[i].avail_check = 0;
+ /* peers->peer[i].first_fail = 0; */
}
pc->name = peers->name;
diff -ur old/src/http/ngx_http_upstream_round_robin.c new/src/http/ngx_http_upstream_round_robin.c
--- old/src/http/ngx_http_upstream_round_robin.c 2013-05-21 13:22:09.472236869 +0400
+++ new/src/http/ngx_http_upstream_round_robin.c 2013-05-21 15:46:07.171563474 +0400
@@ -85,6 +85,8 @@
peers->peer[n].weight = server[i].weight;
peers->peer[n].effective_weight = server[i].weight;
peers->peer[n].current_weight = 0;
+ peers->peer[n].avail = 1;
+ peers->peer[n].avail_check = 0;
n++;
}
}
@@ -139,6 +141,8 @@
backup->peer[n].max_fails = server[i].max_fails;
backup->peer[n].fail_timeout = server[i].fail_timeout;
backup->peer[n].down = server[i].down;
+ backup->peer[n].avail = 1;
+ backup->peer[n].avail_check = 0;
n++;
}
}
@@ -196,6 +200,8 @@
peers->peer[i].current_weight = 0;
peers->peer[i].max_fails = 1;
peers->peer[i].fail_timeout = 10;
+ peers->peer[i].avail = 1;
+ peers->peer[i].avail_check = 0;
}
us->peer.data = peers;
@@ -301,6 +307,8 @@
peers->peer[0].current_weight = 0;
peers->peer[0].max_fails = 1;
peers->peer[0].fail_timeout = 10;
+ peers->peer[0].avail = 1;
+ peers->peer[0].avail_check = 0;
} else {
@@ -334,6 +342,8 @@
peers->peer[i].current_weight = 0;
peers->peer[i].max_fails = 1;
peers->peer[i].fail_timeout = 10;
+ peers->peer[i].avail = 1;
+ peers->peer[i].avail_check = 0;
}
}
@@ -451,6 +461,9 @@
for (i = 0; i < peers->number; i++) {
peers->peer[i].fails = 0;
+ peers->peer[i].avail = 1;
+ peers->peer[i].avail_check = 0;
+ /* peers->peer[i].first_fail = 0; */
}
/* ngx_unlock_mutex(peers->mutex); */
@@ -490,9 +503,9 @@
continue;
}
- if (peer->max_fails
- && peer->fails >= peer->max_fails
- && now - peer->checked <= peer->fail_timeout)
+ if (!peer->avail
+ && (now - peer->unavail_from < peer->fail_timeout
+ || peer->avail_check))
{
continue;
}
@@ -523,7 +536,9 @@
rrp->tried[n] |= m;
best->current_weight -= total;
- best->checked = now;
+ if (!best->avail) {
+ best->avail_check = 1;
+ }
return best;
}
@@ -550,35 +565,49 @@
peer = &rrp->peers->peer[rrp->current];
- if (state & NGX_PEER_FAILED) {
+ if (peer->max_fails) {
now = ngx_time();
- /* ngx_lock_mutex(rrp->peers->mutex); */
+ if (peer->avail && now - peer->first_fail >= peer->fail_timeout) {
+ peer->fails = 0;
+ }
+
+ if (state & NGX_PEER_FAILED) {
+ /* ngx_lock_mutex(rrp->peers->mutex); */
+
+ if (peer->fails++ == 0) {
+ peer->first_fail = now;
+ }
+
+ if (peer->fails >= peer->max_fails) {
+ peer->avail = 0;
+ peer->unavail_from = now;
+ }
- peer->fails++;
- peer->accessed = now;
- peer->checked = now;
+ peer->avail_check = 0;
- if (peer->max_fails) {
peer->effective_weight -= peer->weight / peer->max_fails;
- }
- ngx_log_debug2(NGX_LOG_DEBUG_HTTP, pc->log, 0,
- "free rr peer failed: %ui %i",
- rrp->current, peer->effective_weight);
+ ngx_log_debug2(NGX_LOG_DEBUG_HTTP, pc->log, 0,
+ "free rr peer failed: %ui %i",
+ rrp->current, peer->effective_weight);
- if (peer->effective_weight < 0) {
- peer->effective_weight = 0;
- }
+ if (peer->effective_weight < 0) {
+ peer->effective_weight = 0;
+ }
- /* ngx_unlock_mutex(rrp->peers->mutex); */
+ /* ngx_unlock_mutex(rrp->peers->mutex); */
- } else {
+ } else {
- /* mark peer live if check passed */
+ /* mark peer live if check passed */
- if (peer->accessed < peer->checked) {
- peer->fails = 0;
+ if (peer->avail_check) {
+ peer->fails = 0;
+ peer->avail = 1;
+ peer->avail_check = 0;
+ /* peers->peer[i].first_fail = 0; */
+ }
}
}
--
Dmitry Popov
Highloadlab
More information about the nginx-devel
mailing list