[Patch] SO_REUSEPORT support from master process

Lu, Yingqi yingqi.lu at intel.com
Thu Sep 18 20:05:39 UTC 2014


Dear All,

Here is the updated patch that enables SO_REUSEPORT support on Linux (attached below). The changes in this version are:

1. Solve the issue with the "binary upgrade on the fly" feature. Thanks to Sepherosa Ziehau for his feedback! With this version of the patch, there are no issues or connection losses during a binary upgrade. I tested on RHEL 6.5 (with kernel 3.13.9) and CentOS 7; both work fine. The new master process inherits all of the previous listen sockets during the upgrade, and the new child processes inherit them from the new master. Also, the workload data show no connection loss or performance impact during the binary upgrade.

2. Make the duplication of listen sockets happen in the function ngx_http_init_listening instead of ngx_init_cycle. 

Please review it and let me know your questions and comments. Thanks very much for your time reviewing the patch.

Thanks,
Yingqi Lu

# HG changeset patch
# User Yingqi Lu <Yingqi.Lu at intel.com>
# Date 1411067116 25200
#      Thu Sep 18 12:05:16 2014 -0700
# Node ID 222e0a18a7fd4d61d219ed6e7a2443716978cc71
# Parent  45aef9a5b176d8e522277b9abf6c09fb874ab044
These are the patch files to enable SO_REUSEPORT support (patch is based on nginx-106a8bfa4f42.tar.gz)

diff -r 45aef9a5b176 -r 222e0a18a7fd src/core/ngx_connection.c
--- a/src/core/ngx_connection.c	Thu Sep 18 12:03:41 2014 -0700
+++ b/src/core/ngx_connection.c	Thu Sep 18 12:05:16 2014 -0700
@@ -304,7 +304,7 @@
 ngx_int_t
 ngx_open_listening_sockets(ngx_cycle_t *cycle)
 {
-    int               reuseaddr;
+    int               reuseaddr, reuseport;
     ngx_uint_t        i, tries, failed;
     ngx_err_t         err;
     ngx_log_t        *log;
@@ -312,6 +312,7 @@
     ngx_listening_t  *ls;
 
     reuseaddr = 1;
+    reuseport = 1;
 #if (NGX_SUPPRESS_WARN)
     failed = 0;
 #endif
@@ -369,6 +370,23 @@
 
                 return NGX_ERROR;
             }
+            if (ngx_so_reuseport_enabled)
+            {
+                if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT,
+                               (const void *) &reuseport, sizeof(int))
+                    == -1) {
+                    ngx_log_error(NGX_LOG_EMERG, log, ngx_socket_errno,
+                                  "setsockopt(SO_REUSEPORT) %V failed",
+                                  &ls[i].addr_text);
+                    if (ngx_close_socket(s) == -1) {
+                        ngx_log_error(NGX_LOG_EMERG, log, ngx_socket_errno,
+                                      ngx_close_socket_n " %V failed",
+                                      &ls[i].addr_text);
+                    }
+
+                    return NGX_ERROR;
+                }
+            }
 
 #if (NGX_HAVE_INET6 && defined IPV6_V6ONLY)
 
diff -r 45aef9a5b176 -r 222e0a18a7fd src/core/ngx_cycle.c
--- a/src/core/ngx_cycle.c	Thu Sep 18 12:03:41 2014 -0700
+++ b/src/core/ngx_cycle.c	Thu Sep 18 12:05:16 2014 -0700
@@ -26,6 +26,9 @@
 ngx_uint_t             ngx_test_config;
 ngx_uint_t             ngx_quiet_mode;
 
+ngx_uint_t             ngx_so_reuseport_enabled;
+ngx_uint_t             ngx_num_dup_sockets;
+
 #if (NGX_THREADS)
 ngx_tls_key_t          ngx_core_tls_key;
 #endif
@@ -54,7 +57,36 @@
     ngx_core_conf_t     *ccf, *old_ccf;
     ngx_core_module_t   *module;
     char                 hostname[NGX_MAXHOSTNAMELEN];
+    ngx_uint_t           num_cores, taken;
+    ngx_socket_t         temp_s;
+    int                  one = 1;
 
+    ngx_so_reuseport_enabled = 0;
+    temp_s = ngx_socket(AF_INET, SOCK_STREAM, 0);
+#ifndef SO_REUSEPORT
+#define SO_REUSEPORT 15
+#endif
+    if (setsockopt(temp_s, SOL_SOCKET, SO_REUSEPORT,
+                  (const void *) &one, sizeof(int)) == 0) {
+        ngx_so_reuseport_enabled = 1;
+    }
+    ngx_close_socket(temp_s);
+
+    if (ngx_so_reuseport_enabled) {
+#ifdef _SC_NPROCESSORS_ONLN
+        num_cores = sysconf(_SC_NPROCESSORS_ONLN);
+#else
+        num_cores = 1;
+#endif
+        if (num_cores > 8) {
+            ngx_num_dup_sockets = num_cores/8;
+        } else {
+            ngx_num_dup_sockets = 1;
+        }
+    } else {
+        ngx_num_dup_sockets = 1;
+    }    
+   
     ngx_timezone_update();
 
     /* force localtime update with a new timezone */
@@ -114,7 +146,7 @@
     }
 
 
-    n = old_cycle->paths.nelts ? old_cycle->paths.nelts : 10;
+    n = old_cycle->paths.nelts ? old_cycle->paths.nelts : 10 * ngx_num_dup_sockets;
 
     cycle->paths.elts = ngx_pcalloc(pool, n * sizeof(ngx_path_t *));
     if (cycle->paths.elts == NULL) {
@@ -164,7 +196,7 @@
         return NULL;
     }
 
-    n = old_cycle->listening.nelts ? old_cycle->listening.nelts : 10;
+    n = old_cycle->listening.nelts ? old_cycle->listening.nelts : 10 * ngx_num_dup_sockets;
 
     cycle->listening.elts = ngx_pcalloc(pool, n * sizeof(ngx_listening_t));
     if (cycle->listening.elts == NULL) {
@@ -231,7 +263,7 @@
 
     ngx_memzero(&conf, sizeof(ngx_conf_t));
     /* STUB: init array ? */
-    conf.args = ngx_array_create(pool, 10, sizeof(ngx_str_t));
+    conf.args = ngx_array_create(pool, (10 * ngx_num_dup_sockets), sizeof(ngx_str_t));
     if (conf.args == NULL) {
         ngx_destroy_pool(pool);
         return NULL;
@@ -486,6 +518,7 @@
         }
 
         nls = cycle->listening.elts;
+        taken = 0;
         for (n = 0; n < cycle->listening.nelts; n++) {
 
             for (i = 0; i < old_cycle->listening.nelts; i++) {
@@ -493,9 +526,9 @@
                     continue;
                 }
 
-                if (ngx_cmp_sockaddr(nls[n].sockaddr, nls[n].socklen,
+                if ((ngx_cmp_sockaddr(nls[n].sockaddr, nls[n].socklen,
                                      ls[i].sockaddr, ls[i].socklen, 1)
-                    == NGX_OK)
+                    == NGX_OK) && i >= taken)
                 {
                     nls[n].fd = ls[i].fd;
                     nls[n].previous = &ls[i];
@@ -540,6 +573,7 @@
                         nls[n].add_deferred = 1;
                     }
 #endif
+                    taken = i + 1;
                     break;
                 }
             }
@@ -747,7 +781,7 @@
             exit(1);
         }
 
-        n = 10;
+        n = 10 * ngx_num_dup_sockets;
         ngx_old_cycles.elts = ngx_pcalloc(ngx_temp_pool,
                                           n * sizeof(ngx_cycle_t *));
         if (ngx_old_cycles.elts == NULL) {
diff -r 45aef9a5b176 -r 222e0a18a7fd src/core/ngx_cycle.h
--- a/src/core/ngx_cycle.h	Thu Sep 18 12:03:41 2014 -0700
+++ b/src/core/ngx_cycle.h	Thu Sep 18 12:05:16 2014 -0700
@@ -136,6 +136,8 @@
 extern ngx_module_t           ngx_core_module;
 extern ngx_uint_t             ngx_test_config;
 extern ngx_uint_t             ngx_quiet_mode;
+extern ngx_uint_t             ngx_so_reuseport_enabled;
+extern ngx_uint_t             ngx_num_dup_sockets;
 #if (NGX_THREADS)
 extern ngx_tls_key_t          ngx_core_tls_key;
 #endif
diff -r 45aef9a5b176 -r 222e0a18a7fd src/http/ngx_http.c
--- a/src/http/ngx_http.c	Thu Sep 18 12:03:41 2014 -0700
+++ b/src/http/ngx_http.c	Thu Sep 18 12:05:16 2014 -0700
@@ -1671,7 +1671,7 @@
 static ngx_int_t
 ngx_http_init_listening(ngx_conf_t *cf, ngx_http_conf_port_t *port)
 {
-    ngx_uint_t                 i, last, bind_wildcard;
+    ngx_uint_t                 i, j, last, bind_wildcard;
     ngx_listening_t           *ls;
     ngx_http_port_t           *hport;
     ngx_http_conf_addr_t      *addr;
@@ -1703,42 +1703,43 @@
             continue;
         }
 
-        ls = ngx_http_add_listening(cf, &addr[i]);
-        if (ls == NULL) {
-            return NGX_ERROR;
-        }
-
-        hport = ngx_pcalloc(cf->pool, sizeof(ngx_http_port_t));
-        if (hport == NULL) {
-            return NGX_ERROR;
-        }
-
-        ls->servers = hport;
-
-        if (i == last - 1) {
-            hport->naddrs = last;
-
-        } else {
-            hport->naddrs = 1;
-            i = 0;
-        }
-
-        switch (ls->sockaddr->sa_family) {
-
-#if (NGX_HAVE_INET6)
-        case AF_INET6:
-            if (ngx_http_add_addrs6(cf, hport, addr) != NGX_OK) {
+        for(j = 0; j < ngx_num_dup_sockets; j++) {
+            ls = ngx_http_add_listening(cf, &addr[i]);
+            if (ls == NULL) {
                 return NGX_ERROR;
             }
-            break;
-#endif
-        default: /* AF_INET */
-            if (ngx_http_add_addrs(cf, hport, addr) != NGX_OK) {
+
+            hport = ngx_pcalloc(cf->pool, sizeof(ngx_http_port_t));
+            if (hport == NULL) {
                 return NGX_ERROR;
             }
-            break;
+
+            ls->servers = hport;
+
+            if (i == last - 1) {
+                hport->naddrs = last;
+
+            } else {
+                hport->naddrs = 1;
+                i = 0;
+            }
+
+            switch (ls->sockaddr->sa_family) {
+
+#if (NGX_HAVE_INET6)
+            case AF_INET6:
+                if (ngx_http_add_addrs6(cf, hport, addr) != NGX_OK) {
+                    return NGX_ERROR;
+                }
+                break;
+#endif
+            default: /* AF_INET */
+                if (ngx_http_add_addrs(cf, hport, addr) != NGX_OK) {
+                    return NGX_ERROR;
+                }
+                break;
+            }
         }
-
         addr++;
         last--;
     }

1. Software and workloads used in performance tests may have been optimized for performance only on Intel microprocessors. Performance tests, such as SYSmark and MobileMark, are measured using specific computer systems, components, software, operations and functions. Any change to any of those factors may cause the results to vary. You should consult other information and performance tests to assist you in fully evaluating your contemplated purchases, including the performance of that product when combined with other products.


-----Original Message-----
From: nginx-devel-bounces at nginx.org [mailto:nginx-devel-bounces at nginx.org] On Behalf Of Lu, Yingqi
Sent: Wednesday, August 27, 2014 10:33 AM
To: nginx-devel at nginx.org
Subject: RE: [Patch] SO_REUSEPORT support from master process

Dear All,

I am resending this patch in plain text instead of HTML format. I will also post the patch at the end of this email. I hope this will make it easier for all of you to review. Please let me know if you have trouble viewing the message or the patch itself. This is our first time submitting a patch here. Your feedback and suggestions are highly appreciated.

The "SO_REUSEPORT support for listen sockets" patches submitted by Sepherosa Ziehau are posted and discussed in [1] and [2]. The last update on those threads was 09/05/2013, and the patch is not included in the current Nginx code. From reading the discussion, my understanding is that his patch creates a dedicated listen socket for each child process. To make sure that a listen socket is always available at any given time, the patch treats the first worker process differently from the rest.

Here, I am proposing a simpler way to enable SO_REUSEPORT support: create and configure a certain number of listen sockets in the master process with SO_REUSEPORT enabled, so that all the child processes can inherit them. In this case, we do not need to worry about ensuring that one listen socket is available at run time. The number of listen sockets to create is calculated from the number of active CPU threads. On a big system with more CPU threads (where we have the scalability issue), more duplicated listen sockets are created to improve throughput and scalability. On a system with 8 or fewer CPU threads, there will be only 1 listen socket. This makes sure duplicated listen sockets are created only when necessary. If SO_REUSEPORT is not supported by the OS, the patch falls back to the default/original behavior (this was tested on Linux kernel 3.8.8, where SO_REUSEPORT is not supported).

This prototype patch has been tested on a modern Intel dual-socket platform with a three-tier open source web server workload (PHP+Nginx/memcached/MySQL). The web server has 2 IP network interfaces configured for testing. The Linux kernel used for testing is 3.13.9. The data show:

Case 1: with a single listen statement (Listen 80) specified in the configuration file, there is a 46.3% throughput increase.
Case 2: with dual listen statements (for example, Listen 192.168.1.1:80 and Listen 192.168.1.2:80), there is 10% throughput increase.

Both test cases keep everything the same except for the patch itself to obtain the above results.

The reason that Case 1 has bigger performance gains is that Case 1 by default has only 1 listen socket, while Case 2 by default already has 2.

Please review it and let me know your questions and comments. Thanks very much for your time reviewing the patch.

Thanks,
Yingqi Lu

[1] http://forum.nginx.org/read.php?29,241283,241283
[2] http://forum.nginx.org/read.php?29,241470,241470

# HG changeset patch
# User Yingqi Lu <Yingqi.Lu at intel.com>
# Date 1408145210 25200
#      Fri Aug 15 16:26:50 2014 -0700
# Node ID d9c7259d275dbcae8a0d001ee9703b13312b3263
# Parent  6edcb183e62d610808addebbd18249abb7224a0a
These are the patch files for SO_REUSEPORT support.

diff -r 6edcb183e62d -r d9c7259d275d ngx_connection.c
--- a/ngx_connection.c	Fri Aug 15 16:25:32 2014 -0700
+++ b/ngx_connection.c	Fri Aug 15 16:26:50 2014 -0700
@@ -304,7 +304,7 @@
 ngx_int_t
 ngx_open_listening_sockets(ngx_cycle_t *cycle)  {
-    int               reuseaddr;
+    int               reuseaddr, reuseport;
     ngx_uint_t        i, tries, failed;
     ngx_err_t         err;
     ngx_log_t        *log;
@@ -312,6 +312,7 @@
     ngx_listening_t  *ls;
 
     reuseaddr = 1;
+    reuseport = 1;
 #if (NGX_SUPPRESS_WARN)
     failed = 0;
 #endif
@@ -370,6 +371,24 @@
                 return NGX_ERROR;
             }
 
+            if (so_reuseport_enabled)
+            {
+                if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT,
+                               (const void *) &reuseport, sizeof(int))
+                    == -1) {
+                    ngx_log_error(NGX_LOG_EMERG, log, ngx_socket_errno,
+                                  "setsockopt(SO_REUSEPORT) %V failed",
+                                  &ls[i].addr_text);
+                    if (ngx_close_socket(s) == -1) {
+                        ngx_log_error(NGX_LOG_EMERG, log, ngx_socket_errno,
+                                      ngx_close_socket_n " %V failed",
+                                      &ls[i].addr_text);
+                    }
+
+                    return NGX_ERROR;
+                }
+            }
+
 #if (NGX_HAVE_INET6 && defined IPV6_V6ONLY)
 
             if (ls[i].sockaddr->sa_family == AF_INET6) { diff -r 6edcb183e62d -r d9c7259d275d ngx_cycle.c
--- a/ngx_cycle.c	Fri Aug 15 16:25:32 2014 -0700
+++ b/ngx_cycle.c	Fri Aug 15 16:26:50 2014 -0700
@@ -25,7 +25,7 @@
 
 ngx_uint_t             ngx_test_config;
 ngx_uint_t             ngx_quiet_mode;
-
+ngx_uint_t             so_reuseport_enabled;
 #if (NGX_THREADS)
 ngx_tls_key_t          ngx_core_tls_key;
 #endif
@@ -55,6 +55,34 @@
     ngx_core_module_t   *module;
     char                 hostname[NGX_MAXHOSTNAMELEN];
 
+    ngx_uint_t           j, num_cores, num_dup_sockets, orig_nelts;
+    ngx_socket_t         temp_s;
+    int                  one = 1;
+    so_reuseport_enabled = 0;
+    temp_s = ngx_socket(AF_INET, SOCK_STREAM, 0); #ifndef SO_REUSEPORT 
+#define SO_REUSEPORT 15 #endif
+    if (setsockopt(temp_s, SOL_SOCKET, SO_REUSEPORT,
+                  (const void *) &one, sizeof(int)) == 0) {
+        so_reuseport_enabled = 1;
+    }
+    ngx_close_socket(temp_s);
+
+    if (so_reuseport_enabled) {
+#ifdef _SC_NPROCESSORS_ONLN
+        num_cores = sysconf(_SC_NPROCESSORS_ONLN); #else
+        num_cores = 1;
+#endif
+        if (num_cores > 8) {
+            num_dup_sockets = num_cores/8;
+        } else {
+            num_dup_sockets = 1;
+        }
+    } else {
+        num_dup_sockets = 1;
+    }
     ngx_timezone_update();
 
     /* force localtime update with a new timezone */ @@ -114,7 +142,7 @@
     }
 
 
-    n = old_cycle->paths.nelts ? old_cycle->paths.nelts : 10;
+    n = old_cycle->paths.nelts ? old_cycle->paths.nelts : 10 * 
+ num_dup_sockets;
 
     cycle->paths.elts = ngx_pcalloc(pool, n * sizeof(ngx_path_t *));
     if (cycle->paths.elts == NULL) {
@@ -164,7 +192,7 @@
         return NULL;
     }
 
-    n = old_cycle->listening.nelts ? old_cycle->listening.nelts : 10;
+    n = old_cycle->listening.nelts ? old_cycle->listening.nelts : 10 * 
+ num_dup_sockets;
 
     cycle->listening.elts = ngx_pcalloc(pool, n * sizeof(ngx_listening_t));
     if (cycle->listening.elts == NULL) { @@ -231,7 +259,7 @@
 
     ngx_memzero(&conf, sizeof(ngx_conf_t));
     /* STUB: init array ? */
-    conf.args = ngx_array_create(pool, 10, sizeof(ngx_str_t));
+    conf.args = ngx_array_create(pool, (10 * num_dup_sockets), 
+ sizeof(ngx_str_t));
     if (conf.args == NULL) {
         ngx_destroy_pool(pool);
         return NULL;
@@ -575,7 +603,15 @@
 #endif
         }
     }
+    orig_nelts = cycle->listening.nelts;
+    cycle->listening.nelts  = cycle->listening.nelts * num_dup_sockets;
 
+    ls = cycle->listening.elts;
+    for (i = 0; i < num_dup_sockets; i++) {
+        for(j = 0; j < orig_nelts; j++) {
+            ls[j + i * orig_nelts] = ls[j];
+        }
+    }
     if (ngx_open_listening_sockets(cycle) != NGX_OK) {
         goto failed;
     }
@@ -747,7 +783,7 @@
             exit(1);
         }
 
-        n = 10;
+        n = 10 * num_dup_sockets;
         ngx_old_cycles.elts = ngx_pcalloc(ngx_temp_pool,
                                           n * sizeof(ngx_cycle_t *));
         if (ngx_old_cycles.elts == NULL) { diff -r 6edcb183e62d -r d9c7259d275d ngx_cycle.h
--- a/ngx_cycle.h	Fri Aug 15 16:25:32 2014 -0700
+++ b/ngx_cycle.h	Fri Aug 15 16:26:50 2014 -0700
@@ -136,6 +136,7 @@
 extern ngx_module_t           ngx_core_module;
 extern ngx_uint_t             ngx_test_config;
 extern ngx_uint_t             ngx_quiet_mode;
+extern ngx_uint_t             so_reuseport_enabled;
 #if (NGX_THREADS)
 extern ngx_tls_key_t          ngx_core_tls_key;
 #endif

1. Software and workloads used in performance tests may have been optimized for performance only on Intel microprocessors. Performance tests, such as SYSmark and MobileMark, are measured using specific computer systems, components, software, operations and functions. Any change to any of those factors may cause the results to vary. You should consult other information and performance tests to assist you in fully evaluating your contemplated purchases, including the performance of that product when combined with other products.

_______________________________________________
nginx-devel mailing list
nginx-devel at nginx.org
http://mailman.nginx.org/mailman/listinfo/nginx-devel



More information about the nginx-devel mailing list