Skip to content

Instantly share code, notes, and snippets.

@pietern
Created August 17, 2016 02:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pietern/09329d1040cc0454c3d1618495bf7362 to your computer and use it in GitHub Desktop.
Save pietern/09329d1040cc0454c3d1618495bf7362 to your computer and use it in GitHub Desktop.
commit 19917b7bc38c4ffe6975c25ae76d31fb9f582dfd
Author: Pieter Noordhuis <pietern@fb.com>
Date: Tue Aug 16 19:29:19 2016 -0700
Fix IPv6 support in OOB and TCP BTL
diff --git a/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_endpoint.c b/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_endpoint.c
index c444390..7be03a7 100644
--- a/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_endpoint.c
+++ b/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_endpoint.c
@@ -778,7 +778,7 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
BTL_PEER_ERROR( btl_endpoint->endpoint_proc->proc_ompi,
( "Unable to connect to the peer %s on port %d: %s\n",
address,
- btl_endpoint->endpoint_addr->addr_port, strerror(opal_socket_errno) ) );
+ ntohs(btl_endpoint->endpoint_addr->addr_port), strerror(opal_socket_errno) ) );
}
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
diff --git a/1.10.2/src/openmpi-1.10.2/opal/mca/if/linux_ipv6/if_linux_ipv6.c b/1.10.2/src/openmpi-1.10.2/opal/mca/if/linux_ipv6/if_linux_ipv6.c
index 5e25a96..2832371 100644
--- a/1.10.2/src/openmpi-1.10.2/opal/mca/if/linux_ipv6/if_linux_ipv6.c
+++ b/1.10.2/src/openmpi-1.10.2/opal/mca/if/linux_ipv6/if_linux_ipv6.c
@@ -118,8 +118,8 @@ static int if_linux_ipv6_open(void)
addrbyte[8], addrbyte[9], addrbyte[10], addrbyte[11],
addrbyte[12], addrbyte[13], addrbyte[14], addrbyte[15], scope);
- /* we don't want any other scope less than link-local */
- if (scope < 0x20) {
+ /* Only interested in global (0x00) scope */
+ if (scope != 0x00) {
opal_output_verbose(1, opal_if_base_framework.framework_output,
"skipping interface %2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x scope %x\n",
addrbyte[0], addrbyte[1], addrbyte[2], addrbyte[3],
diff --git a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp.c b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp.c
index 0a8b5d2..2691c8d 100644
--- a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp.c
+++ b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp.c
@@ -219,40 +219,45 @@ static void accept_connection(const int accepted_fd,
static int parse_uri(const uint16_t af_family,
const char* host,
const char *port,
- struct sockaddr* inaddr)
+ struct sockaddr_storage* inaddr)
{
struct sockaddr_in *in;
#if OPAL_ENABLE_IPV6
+ struct sockaddr_in6 *in6;
struct addrinfo hints, *res;
int ret;
#endif
if (AF_INET == af_family) {
- memset(inaddr, 0, sizeof(struct sockaddr_in));
+ memset(inaddr, 0, sizeof(*inaddr));
in = (struct sockaddr_in*) inaddr;
in->sin_family = AF_INET;
in->sin_addr.s_addr = inet_addr(host);
if (in->sin_addr.s_addr == INADDR_NONE) {
return ORTE_ERR_BAD_PARAM;
}
- ((struct sockaddr_in*) inaddr)->sin_port = htons(atoi(port));
+ in->sin_port = htons(atoi(port));
}
#if OPAL_ENABLE_IPV6
else if (AF_INET6 == af_family) {
- size_t len;
- memset(inaddr, 0, sizeof(struct sockaddr_in6));
+ memset(inaddr, 0, sizeof(*inaddr));
memset(&hints, 0, sizeof(hints));
hints.ai_family = af_family;
hints.ai_socktype = SOCK_STREAM;
ret = getaddrinfo(host, NULL, &hints, &res);
-
if (ret) {
opal_output (0, "oob_tcp_parse_uri: Could not resolve %s. [Error: %s]\n",
host, gai_strerror (ret));
return ORTE_ERR_BAD_PARAM;
}
- len = (res->ai_addrlen < sizeof(struct sockaddr_in6)) ? res->ai_addrlen : sizeof(struct sockaddr_in6);
- memcpy(inaddr, res->ai_addr, len);
+ if (res->ai_addrlen < sizeof(struct sockaddr_in6)) {
+ opal_output (0, "oob_tcp_parse_uri: addrlen unexpected for %s (%d)\n",
+ host, res->ai_addrlen);
+ return ORTE_ERR_BAD_PARAM;
+ }
+ in6 = (struct sockaddr_in6*) inaddr;
+ memcpy(in6, res->ai_addr, res->ai_addrlen);
+ in6->sin6_port = htons(atoi(port));
freeaddrinfo(res);
}
#endif
@@ -271,7 +276,7 @@ static int parse_uri(const uint16_t af_family,
static void process_set_peer(int fd, short args, void *cbdata)
{
mca_oob_tcp_peer_op_t *pop = (mca_oob_tcp_peer_op_t*)cbdata;
- struct sockaddr inaddr;
+ struct sockaddr_storage inaddr;
mca_oob_tcp_peer_t *peer;
int rc=ORTE_SUCCESS;
uint64_t *ui64 = (uint64_t*)(&pop->peer);
@@ -281,12 +286,6 @@ static void process_set_peer(int fd, short args, void *cbdata)
"%s:tcp:processing set_peer cmd",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
- if (AF_INET != pop->af_family) {
- opal_output_verbose(20, orte_oob_base_framework.framework_output,
- "%s NOT AF_INET", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
- goto cleanup;
- }
-
if (NULL == (peer = mca_oob_tcp_peer_lookup(&pop->peer))) {
peer = OBJ_NEW(mca_oob_tcp_peer_t);
peer->name.jobid = pop->peer.jobid;
@@ -301,7 +300,7 @@ static void process_set_peer(int fd, short args, void *cbdata)
}
}
- if ((rc = parse_uri(pop->af_family, pop->net, pop->port, (struct sockaddr*) &inaddr)) != ORTE_SUCCESS) {
+ if ((rc = parse_uri(pop->af_family, pop->net, pop->port, &inaddr)) != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
diff --git a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_connection.c b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_connection.c
index 34075b8..124dc9d 100644
--- a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_connection.c
+++ b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_connection.c
@@ -104,7 +104,6 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)));
- addrlen = sizeof(struct sockaddr_in);
OPAL_LIST_FOREACH(addr, &peer->addrs, mca_oob_tcp_addr_t) {
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s orte_tcp_peer_try_connect: "
@@ -135,7 +134,7 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
while (addr->retries < mca_oob_tcp_component.max_retries) {
addr->retries++;
/* Create the new socket */
- peer->sd = socket(AF_INET, SOCK_STREAM, 0);
+ peer->sd = socket(addr->addr.ss_family, SOCK_STREAM, 0);
/* Set this fd to be close-on-exec so that any subsequent children don't see it */
if (opal_fd_set_cloexec(peer->sd) != OPAL_SUCCESS) {
opal_output(0, "%s unable to set socket to CLOEXEC",
@@ -145,9 +144,13 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
continue;
}
- if (connect(peer->sd, (struct sockaddr*)&addr->addr, addrlen) < 0) {
+ addrlen = addr->addr.ss_family == AF_INET6
+ ? sizeof(struct sockaddr_in6)
+ : sizeof(struct sockaddr_in);
+ rc = connect(peer->sd, (struct sockaddr*) &addr->addr, addrlen);
+ if (rc < 0) {
if (opal_socket_errno == ETIMEDOUT) {
- /* The server may be too busy to accept new connections */
+ /* The server may be too busy to accept new connections */
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s timeout connecting to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -161,7 +164,7 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
* attempt, without even trying to establish the
* connection. Handle that case in a semi-rational
* way by trying twice before giving up */
- if (ECONNABORTED == opal_socket_errno) {
+ if (ECONNABORTED == opal_socket_errno) {
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s connection to %s aborted by OS - retrying",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -169,6 +172,15 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
CLOSE_THE_SOCKET(peer->sd);
continue;
}
+ if (rc < 0) {
+ opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
+ "%s connection to %s returned %d (%d, %s)",
+ ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+ ORTE_NAME_PRINT(&peer->name),
+ rc, errno, strerror(errno));
+ CLOSE_THE_SOCKET(peer->sd);
+ continue;
+ }
/* connection succeeded */
addr->retries = 0;
goto connected;
diff --git a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_listener.c b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_listener.c
index 1f218cb..4e25ab9 100644
--- a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_listener.c
+++ b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_listener.c
@@ -271,8 +271,9 @@ static int create_listen(void)
if (NULL == ports) {
return ORTE_ERROR;
}
-
+
/* get the address info for this interface */
+ memset(&inaddr, 0, sizeof(inaddr));
((struct sockaddr_in*) &inaddr)->sin_family = AF_INET;
((struct sockaddr_in*) &inaddr)->sin_addr.s_addr = INADDR_ANY;
addrlen = sizeof(struct sockaddr_in);
@@ -511,8 +512,9 @@ static int create_listen6(void)
if (NULL == ports) {
return ORTE_ERROR;
}
-
+
/* get the address info for this interface */
+ memset(&inaddr, 0, sizeof(inaddr));
((struct sockaddr_in6*) &inaddr)->sin6_family = AF_INET6;
((struct sockaddr_in6*) &inaddr)->sin6_addr = in6addr_any;
addrlen = sizeof(struct sockaddr_in6);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment