Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

btl tcp: Use reachability and graph solving for global interface matching #7134

Merged
merged 1 commit into from
Jan 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions opal/mca/btl/tcp/btl_tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
* Copyright (c) 2016-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
*
* $COPYRIGHT$
*
Expand Down Expand Up @@ -90,12 +92,6 @@ int mca_btl_tcp_add_procs( struct mca_btl_base_module_t* btl,
continue;
}

/*
* Check to make sure that the peer has at least as many interface
* addresses exported as we are trying to use. If not, then
* don't bind this BTL instance to the proc.
*/

OPAL_THREAD_LOCK(&tcp_proc->proc_lock);

for (uint32_t j = 0 ; j < (uint32_t)tcp_proc->proc_endpoint_count ; ++j) {
Expand Down
6 changes: 6 additions & 0 deletions opal/mca/btl/tcp/btl_tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -107,6 +109,7 @@ struct mca_btl_tcp_component_t {
uint32_t tcp_num_btls; /**< number of interfaces available to the TCP component */
unsigned int tcp_num_links; /**< number of logical links per physical device */
struct mca_btl_tcp_module_t **tcp_btls; /**< array of available BTL modules */
opal_list_t local_ifs; /**< opal list of local opal_if_t interfaces */
int tcp_free_list_num; /**< initial size of free lists */
int tcp_free_list_max; /**< maximum size of free lists */
int tcp_free_list_inc; /**< number of elements to alloc when growing free lists */
Expand Down Expand Up @@ -163,6 +166,9 @@ OPAL_MODULE_DECLSPEC extern mca_btl_tcp_component_t mca_btl_tcp_component;
*/
struct mca_btl_tcp_module_t {
mca_btl_base_module_t super; /**< base BTL interface */
uint32_t btl_index; /**< Local BTL module index, used for vertex
data and used as a hash key when
solving module matching problem */
uint16_t tcp_ifkindex; /** <BTL kernel interface index */
struct sockaddr_storage tcp_ifaddr; /**< First address
discovered for this
Expand Down
20 changes: 14 additions & 6 deletions opal/mca/btl/tcp/btl_tcp_addr.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -30,37 +33,43 @@
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif

#include <assert.h>

/**
 * Modex address structure.
 *
 * One of these structures will be sent for every btl module in use by
 * the local BTL TCP component. This is used to construct an opal_if_t
 * structure for the reachability component as well as populate the
 * mca_btl_tcp_addr_t structure on remote procs. These will be used
 * for interface matching and filling out the mca_btl_base_endpoint_t
 * structure.
 *
 * The members are ordered so that the structure occupies exactly
 * 32 bytes (16 + 4 + 4 + 4 + 2 + 1 + 1); the static assertion that
 * follows the typedef enforces this at compile time.
 */
struct mca_btl_tcp_modex_addr_t {
    uint8_t addr[16];        /* endpoint address. for addr_family
                                of MCA_BTL_TCP_AF_INET, only the
                                first 4 bytes have meaning. */
    uint32_t addr_ifkindex;  /* endpoint kernel index */
    uint32_t addr_mask;      /* ip mask */
    uint32_t addr_bandwidth; /* interface bandwidth */
    uint16_t addr_port;      /* endpoint listen port */
    uint8_t addr_family;     /* endpoint address family. Note that
                                this is
                                MCA_BTL_TCP_AF_{INET,INET6}, not
                                the traditional
                                AF_INET/AF_INET6. */
    uint8_t padding[1];      /* pad out to an 8-byte word; declared
                                exactly once so the total stays at
                                32 bytes */
};
typedef struct mca_btl_tcp_modex_addr_t mca_btl_tcp_modex_addr_t;

/* Fail the build if the structure's size ever drifts from 32 bytes. */
_Static_assert(sizeof(struct mca_btl_tcp_modex_addr_t) == 32, "mca_btl_tcp_modex_addr_t");

/**
* Remote peer address structure
*
* One of these structures will be allocated for every remote endpoint
* associated with a remote proc. The data is pulled from the
* mca_btl_tcp_modex_addr_t structure, except for the addr_inuse
* field, which is local.
* mca_btl_tcp_modex_addr_t structure.
*/
struct mca_btl_tcp_addr_t {
union {
Expand All @@ -73,7 +82,6 @@ struct mca_btl_tcp_addr_t {
int addr_ifkindex; /**< remote interface index assigned with
this address */
uint8_t addr_family; /**< AF_INET or AF_INET6 */
bool addr_inuse; /**< local meaning only */
};
typedef struct mca_btl_tcp_addr_t mca_btl_tcp_addr_t;

Expand Down
54 changes: 41 additions & 13 deletions opal/mca/btl/tcp/btl_tcp_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -69,13 +70,15 @@
#include "opal/util/net.h"
#include "opal/util/fd.h"
#include "opal/util/show_help.h"
#include "opal/util/string_copy.h"
#include "opal/util/printf.h"
#include "opal/constants.h"
#include "opal/mca/btl/btl.h"
#include "opal/mca/btl/base/base.h"
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/btl/base/btl_base_error.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/mca/reachable/base/base.h"
#include "opal/threads/threads.h"

#include "opal/constants.h"
Expand Down Expand Up @@ -368,6 +371,7 @@ static int mca_btl_tcp_component_open(void)
mca_btl_tcp_component.tcp_btls = NULL;

/* initialize objects */
OBJ_CONSTRUCT(&mca_btl_tcp_component.local_ifs, opal_list_t);
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_proc_table_t);
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t);
Expand Down Expand Up @@ -477,6 +481,7 @@ static int mca_btl_tcp_component_close(void)
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_max);
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_user);
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_lock);
OBJ_DESTRUCT(&mca_btl_tcp_component.local_ifs);

#if OPAL_CUDA_SUPPORT
mca_common_cuda_fini();
Expand All @@ -493,8 +498,9 @@ static int mca_btl_tcp_component_close(void)
static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
{
struct mca_btl_tcp_module_t* btl;
opal_if_t *copied_interface, *selected_interface;
char param[256];
int i;
int i, if_index;
struct sockaddr_storage addr;
bool found = false;

Expand All @@ -515,18 +521,15 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
* 10.1.0.1 as the one that is published in the modex and used for
* connection.
*/
for (i = opal_ifbegin() ; i >= 0 ; i = opal_ifnext(i)) {
int ret;

if (if_kindex != opal_ifindextokindex(i)) {
OPAL_LIST_FOREACH(selected_interface, &opal_if_list, opal_if_t) {
if (if_kindex != selected_interface->if_kernel_index) {
continue;
}

ret = opal_ifindextoaddr(i, (struct sockaddr*)&addr,
sizeof(struct sockaddr_storage));
if (OPAL_SUCCESS != ret) {
return ret;
}
if_index = selected_interface->if_index;

memcpy((struct sockaddr*)&addr, &selected_interface->if_addr,
MIN(sizeof(struct sockaddr_storage), sizeof(selected_interface->if_addr)));

if (addr.ss_family == AF_INET &&
4 != mca_btl_tcp_component.tcp_disable_family) {
Expand All @@ -548,12 +551,19 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
btl = (struct mca_btl_tcp_module_t *)malloc(sizeof(mca_btl_tcp_module_t));
if(NULL == btl)
return OPAL_ERR_OUT_OF_RESOURCE;
copied_interface = OBJ_NEW(opal_if_t);
if (NULL == copied_interface) {
free(btl);
return OPAL_ERR_OUT_OF_RESOURCE;
}
memcpy(btl, &mca_btl_tcp_module, sizeof(mca_btl_tcp_module));
OBJ_CONSTRUCT(&btl->tcp_endpoints, opal_list_t);
OBJ_CONSTRUCT(&btl->tcp_endpoints_mutex, opal_mutex_t);
mca_btl_tcp_component.tcp_btls[mca_btl_tcp_component.tcp_num_btls++] = btl;

/* initialize the btl */
/* This index is used as a key for a hash table used for interface matching. */
btl->btl_index = mca_btl_tcp_component.tcp_num_btls - 1;
btl->tcp_ifkindex = (uint16_t) if_kindex;
#if MCA_BTL_TCP_STATISTICS
btl->tcp_bytes_recv = 0;
Expand All @@ -562,6 +572,7 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
#endif

memcpy(&btl->tcp_ifaddr, &addr, sizeof(struct sockaddr_storage));
btl->tcp_ifmask = selected_interface->if_mask;

/* allow user to specify interface bandwidth */
sprintf(param, "bandwidth_%s", if_name);
Expand Down Expand Up @@ -603,6 +614,21 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
}
}

/* Add another entry to the local interface list */
opal_string_copy(copied_interface->if_name, if_name, OPAL_IF_NAMESIZE);
copied_interface->if_index = if_index;
copied_interface->if_kernel_index = btl->tcp_ifkindex;
copied_interface->af_family = btl->tcp_ifaddr.ss_family;
copied_interface->if_flags = selected_interface->if_flags;
copied_interface->if_speed = selected_interface->if_speed;
memcpy(&copied_interface->if_addr, &btl->tcp_ifaddr, sizeof(struct sockaddr_storage));
copied_interface->if_mask = selected_interface->if_mask;
copied_interface->if_bandwidth = btl->super.btl_bandwidth;
memcpy(&copied_interface->if_mac, &selected_interface->if_mac, sizeof(copied_interface->if_mac));
copied_interface->ifmtu = selected_interface->ifmtu;

opal_list_append(&mca_btl_tcp_component.local_ifs, &(copied_interface->super));

opal_output_verbose(5, opal_btl_base_framework.framework_output,
"btl:tcp: %p: if %s kidx %d cnt %i addr %s %s bw %d lt %d\n",
(void*)btl, if_name, (int) btl->tcp_ifkindex, i,
Expand Down Expand Up @@ -1188,7 +1214,6 @@ static int mca_btl_tcp_component_exchange(void)
memcpy(&addrs[i].addr, &(inaddr6->sin6_addr),
sizeof(struct in6_addr));
addrs[i].addr_port = mca_btl_tcp_component.tcp6_listen_port;
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
addrs[i].addr_family = MCA_BTL_TCP_AF_INET6;
opal_output_verbose(5, opal_btl_base_framework.framework_output,
"btl: tcp: exchange: %d %d IPv6 %s",
Expand All @@ -1202,7 +1227,6 @@ static int mca_btl_tcp_component_exchange(void)
memcpy(&addrs[i].addr, &(inaddr->sin_addr),
sizeof(struct in_addr));
addrs[i].addr_port = mca_btl_tcp_component.tcp_listen_port;
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
addrs[i].addr_family = MCA_BTL_TCP_AF_INET;
opal_output_verbose(5, opal_btl_base_framework.framework_output,
"btl: tcp: exchange: %d %d IPv4 %s",
Expand All @@ -1212,6 +1236,10 @@ static int mca_btl_tcp_component_exchange(void)
BTL_ERROR(("Unexpected address family: %d", addr->sa_family));
return OPAL_ERR_BAD_PARAM;
}

addrs[i].addr_ifkindex = btl->tcp_ifkindex;
addrs[i].addr_mask = btl->tcp_ifmask;
addrs[i].addr_bandwidth = btl->super.btl_bandwidth;
}

OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,
Expand Down
Loading