Skip to content

Commit

Permalink
Merge pull request #11915 from bosilca/topic/avoid_writev
Browse files Browse the repository at this point in the history
Replace writev by sendmsg
  • Loading branch information
bosilca authored Sep 21, 2023
2 parents a86c131 + 7074e59 commit 2ebdfbb
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 22 deletions.
12 changes: 5 additions & 7 deletions ompi/group/group.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2020 The University of Tennessee and The University
* Copyright (c) 2004-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
Expand Down Expand Up @@ -459,19 +459,17 @@ static inline struct ompi_proc_t *ompi_group_peer_lookup_existing (ompi_group_t
*/
static inline int ompi_group_proc_lookup_rank (ompi_group_t* group, ompi_proc_t* proc)
{
int i, np, v;
int i, np, rank;
opal_vpid_t v;
assert( NULL != proc );
assert( !ompi_proc_is_sentinel(proc) );
np = ompi_group_size(group);
if( 0 == np ) return MPI_PROC_NULL;
/* heuristic: On comm_world, start the lookup from v=vpid, so that
* when working on comm_world, the search is O(1);
* Otherwise, wild guess: start from a proportional position
* compared to comm_world position. */
* when working on comm_world, on average, the search remains O(1). */
v = proc->super.proc_name.vpid;
v = (v<np)? v: v*ompi_proc_world_size()/np;
for( i = 0; i < np; i++ ) {
int rank = (i+v)%np;
rank = (i+v)%np;
/* procs are lazy initialized and may be a sentinel. Handle both cases. */
ompi_proc_t* p = ompi_group_get_proc_ptr_raw(group, rank);
if(OPAL_LIKELY(!ompi_proc_is_sentinel(p))) {
Expand Down
22 changes: 17 additions & 5 deletions opal/mca/btl/tcp/btl_tcp_frag.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2020 The University of Tennessee and The University
* Copyright (c) 2004-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
Expand Down Expand Up @@ -105,18 +105,30 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t *frag, int sd)
{
ssize_t cnt;
size_t i, num_vecs;
struct msghdr msg;
int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;

/* non-blocking write, but continue if interrupted */
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_iov = frag->iov_ptr;
msg.msg_iovlen = frag->iov_cnt;
msg.msg_control = NULL;
msg.msg_controllen = 0;

/* non-blocking write, continue if interrupted */
do {
cnt = writev(sd, frag->iov_ptr, frag->iov_cnt);
/* Use sendmsg to avoid issues with SIGPIPE as described in
* https://blog.erratasec.com/2018/10/tcpip-sockets-and-sigpipe.html#
*/
cnt = sendmsg(sd, &msg, msg_flags);
if (cnt < 0) {
switch (opal_socket_errno) {
case EINTR:
continue;
case EWOULDBLOCK:
return false;
case EFAULT:
BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, %lu)\n\t%s(%lu)\n",
BTL_ERROR(("mca_btl_tcp_frag_send: sendmsg error (%p, %lu)\n\t%s(%lu)\n",
frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len,
strerror(opal_socket_errno), (unsigned long) frag->iov_cnt));
/* send_lock held by caller */
Expand All @@ -125,7 +137,7 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t *frag, int sd)
return false;
default:
BTL_PEER_ERROR(frag->endpoint->endpoint_proc->proc_opal,
("mca_btl_tcp_frag_send: writev failed: %s (%d)",
("mca_btl_tcp_frag_send: sendmsg failed: %s (%d)",
strerror(opal_socket_errno), opal_socket_errno));
/* send_lock held by caller */
frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
Expand Down
6 changes: 3 additions & 3 deletions opal/win32/opal_uio.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
University Research and Technology
Corporation. All rights reserved.
Copyright (c) 2004-2005 The University of Tennessee and The University
Copyright (c) 2004-2023 The University of Tennessee and The University
of Tennessee Research Foundation. All rights
reserved.
Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
Expand All @@ -26,12 +26,12 @@
of code to handle the windows error flags
*/

int writev(int fd, struct iovec *iov, int cnt)
ssize_t sendmsg(int fd, const struct msghdr *message, int flags)
{
int err;
DWORD sendlen;

err = WSASend((SOCKET) fd, &(iov->data), cnt, &sendlen, 0, NULL, NULL);
err = WSASendMsg((SOCKET) fd, message, flags, &sendlen, NULL, NULL);

if (err < 0) {
return err;
Expand Down
14 changes: 7 additions & 7 deletions opal/win32/opal_uio.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
Expand Down Expand Up @@ -33,14 +33,14 @@ struct iovec {
#define iov_len data.len

BEGIN_C_DECLS

/*
* writev:
writev writes data to file descriptor fd, and from the buffers
described by iov. The number of buffers is specified by cnt. The
buffers are used in the order specified. Operates just like write
except that data is taken from iov instead of a contiguous buffer.
* sendmsg:
* writes data to a file descriptor. This is a convenience function to allow
* the TCP BTL to support Windows. Overall it should behave similarly to the
* POSIX sendmsg function.
*/
OPAL_DECLSPEC int writev(int fd, struct iovec *iov, int cnt);
OPAL_DECLSPEC ssize_t sendmsg(int socket, const struct msghdr *message, int flags);

/*
readv reads data from file descriptor fd, and puts the result in the
Expand Down

0 comments on commit 2ebdfbb

Please sign in to comment.