Skip to content

Commit

Permalink
Avoid deadlock by unlocking on close path before receiving signal on …
Browse files Browse the repository at this point in the history
…socket.

A run can hang instead of returning because, on both nodes, one thread holds the
lock while blocked on a receive, which prevents the close message from being
sent. Observed with 8 GPUs on 2 nodes.
  • Loading branch information
tvegas1 authored and bureddy committed Sep 29, 2023
1 parent 8a2ef11 commit d387b8a
Showing 1 changed file with 19 additions and 19 deletions.
38 changes: 19 additions & 19 deletions src/ucx_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -350,34 +350,34 @@ static ncclResult_t ucx_get_ctx_and_worker(int dev, ucp_context_h *ctx,
}

static ncclResult_t nccl_ucx_free_worker(ucp_worker_h worker) {
int i;
int dummy;
int i, dummy, count;
struct ep_list *ep, *cur;

pthread_mutex_lock(&nccl_ucx_lock);
for(i = 0; i < ncclNIbDevs; i++) {
if (worker == workers[i].worker) {
workers[i].count--;
if (workers[i].count == 0){
ep = workers[i].eps;
while(ep){
cur = ep;
NCCLCHECK(ncclSocketRecv(ep->sock, &dummy, sizeof(int)));
ep = ep->next;
close(cur->sock->fd);
free(cur);
}
ucp_worker_destroy(workers[i].worker);
ucp_cleanup(workers[i].ctx);
workers[i].eps = NULL;
workers[i].worker = NULL;
workers[i].ctx = NULL;
}
if (workers[i].count > 0 && worker == workers[i].worker) {
count = --workers[i].count;
break;
}
}
pthread_mutex_unlock(&nccl_ucx_lock);

if (i < ncclNIbDevs && count == 0) {
ep = workers[i].eps;
while(ep){
cur = ep;
NCCLCHECK(ncclSocketRecv(ep->sock, &dummy, sizeof(int)));
ep = ep->next;
close(cur->sock->fd);
free(cur);
}
ucp_worker_destroy(workers[i].worker);
ucp_cleanup(workers[i].ctx);
workers[i].eps = NULL;
workers[i].worker = NULL;
workers[i].ctx = NULL;
}

return ncclSuccess;
}

Expand Down

0 comments on commit d387b8a

Please sign in to comment.