Skip to content

Commit

Permalink
Reduce-scatter and Allgather APIs
Browse files Browse the repository at this point in the history
  • Loading branch information
bureddy committed Oct 2, 2023
1 parent d387b8a commit 4e5e81e
Show file tree
Hide file tree
Showing 8 changed files with 358 additions and 76 deletions.
1 change: 1 addition & 0 deletions include/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC

// Logging callback type handed to plugin init() functions.
// Arguments: log level, subsystem flag bits, source file and line of the
// call site, then a format string followed by variadic arguments
// (presumably printf-style — confirm against the logger implementation).
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

#include "net_v8.h"
#include "net_v7.h"
#include "net_v6.h"
#include "net_v5.h"
Expand Down
75 changes: 2 additions & 73 deletions include/net_v7.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,79 +5,8 @@
#ifndef NCCL_NET_V7_H_
#define NCCL_NET_V7_H_

#include "net_device.h"

// Properties of one network device, filled in by getProperties() (API v7).
// The field order and types are part of the plugin ABI.
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency (unit not stated here — TODO confirm).
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
} ncclNetProperties_v7_t;

// Unversioned name for the properties struct.
typedef ncclNetProperties_v7_t ncclNetProperties_t;

// Function table implemented by a point-to-point network plugin (API v7).
// Every entry returns ncclResult_t. The member order is part of the plugin ABI.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
// Note: size is an int here, whereas regMrDmaBuf below takes a size_t.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
// NOTE(review): fd is presumably the dma-buf file descriptor and offset the
// offset of data within that buffer — confirm against the plugin specification.
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
// NOTE(review): n is presumably the number of grouped receives, with data,
// sizes, tags and mhandles being arrays of n entries — confirm.
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);

// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);

// Notify the plugin that a recv has completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
} ncclNet_v7_t;
// The v7 names are plain aliases of the v8 types, so a v7 table or properties
// struct is the very same type as its v8 counterpart.
typedef ncclNetProperties_v8_t ncclNetProperties_v7_t;
typedef ncclNet_v8_t ncclNet_v7_t;

// v7 struct for backwards compatibility
typedef struct {
Expand Down
134 changes: 134 additions & 0 deletions include/net_v8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
/*
* Copyright (c) 2017-2023, NVIDIA CORPORATION. All rights reserved.
*/

#ifndef NCCL_NET_V8_H_
#define NCCL_NET_V8_H_
#include "net_device.h"

// Properties of one network device, filled in by getProperties() (API v8).
// The field order and types are part of the plugin ABI.
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency (unit not stated here — TODO confirm).
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
} ncclNetProperties_v8_t;

// Unversioned name for the properties struct.
typedef ncclNetProperties_v8_t ncclNetProperties_t;

// Function table implemented by a point-to-point network plugin (API v8).
// Every entry returns ncclResult_t. The member order is part of the plugin ABI.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
// Note: the device handle type is still the v7 one in this v8 table.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
// Note: size is an int here, whereas regMrDmaBuf below takes a size_t.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
// NOTE(review): fd is presumably the dma-buf file descriptor and offset the
// offset of data within that buffer — confirm against the plugin specification.
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
// NOTE(review): n is presumably the number of grouped receives, with data,
// sizes, tags and mhandles being arrays of n entries — confirm.
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);

// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);

// Notify the plugin that a recv has completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
} ncclNet_v8_t;

// One buffer descriptor: a memory handle, an address and a 32-bit size.
// Passed as arrays to iallgather (recvParts) and ireducescatter (sendParts)
// in ncclCollNet_v8_t. Presumably a scatter-gather element, with mhandle coming
// from regMr/regMrDmaBuf and size counted in bytes — TODO confirm.
typedef struct {
void* mhandle;
void* address;
uint32_t size;
} ncclNetSGE_v8_t;

// Function table implemented by a collective network plugin (API v8).
// Every entry returns ncclResult_t. The member order is part of the plugin ABI.
typedef struct {
// Name of the collective network (mainly for logs)
const char* name;
// Initialize the collective network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters capable of doing collective operations.
// If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create connections.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Create a group for collective operations. handles have been created
// using listen() above. rank indicates caller's rank in the collective network.
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
// Returns whether a reduction operation on a data type is supported.
// 1 for supported, 0 otherwise.
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
// Performs an asynchronous allreduce operation on the collective group.
// May return request == NULL if the call cannot be performed (or would block).
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
// Asynchronous allgather entry point: a single send buffer (sendData, sendMhandle)
// and nRecvParts receive descriptors in recvParts.
// NOTE(review): the exact meaning of bytesPerRank, windowOffset and windowBytes,
// and whether request == NULL may be returned as for iallreduce, is not
// documented here — confirm against the collnet v8 specification.
ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts,
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
void* sendMhandle, void** request);
// Asynchronous reduce-scatter entry point: nSendParts send descriptors in sendParts
// and a single receive buffer (recvData, recvMhandle), reduced with redOp over dataType.
// NOTE(review): the exact meaning of bytesPerRank, windowOffset and windowBytes
// is not documented here — confirm against the collnet v8 specification.
ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData,
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
ncclDataType_t dataType, ncclRedOp_t redOp,
void* recvMhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free collective comm objects
ncclResult_t (*closeColl)(void* collComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v8_t;

#endif // end include guard
22 changes: 22 additions & 0 deletions src/ib_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -1144,6 +1144,28 @@ ncclResult_t ncclIbCloseListen(void* listenComm) {
return ncclSuccess;
}

/*
 * IB transport exposed through the v8 net plugin table.
 * Every member is named explicitly; getDeviceMr and irecvConsumed are
 * left NULL by this transport.
 */
const ncclNet_v8_t ibPlugin_v8 = {
  .name          = "IBext_v8",
  .init          = ncclIbInit,
  .devices       = ncclIbDevices,
  .getProperties = ncclIbGetProperties,
  .listen        = ncclIbListen,
  .connect       = ncclIbConnect,
  .accept        = ncclIbAccept,
  .regMr         = ncclIbRegMr,
  .regMrDmaBuf   = ncclIbRegMrDmaBuf,
  .deregMr       = ncclIbDeregMr,
  .isend         = ncclIbIsend,
  .irecv         = ncclIbIrecv,
  .iflush        = ncclIbIflush,
  .test          = ncclIbTest,
  .closeSend     = ncclIbCloseSend,
  .closeRecv     = ncclIbCloseRecv,
  .closeListen   = ncclIbCloseListen,
  .getDeviceMr   = NULL,
  .irecvConsumed = NULL,
};

const ncclNet_v7_t ibPlugin_v7 = {
.name = "IBext_v7",
.init = ncclIbInit,
Expand Down
19 changes: 19 additions & 0 deletions src/p2p_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,17 @@
#include "p2p_plugin.h"

// UCX and UCX-RMA transport tables, one per supported plugin API version.
// They are only declared when the build defines HAVE_UCX_PLUGIN; their
// definitions live in another translation unit (not visible here).
#ifdef HAVE_UCX_PLUGIN
extern ncclNet_v8_t ucxPlugin_v8;
extern ncclNet_v7_t ucxPlugin_v7;
extern ncclNet_v6_t ucxPlugin_v6;
extern ncclNet_v5_t ucxPlugin_v5;
extern ncclNet_v8_t ucxRmaPlugin_v8;
extern ncclNet_v7_t ucxRmaPlugin_v7;
extern ncclNet_v6_t ucxRmaPlugin_v6;
extern ncclNet_v5_t ucxRmaPlugin_v5;
#endif

/*
 * IB transport tables, one per supported plugin API version; the definitions
 * live in src/ib_plugin.c. ibPlugin_v8 and ibPlugin_v7 are defined there as
 * const objects, so these declarations carry the same qualifier: declaring
 * one object with differently qualified types in two translation units is
 * undefined behaviour. This file only copies from the tables, which a const
 * declaration still allows.
 */
extern const ncclNet_v8_t ibPlugin_v8;
extern const ncclNet_v7_t ibPlugin_v7;
/* NOTE(review): the v6/v5 definitions are not visible here; if they are also
 * defined const, add the qualifier to these two declarations as well. */
extern ncclNet_v6_t ibPlugin_v6;
extern ncclNet_v5_t ibPlugin_v5;
Expand All @@ -40,10 +43,16 @@ extern int ncclIbRelaxedOrderingEnabled;
// NOTE(review): NCCL_PARAM is defined outside this view; presumably each line
// declares a tunable from (name, environment-variable suffix, default value)
// — confirm against the macro definition.
NCCL_PARAM(SharpMaxComms, "SHARP_MAX_COMMS", 1);
NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2);

// Forward declarations of the per-version init entry points: the plugin
// tables defined right below reference them before their definitions.
ncclResult_t pluginInit_v8(ncclDebugLogger_t logFunction);
ncclResult_t pluginInit_v7(ncclDebugLogger_t logFunction);
ncclResult_t pluginInit_v6(ncclDebugLogger_t logFunction);
ncclResult_t pluginInit_v5(ncclDebugLogger_t logFunction);

ncclNet_v8_t ncclNetPlugin_v8 = {
"NCCL RDMA Plugin v8",
pluginInit_v8,
};

ncclNet_v7_t ncclNetPlugin_v7 = {
"NCCL RDMA Plugin v7",
pluginInit_v7,
Expand Down Expand Up @@ -85,17 +94,20 @@ static void pluginSetup()
}
switch (p2p_plugin) {
case NCCL_P2P_IB:
ncclNetPlugin_v8 = ibPlugin_v8;
ncclNetPlugin_v7 = ibPlugin_v7;
ncclNetPlugin_v6 = ibPlugin_v6;
ncclNetPlugin_v5 = ibPlugin_v5;
break;
#ifdef HAVE_UCX_PLUGIN
case NCCL_P2P_UCX:
ncclNetPlugin_v8 = ucxPlugin_v8;
ncclNetPlugin_v7 = ucxPlugin_v7;
ncclNetPlugin_v6 = ucxPlugin_v6;
ncclNetPlugin_v5 = ucxPlugin_v5;
break;
case NCCL_P2P_UCX_RMA:
ncclNetPlugin_v8 = ucxRmaPlugin_v8;
ncclNetPlugin_v7 = ucxRmaPlugin_v7;
ncclNetPlugin_v6 = ucxRmaPlugin_v6;
ncclNetPlugin_v5 = ucxRmaPlugin_v5;
Expand All @@ -105,6 +117,13 @@ static void pluginSetup()

}

/*
 * init hook of the statically initialized v8 plugin table.
 *
 * Stores the logger in pluginLogFunction, runs pluginSetup(), logs the name
 * of the resulting v8 table and forwards to that table's own init().
 *
 * NOTE(review): this relies on pluginSetup() having replaced ncclNetPlugin_v8
 * with a transport table whose init is not pluginInit_v8 itself; otherwise the
 * final call would recurse — confirm pluginSetup() covers every case.
 *
 * logFunction: logging callback, stored and passed on to the transport.
 * Returns whatever the selected transport's init() returns.
 */
ncclResult_t pluginInit_v8(ncclDebugLogger_t logFunction) {
  ncclResult_t status;

  pluginLogFunction = logFunction;
  pluginSetup();
  INFO(NCCL_INIT|NCCL_NET, "P2P plugin %s", ncclNetPlugin_v8.name);

  status = ncclNetPlugin_v8.init(logFunction);
  return status;
}

ncclResult_t pluginInit_v7(ncclDebugLogger_t logFunction) {
pluginLogFunction = logFunction;
pluginSetup();
Expand Down
Loading

0 comments on commit 4e5e81e

Please sign in to comment.