Skip to content
This repository has been archived by the owner on Apr 23, 2021. It is now read-only.

Commit

Permalink
Switch comments from GPU dialect terms to CUDA terms (NFC).
Browse files Browse the repository at this point in the history
local workgroup -> block, subgroup -> warp, invocation -> thread.

PiperOrigin-RevId: 271946342
  • Loading branch information
chsigg authored and tensorflower-gardener committed Sep 30, 2019
1 parent 8db2fd6 commit b5bea2c
Showing 1 changed file with 7 additions and 8 deletions.
15 changes: 7 additions & 8 deletions lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,11 @@ struct GPUAllReduceOpLowering : public LLVMOpLowering {
}

private:
- // Creates an all_reduce across the local workgroup.
+ // Creates an all_reduce across the block.
  //
- // First reduce the elements within a subgroup (i.e. warp). The first
- // invocation of each subgroup writes the intermediate result to shared
- // memory. After synchronizing the local workgroup, each subgroup reduces all
- // values from shared memory.
+ // First reduce the elements within a warp. The first thread of each warp
+ // writes the intermediate result to shared memory. After synchronizing the
+ // block, each warp reduces all values from shared memory.
//
// %warp_reduce = ... (see createWarpReduce)
// %buffer = llvm.mlir.addressof @reduce_buffer : !llvm<"[32 x float]*">
Expand Down Expand Up @@ -188,7 +187,7 @@ struct GPUAllReduceOpLowering : public LLVMOpLowering {
return result;
}

- // Creates an all_reduce across the subgroup. Creates a preamble
+ // Creates an all_reduce across the warp. Creates a preamble
//
// %active_mask = llvm.mlir.constant(-1 : i32) : !llvm.i32
// %mask_and_clamp = llvm.mlir.constant(31 : i32) : !llvm.i32
Expand All @@ -200,7 +199,7 @@ struct GPUAllReduceOpLowering : public LLVMOpLowering {
// %active_mask, %operand, %offset, %mask_and_clamp : !llvm.float
// %operand = llvm.fadd %operand, %value : !llvm.float
//
- // Each invocation returns the same result.
+ // Each thread returns the same result.
//
// Note: this currently only supports reducing exactly 32 values.
Value *createWarpReduce(Location loc, Value *operand,
Expand Down Expand Up @@ -245,7 +244,7 @@ struct GPUAllReduceOpLowering : public LLVMOpLowering {
return rewriter.create<LLVM::AddressOfOp>(loc, globalOp);
}

- // Returns the index of the subgroup within the local workgroup.
+ // Returns the index of the warp within the block.
//
// %warp_size = llvm.mlir.constant(32 : i32) : !llvm.i32
// %thread_idx = nvvm.read.ptx.sreg.tid.x : !llvm.i32
Expand Down

0 comments on commit b5bea2c

Please sign in to comment.