Skip to content

Commit

Permalink
feat(gpu): optimize integer mul when one of the ciphertexts holds a boolean value
Browse files Browse the repository at this point in the history
  • Loading branch information
bbarbakadze authored and agnesLeroy committed Nov 21, 2024
1 parent 40dd2a6 commit 27ccfbd
Show file tree
Hide file tree
Showing 8 changed files with 164 additions and 77 deletions.
18 changes: 10 additions & 8 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,17 +103,19 @@ void cleanup_cuda_full_propagation(void *const *streams,

void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
uint32_t ks_level, uint32_t grouping_factor, uint32_t num_blocks,
PBS_TYPE pbs_type, bool allocate_gpu_memory);
int8_t **mem_ptr, bool const is_boolean_left, bool const is_boolean_right,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
bool allocate_gpu_memory);

void cuda_integer_mult_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void const *radix_lwe_left,
void const *radix_lwe_right, void *const *bsks, void *const *ksks,
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);
void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left,
void const *radix_lwe_right, bool const is_bool_right, void *const *bsks,
void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size,
uint32_t num_blocks);

void cleanup_cuda_integer_mult(void *const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Expand Down
130 changes: 86 additions & 44 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -1117,20 +1117,97 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
}
};

template <typename Torus> struct int_zero_out_if_buffer {

  // Radix parameters; big_lwe_dimension sizes the scratch buffer below.
  int_radix_params params;

  // Device scratch holding num_radix_blocks big-LWE ciphertexts.
  // Only allocated when allocate_gpu_memory is true.
  Torus *tmp = nullptr;

  // Dedicated per-GPU streams so the "true"/"false" predicate branches can
  // run concurrently with the caller's streams.
  cudaStream_t *true_streams = nullptr;
  cudaStream_t *false_streams = nullptr;
  uint32_t active_gpu_count;

  int_zero_out_if_buffer(cudaStream_t const *streams,
                         uint32_t const *gpu_indexes, uint32_t gpu_count,
                         int_radix_params params, uint32_t num_radix_blocks,
                         bool allocate_gpu_memory) {
    this->params = params;
    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);

    // One big-LWE ciphertext (big_lwe_dimension + 1 coefficients) per block.
    Torus big_size =
        (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
    if (allocate_gpu_memory) {
      tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
      // We may use a different stream to allow concurrent operation
      true_streams =
          (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
      false_streams =
          (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
      for (uint32_t j = 0; j < active_gpu_count; j++) {
        true_streams[j] = cuda_create_stream(gpu_indexes[j]);
        false_streams[j] = cuda_create_stream(gpu_indexes[j]);
      }
    }
  }

  // Frees the scratch buffer and destroys the branch streams. Members are
  // nullptr-initialized and each free is guarded, so release() is safe (and
  // idempotent) even when the constructor ran with
  // allocate_gpu_memory == false, where the unguarded version would have
  // released uninitialized pointers.
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    if (tmp != nullptr) {
      cuda_drop_async(tmp, streams[0], gpu_indexes[0]);
      tmp = nullptr;
    }
    if (true_streams != nullptr && false_streams != nullptr) {
      for (uint32_t j = 0; j < active_gpu_count; j++) {
        cuda_destroy_stream(true_streams[j], gpu_indexes[j]);
        cuda_destroy_stream(false_streams[j], gpu_indexes[j]);
      }
      free(true_streams);
      free(false_streams);
      true_streams = nullptr;
      false_streams = nullptr;
    }
  }
};

template <typename Torus> struct int_mul_memory {
Torus *vector_result_sb;
Torus *block_mul_res;
Torus *small_lwe_vector;

int_radix_lut<Torus> *luts_array; // lsb msb
int_radix_lut<Torus> *zero_out_predicate_lut;

int_sum_ciphertexts_vec_memory<Torus> *sum_ciphertexts_mem;
int_zero_out_if_buffer<Torus> *zero_out_mem;

int_radix_params params;
bool boolean_mul = false;

int_mul_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
bool const is_boolean_left, bool const is_boolean_right,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
this->boolean_mul = is_boolean_left || is_boolean_right;
this->params = params;

if (boolean_mul) {
auto zero_out_predicate_lut_f = [](Torus block,
Torus condition) -> Torus {
if (condition == 0)
return 0;
else
return block;
};
zero_out_predicate_lut =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_radix_blocks, allocate_gpu_memory);
generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0],
zero_out_predicate_lut->get_lut(gpu_indexes[0], 0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, zero_out_predicate_lut_f);
zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes,
gpu_indexes[0]);

zero_out_mem = new int_zero_out_if_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory);

return;
}

auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
Expand Down Expand Up @@ -1203,6 +1280,15 @@ template <typename Torus> struct int_mul_memory {

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {

if (boolean_mul) {
zero_out_predicate_lut->release(streams, gpu_indexes, gpu_count);
zero_out_mem->release(streams, gpu_indexes, gpu_count);
delete zero_out_mem;
delete zero_out_predicate_lut;

return;
}
cuda_drop_async(vector_result_sb, streams[0], gpu_indexes[0]);
cuda_drop_async(block_mul_res, streams[0], gpu_indexes[0]);
cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]);
Expand Down Expand Up @@ -1598,50 +1684,6 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
}
};

// Scratch state for the conditional zero-out operation used by the
// boolean-multiplication fast path.
template <typename Torus> struct int_zero_out_if_buffer {

// Radix parameters; big_lwe_dimension sizes the scratch buffer below.
int_radix_params params;

// Device scratch for num_radix_blocks big-LWE ciphertexts; only set when
// the constructor ran with allocate_gpu_memory == true.
Torus *tmp;

// Per-GPU streams for the "true"/"false" branches of the predicate.
cudaStream_t *true_streams;
cudaStream_t *false_streams;
uint32_t active_gpu_count;

int_zero_out_if_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
this->params = params;
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);

// One big-LWE ciphertext (big_lwe_dimension + 1 coefficients) per block.
Torus big_size =
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
if (allocate_gpu_memory) {
tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
// We may use a different stream to allow concurrent operation
true_streams =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
false_streams =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < active_gpu_count; j++) {
true_streams[j] = cuda_create_stream(gpu_indexes[j]);
false_streams[j] = cuda_create_stream(gpu_indexes[j]);
}
}
}
// NOTE(review): release() assumes allocate_gpu_memory was true; tmp and the
// stream arrays are uninitialized otherwise. Verify callers never release a
// buffer constructed with allocate_gpu_memory == false.
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp, streams[0], gpu_indexes[0]);
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(true_streams[j], gpu_indexes[j]);
cuda_destroy_stream(false_streams[j], gpu_indexes[j]);
}
free(true_streams);
free(false_streams);
}
};

template <typename Torus> struct int_cmux_buffer {
int_radix_lut<Torus> *predicate_lut;
int_radix_lut<Torus> *inverted_predicate_lut;
Expand Down
50 changes: 26 additions & 24 deletions backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,12 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
*/
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
uint32_t ks_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
int8_t **mem_ptr, bool const is_boolean_left, bool const is_boolean_right,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
polynomial_size * glwe_dimension, lwe_dimension,
Expand All @@ -88,8 +89,8 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
case 16384:
scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
(cudaStream_t const *)(streams), gpu_indexes, gpu_count,
(int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
allocate_gpu_memory);
(int_mul_memory<uint64_t> **)mem_ptr, is_boolean_left, is_boolean_right,
num_radix_blocks, params, allocate_gpu_memory);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
Expand Down Expand Up @@ -126,65 +127,66 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
*/
void cuda_integer_mult_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void const *radix_lwe_left,
void const *radix_lwe_right, void *const *bsks, void *const *ksks,
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left,
void const *radix_lwe_right, bool const is_bool_right, void *const *bsks,
void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size,
uint32_t num_blocks) {

switch (polynomial_size) {
case 256:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 512:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 1024:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 2048:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 4096:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 8192:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 16384:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks,
static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
default:
Expand Down
20 changes: 19 additions & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "helper_multi_gpu.h"
#include "integer/cmux.cuh"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "linear_algebra.h"
Expand Down Expand Up @@ -453,7 +454,8 @@ template <typename Torus, class params>
__host__ void host_integer_mult_radix_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, uint64_t *radix_lwe_out, uint64_t const *radix_lwe_left,
uint64_t const *radix_lwe_right, void *const *bsks, uint64_t *const *ksks,
bool const is_bool_left, uint64_t const *radix_lwe_right,
bool const is_bool_right, void *const *bsks, uint64_t *const *ksks,
int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {

auto glwe_dimension = mem_ptr->params.glwe_dimension;
Expand All @@ -464,6 +466,20 @@ __host__ void host_integer_mult_radix_kb(

int big_lwe_dimension = glwe_dimension * polynomial_size;

if (is_bool_right) {
zero_out_if<Torus>(streams, gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_left, radix_lwe_right, mem_ptr->zero_out_mem,
mem_ptr->zero_out_predicate_lut, bsks, ksks, num_blocks);
return;
}

if (is_bool_left) {
zero_out_if<Torus>(streams, gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_right, radix_lwe_left, mem_ptr->zero_out_mem,
mem_ptr->zero_out_predicate_lut, bsks, ksks, num_blocks);
return;
}

// 'vector_result_lsb' contains blocks from all possible right shifts of
// radix_lwe_left, only nonzero blocks are kept
int lsb_vector_block_count = num_blocks * (num_blocks + 1) / 2;
Expand Down Expand Up @@ -572,9 +588,11 @@ template <typename Torus>
// Allocates the scratch state for a radix multiplication. When either
// operand is flagged as holding a boolean value, the int_mul_memory
// constructor sets up only the zero-out predicate path instead of the full
// multiplication buffers (see the boolean_mul early return in its ctor).
// The caller owns *mem_ptr and must release/delete it after use.
__host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_mul_memory<Torus> **mem_ptr,
bool const is_boolean_left, bool const is_boolean_right,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr = new int_mul_memory<Torus>(streams, gpu_indexes, gpu_count, params,
is_boolean_left, is_boolean_right,
num_radix_blocks, allocate_gpu_memory);
}

Expand Down
4 changes: 4 additions & 0 deletions backends/tfhe-cuda-backend/src/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,8 @@ extern "C" {
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
is_boolean_left: bool,
is_boolean_right: bool,
message_modulus: u32,
carry_modulus: u32,
glwe_dimension: u32,
Expand All @@ -312,7 +314,9 @@ extern "C" {
gpu_count: u32,
radix_lwe_out: *mut ffi::c_void,
radix_lwe_left: *const ffi::c_void,
is_bool_left: bool,
radix_lwe_right: *const ffi::c_void,
is_bool_right: bool,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
mem_ptr: *mut i8,
Expand Down
7 changes: 7 additions & 0 deletions tfhe/src/integer/gpu/ciphertext/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ pub trait CudaIntegerRadixCiphertext: Sized {
.all(CudaBlockInfo::carry_is_empty)
}

/// Returns `true` when this ciphertext can only encrypt 0 or 1: the first
/// block's degree is at most 1 and every remaining block's degree is 0.
fn holds_boolean_value(&self) -> bool {
    let blocks = &self.as_ref().info.blocks;
    let first_is_boolean = blocks[0].degree.get() <= 1;
    let tail_is_trivially_zero =
        blocks[1..].iter().all(|info| info.degree.get() == 0);
    first_is_boolean && tail_is_trivially_zero
}

fn is_equal(&self, other: &Self, streams: &CudaStreams) -> bool {
self.as_ref().is_equal(other.as_ref(), streams)
}
Expand Down
Loading

0 comments on commit 27ccfbd

Please sign in to comment.