From 27ccfbd939c055aa11222ca6f585d7d31139a60b Mon Sep 17 00:00:00 2001
From: Beka Barbakadze
Date: Wed, 20 Nov 2024 16:03:30 +0400
Subject: [PATCH] feat(gpu): optimize integer mul when one of the ct holds
 boolean

---
 .../cuda/include/integer/integer.h            |  18 +--
 .../cuda/include/integer/integer_utilities.h  | 130 ++++++++++------
 .../cuda/src/integer/multiplication.cu        |  50 +++----
 .../cuda/src/integer/multiplication.cuh       |  20 ++-
 backends/tfhe-cuda-backend/src/bindings.rs    |   4 +
 tfhe/src/integer/gpu/ciphertext/mod.rs        |   7 +
 tfhe/src/integer/gpu/mod.rs                   |   6 +
 tfhe/src/integer/gpu/server_key/radix/mul.rs  |   6 +
 8 files changed, 164 insertions(+), 77 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
index 3ac4886018..898daba86d 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -103,17 +103,19 @@ void cleanup_cuda_full_propagation(void *const *streams,
 
 void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
-    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
-    uint32_t ks_level, uint32_t grouping_factor, uint32_t num_blocks,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory);
+    int8_t **mem_ptr, bool const is_boolean_left, bool const is_boolean_right,
+    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
+    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
+    uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
 
 void cuda_integer_mult_radix_ciphertext_kb_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void const *radix_lwe_left,
-    void const *radix_lwe_right, void *const *bsks, void *const *ksks,
-    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);
+    void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left,
+    void const *radix_lwe_right, bool const is_bool_right, void *const *bsks,
+    void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size,
+    uint32_t num_blocks);
 
 void cleanup_cuda_integer_mult(void *const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
index 73d191989e..1a985c1ca0 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -1117,20 +1117,97 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
   }
 };
 
+template <typename Torus> struct int_zero_out_if_buffer {
+
+  int_radix_params params;
+
+  Torus *tmp;
+
+  cudaStream_t *true_streams;
+  cudaStream_t *false_streams;
+  uint32_t active_gpu_count;
+
+  int_zero_out_if_buffer(cudaStream_t const *streams,
+                         uint32_t const *gpu_indexes, uint32_t gpu_count,
+                         int_radix_params params, uint32_t num_radix_blocks,
+                         bool allocate_gpu_memory) {
+    this->params = params;
+    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+
+    Torus big_size =
+        (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
+    if (allocate_gpu_memory) {
+      tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
+      // We may use a different stream to allow concurrent operation
+      true_streams =
+          (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+      false_streams =
+          (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+      for (uint j = 0; j < active_gpu_count; j++) {
+        true_streams[j] = cuda_create_stream(gpu_indexes[j]);
+        false_streams[j] = cuda_create_stream(gpu_indexes[j]);
+      }
+    }
+  }
+  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+               uint32_t gpu_count) {
+    cuda_drop_async(tmp, streams[0], gpu_indexes[0]);
+    for (uint j = 0; j < active_gpu_count; j++) {
+      cuda_destroy_stream(true_streams[j], gpu_indexes[j]);
+      cuda_destroy_stream(false_streams[j], gpu_indexes[j]);
+    }
+    free(true_streams);
+    free(false_streams);
+  }
+};
+
 template <typename Torus> struct int_mul_memory {
   Torus *vector_result_sb;
   Torus *block_mul_res;
   Torus *small_lwe_vector;
   int_radix_lut<Torus> *luts_array; // lsb msb
+  int_radix_lut<Torus> *zero_out_predicate_lut;
+
+  int_sum_ciphertexts_vec_memory<Torus> *sum_ciphertexts_mem;
+  int_zero_out_if_buffer<Torus> *zero_out_mem;
 
   int_radix_params params;
+  bool boolean_mul = false;
 
   int_mul_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                  uint32_t gpu_count, int_radix_params params,
+                 bool const is_boolean_left, bool const is_boolean_right,
                  uint32_t num_radix_blocks, bool allocate_gpu_memory) {
+    this->boolean_mul = is_boolean_left || is_boolean_right;
     this->params = params;
+
+    if (boolean_mul) {
+      auto zero_out_predicate_lut_f = [](Torus block,
+                                         Torus condition) -> Torus {
+        if (condition == 0)
+          return 0;
+        else
+          return block;
+      };
+      zero_out_predicate_lut =
+          new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
+                                   num_radix_blocks, allocate_gpu_memory);
+      generate_device_accumulator_bivariate<Torus>(
+          streams[0], gpu_indexes[0],
+          zero_out_predicate_lut->get_lut(gpu_indexes[0], 0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, zero_out_predicate_lut_f);
+      zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes,
+                                            gpu_indexes[0]);
+
+      zero_out_mem = new int_zero_out_if_buffer<Torus>(
+          streams, gpu_indexes, gpu_count, params, num_radix_blocks,
+          allocate_gpu_memory);
+
+      return;
+    }
+
     auto glwe_dimension = params.glwe_dimension;
     auto polynomial_size = params.polynomial_size;
     auto message_modulus = params.message_modulus;
@@ -1203,6 +1280,15 @@ template <typename Torus> struct int_mul_memory {
 
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count) {
+
+    if (boolean_mul) {
+      zero_out_predicate_lut->release(streams, gpu_indexes, gpu_count);
+      zero_out_mem->release(streams, gpu_indexes, gpu_count);
+      delete zero_out_mem;
+      delete zero_out_predicate_lut;
+
+      return;
+    }
     cuda_drop_async(vector_result_sb, streams[0], gpu_indexes[0]);
     cuda_drop_async(block_mul_res, streams[0], gpu_indexes[0]);
     cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]);
@@ -1598,50 +1684,6 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
   }
 };
 
-template <typename Torus> struct int_zero_out_if_buffer {
-
-  int_radix_params params;
-
-  Torus *tmp;
-
-  cudaStream_t *true_streams;
-  cudaStream_t *false_streams;
-  uint32_t active_gpu_count;
-
-  int_zero_out_if_buffer(cudaStream_t const *streams,
-                         uint32_t const *gpu_indexes, uint32_t gpu_count,
-                         int_radix_params params, uint32_t num_radix_blocks,
-                         bool allocate_gpu_memory) {
-    this->params = params;
-    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
-
-    Torus big_size =
-        (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
-    if (allocate_gpu_memory) {
-      tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
-      // We may use a different stream to allow concurrent operation
-      true_streams =
-          (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-      false_streams =
-          (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-      for (uint j = 0; j < active_gpu_count; j++) {
-        true_streams[j] = cuda_create_stream(gpu_indexes[j]);
-        false_streams[j] = cuda_create_stream(gpu_indexes[j]);
-      }
-    }
-  }
-  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
-               uint32_t gpu_count) {
-    cuda_drop_async(tmp, streams[0], gpu_indexes[0]);
-    for (uint j = 0; j < active_gpu_count; j++) {
-      cuda_destroy_stream(true_streams[j], gpu_indexes[j]);
-      cuda_destroy_stream(false_streams[j], gpu_indexes[j]);
-    }
-    free(true_streams);
-    free(false_streams);
-  }
-};
-
 template <typename Torus> struct int_cmux_buffer {
   int_radix_lut<Torus> *predicate_lut;
   int_radix_lut<Torus> *inverted_predicate_lut;
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
index e866f0c84f..0733a2ad34 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -67,11 +67,12 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
  */
 void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
-    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
-    uint32_t ks_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    int8_t **mem_ptr, bool const is_boolean_left, bool const is_boolean_right,
+    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
+    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
+    uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory) {
 
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           polynomial_size * glwe_dimension, lwe_dimension,
@@ -88,8 +89,8 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
   case 16384:
     scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
         (cudaStream_t const *)(streams), gpu_indexes, gpu_count,
-        (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
-        allocate_gpu_memory);
+        (int_mul_memory<uint64_t> **)mem_ptr, is_boolean_left, is_boolean_right,
+        num_radix_blocks, params, allocate_gpu_memory);
     break;
   default:
     PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
" @@ -126,65 +127,66 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( */ void cuda_integer_mult_radix_ciphertext_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void const *radix_lwe_left, - void const *radix_lwe_right, void *const *bsks, void *const *ksks, - int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) { + void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left, + void const *radix_lwe_right, bool const is_bool_right, void *const *bsks, + void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size, + uint32_t num_blocks) { switch (polynomial_size) { case 256: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, + static_cast(radix_lwe_left), is_bool_left, + static_cast(radix_lwe_right), is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 512: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, + static_cast(radix_lwe_left), is_bool_left, + static_cast(radix_lwe_right), is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 1024: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, + static_cast(radix_lwe_left), is_bool_left, + static_cast(radix_lwe_right), is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 2048: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, + static_cast(radix_lwe_left), is_bool_left, + static_cast(radix_lwe_right), is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 4096: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, + static_cast(radix_lwe_left), is_bool_left, + static_cast(radix_lwe_right), is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 8192: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, + static_cast(radix_lwe_left), is_bool_left, + static_cast(radix_lwe_right), is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 16384: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, + static_cast(radix_lwe_left), is_bool_left, + static_cast(radix_lwe_right), is_bool_right, bsks, (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; default: diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 0ebf410125..1e2694e1e2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -9,6 +9,7 @@ #include "crypto/keyswitch.cuh" #include "device.h" #include "helper_multi_gpu.h" 
+#include "integer/cmux.cuh" #include "integer/integer.cuh" #include "integer/integer_utilities.h" #include "linear_algebra.h" @@ -453,7 +454,8 @@ template __host__ void host_integer_mult_radix_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, uint64_t *radix_lwe_out, uint64_t const *radix_lwe_left, - uint64_t const *radix_lwe_right, void *const *bsks, uint64_t *const *ksks, + bool const is_bool_left, uint64_t const *radix_lwe_right, + bool const is_bool_right, void *const *bsks, uint64_t *const *ksks, int_mul_memory *mem_ptr, uint32_t num_blocks) { auto glwe_dimension = mem_ptr->params.glwe_dimension; @@ -464,6 +466,20 @@ __host__ void host_integer_mult_radix_kb( int big_lwe_dimension = glwe_dimension * polynomial_size; + if (is_bool_right) { + zero_out_if(streams, gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_left, radix_lwe_right, mem_ptr->zero_out_mem, + mem_ptr->zero_out_predicate_lut, bsks, ksks, num_blocks); + return; + } + + if (is_bool_left) { + zero_out_if(streams, gpu_indexes, gpu_count, radix_lwe_out, + radix_lwe_right, radix_lwe_left, mem_ptr->zero_out_mem, + mem_ptr->zero_out_predicate_lut, bsks, ksks, num_blocks); + return; + } + // 'vector_result_lsb' contains blocks from all possible right shifts of // radix_lwe_left, only nonzero blocks are kept int lsb_vector_block_count = num_blocks * (num_blocks + 1) / 2; @@ -572,9 +588,11 @@ template __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int_mul_memory **mem_ptr, + bool const is_boolean_left, bool const is_boolean_right, uint32_t num_radix_blocks, int_radix_params params, bool allocate_gpu_memory) { *mem_ptr = new int_mul_memory(streams, gpu_indexes, gpu_count, params, + is_boolean_left, is_boolean_right, num_radix_blocks, allocate_gpu_memory); } diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index 9af53285e7..df0f91ab23 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -290,6 +290,8 @@ extern "C" { gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, + is_boolean_left: bool, + is_boolean_right: bool, message_modulus: u32, carry_modulus: u32, glwe_dimension: u32, @@ -312,7 +314,9 @@ extern "C" { gpu_count: u32, radix_lwe_out: *mut ffi::c_void, radix_lwe_left: *const ffi::c_void, + is_bool_left: bool, radix_lwe_right: *const ffi::c_void, + is_bool_right: bool, bsks: *const *mut ffi::c_void, ksks: *const *mut ffi::c_void, mem_ptr: *mut i8, diff --git a/tfhe/src/integer/gpu/ciphertext/mod.rs b/tfhe/src/integer/gpu/ciphertext/mod.rs index be9707e7de..28df73adde 100644 --- a/tfhe/src/integer/gpu/ciphertext/mod.rs +++ b/tfhe/src/integer/gpu/ciphertext/mod.rs @@ -38,6 +38,13 @@ pub trait CudaIntegerRadixCiphertext: Sized { .all(CudaBlockInfo::carry_is_empty) } + fn holds_boolean_value(&self) -> bool { + self.as_ref().info.blocks[0].degree.get() <= 1 + && self.as_ref().info.blocks[1..] 
+                .iter()
+                .all(|cuda_block_info| cuda_block_info.degree.get() == 0)
+    }
+
     fn is_equal(&self, other: &Self, streams: &CudaStreams) -> bool {
         self.as_ref().is_equal(other.as_ref(), streams)
     }
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index b41d5c4404..de2038d9b4 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -474,7 +474,9 @@ pub unsafe fn unchecked_add_integer_radix_assign_async(
 pub unsafe fn unchecked_mul_integer_radix_kb_assign_async<T: UnsignedInteger, B: Numeric>(
     streams: &CudaStreams,
     radix_lwe_left: &mut CudaVec<T>,
+    is_boolean_left: bool,
     radix_lwe_right: &CudaVec<T>,
+    is_boolean_right: bool,
     bootstrapping_key: &CudaVec<B>,
     keyswitch_key: &CudaVec<T>,
     message_modulus: MessageModulus,
@@ -516,6 +518,8 @@ pub unsafe fn unchecked_mul_integer_radix_kb_assign_async
diff --git a/tfhe/src/integer/gpu/server_key/radix/mul.rs b/tfhe/src/integer/gpu/server_key/radix/mul.rs
--- a/tfhe/src/integer/gpu/server_key/radix/mul.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/mul.rs
                 unchecked_mul_integer_radix_kb_assign_async(
                     stream,
                     &mut ct_left.as_mut().d_blocks.0.d_vec,
+                    is_boolean_left,
                     &ct_right.as_ref().d_blocks.0.d_vec,
+                    is_boolean_right,
                     &d_bsk.d_vec,
                     &self.key_switching_key.d_vec,
                     self.message_modulus,
@@ -98,7 +102,9 @@ impl CudaServerKey {
                 unchecked_mul_integer_radix_kb_assign_async(
                     stream,
                     &mut ct_left.as_mut().d_blocks.0.d_vec,
+                    is_boolean_left,
                     &ct_right.as_ref().d_blocks.0.d_vec,
+                    is_boolean_right,
                     &d_multibit_bsk.d_vec,
                     &self.key_switching_key.d_vec,
                     self.message_modulus,
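
Note (illustrative, not part of the patch): the fast path added above relies on
zero_out_predicate_lut being applied block-wise. When the boolean operand
encrypts 0, every block of the other operand is mapped to 0; otherwise the
blocks pass through unchanged, so no block-by-block multiplication or carry
propagation is needed. The sketch below models this on clear values only, with
plain integers standing in for encrypted radix blocks and hypothetical helper
names; it is not the backend API.

#include <cstdint>
#include <iostream>
#include <vector>

// Clear-value model of the bivariate LUT used by the boolean fast path:
// keep the block when the boolean condition is non-zero, zero it otherwise.
static uint64_t zero_out_predicate(uint64_t block, uint64_t condition) {
  return condition == 0 ? 0 : block;
}

// Clear-value model of the zero-out pass over a radix decomposition: the
// boolean operand contributes a single 0/1 value that conditions every block.
static std::vector<uint64_t>
boolean_mul_model(const std::vector<uint64_t> &blocks, uint64_t boolean_value) {
  std::vector<uint64_t> out(blocks.size());
  for (size_t i = 0; i < blocks.size(); ++i)
    out[i] = zero_out_predicate(blocks[i], boolean_value);
  return out;
}

int main() {
  std::vector<uint64_t> radix_blocks = {3, 1, 2, 0}; // low to high, base 4
  for (uint64_t b : {uint64_t{0}, uint64_t{1}}) {
    auto prod = boolean_mul_model(radix_blocks, b);
    std::cout << "boolean = " << b << " ->";
    for (uint64_t v : prod)
      std::cout << ' ' << v;
    std::cout << '\n'; // boolean = 0 -> all zeros, boolean = 1 -> unchanged
  }
  return 0;
}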