Skip to content

feat(gpu): add necessary entry points for 128 bit compression #2303

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ fn main() {
"cuda/include/integer/compression/compression.h",
"cuda/include/integer/integer.h",
"cuda/include/zk/zk.h",
"cuda/include/keyswitch.h",
"cuda/include/keyswitch/keyswitch.h",
"cuda/include/keyswitch/ks_enums.h",
"cuda/include/linear_algebra.h",
"cuda/include/fft/fft128.h",
Expand Down
5 changes: 5 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/ciphertext.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ void cuda_improve_noise_modulus_switch_64(
void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
uint32_t num_lwes, uint32_t num_zeros, double input_variance,
double r_sigma, double bound, uint32_t log_modulus);

/* Sample extraction on 128-bit (__uint128_t) GLWE ciphertexts.
 *
 * Writes extracted LWE ciphertexts to `lwe_array_out` from the GLWEs in
 * `glwe_array_in`; both buffers hold 128-bit words.
 * `nth_array` holds one index per output — presumably the coefficient to
 * extract for each LWE; confirm against host_sample_extract.
 * `polynomial_size` must be a power of two in [256..4096]; other values
 * make the implementation panic.
 */
void cuda_glwe_sample_extract_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
    uint32_t lwe_per_glwe, uint32_t glwe_dimension, uint32_t polynomial_size);
}

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "integer.h"
#include "integer/radix_ciphertext.cuh"
#include "integer/radix_ciphertext.h"
#include "keyswitch.h"
#include "keyswitch/keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include <cmath>
#include <functional>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,18 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_64(
uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_lwes);

/* Set up the scratch buffer `fp_ks_buffer` used by
 * cuda_packing_keyswitch_lwe_list_to_glwe_128.
 * When `allocate_gpu_memory` is false no device allocation is performed —
 * presumably only the required size is computed; confirm in the
 * scratch_packing_keyswitch_lwe_list_to_glwe template.
 */
void scratch_packing_keyswitch_lwe_list_to_glwe_128(
    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t num_lwes, bool allocate_gpu_memory);

/* Functional packing keyswitch of a list of 128-bit (__uint128_t) LWE
 * ciphertexts into a GLWE ciphertext.
 * `fp_ks_buffer` must have been prepared with
 * scratch_packing_keyswitch_lwe_list_to_glwe_128 for the same dimensions.
 */
void cuda_packing_keyswitch_lwe_list_to_glwe_128(
    void *stream, uint32_t gpu_index, void *glwe_array_out,
    void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
    uint32_t input_lwe_dimension, uint32_t output_glwe_dimension,
    uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_lwes);

void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
uint32_t gpu_index,
int8_t **fp_ks_buffer,
Expand Down
42 changes: 42 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,45 @@ void cuda_improve_noise_modulus_switch_64(
static_cast<const uint64_t *>(encrypted_zeros), lwe_size, num_lwes,
num_zeros, input_variance, r_sigma, bound, log_modulus);
}

/* Entry point for sample extraction on 128-bit (__uint128_t) GLWE
 * ciphertexts.
 *
 * Dispatches on `polynomial_size` to a statically sized template instance of
 * host_sample_extract; supported sizes are the powers of two in [256..4096],
 * anything else panics. `nth_array` holds one index per extracted LWE —
 * presumably the coefficient to extract; confirm in host_sample_extract.
 */
void cuda_glwe_sample_extract_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
    uint32_t lwe_per_glwe, uint32_t glwe_dimension, uint32_t polynomial_size) {

  auto cuda_stream = static_cast<cudaStream_t>(stream);
  auto lwe_out = static_cast<__uint128_t *>(lwe_array_out);
  auto glwe_in = static_cast<__uint128_t const *>(glwe_array_in);

  switch (polynomial_size) {
// Each supported polynomial size maps onto a dedicated template instance;
// the macro keeps the five cases textually identical.
#define DISPATCH_SAMPLE_EXTRACT_128(N)                                         \
  case N:                                                                      \
    host_sample_extract<__uint128_t, AmortizedDegree<N>>(                      \
        cuda_stream, gpu_index, lwe_out, glwe_in, nth_array, num_nths,         \
        lwe_per_glwe, glwe_dimension);                                         \
    break;
    DISPATCH_SAMPLE_EXTRACT_128(256)
    DISPATCH_SAMPLE_EXTRACT_128(512)
    DISPATCH_SAMPLE_EXTRACT_128(1024)
    DISPATCH_SAMPLE_EXTRACT_128(2048)
    DISPATCH_SAMPLE_EXTRACT_128(4096)
#undef DISPATCH_SAMPLE_EXTRACT_128
  default:
    PANIC("Cuda error: unsupported polynomial size. Supported "
          "N's are powers of two in the interval [256..4096].")
  }
}
36 changes: 31 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
#include "fast_packing_keyswitch.cuh"
#include "keyswitch.cuh"
#include "keyswitch.h"
#include <cstdint>
#include <stdio.h>
#include "keyswitch/keyswitch.h"
#include "packing_keyswitch.cuh"

/* Perform keyswitch on a batch of 32 bits input LWE ciphertexts.
* Head out to the equivalent operation on 64 bits for more details.
Expand Down Expand Up @@ -73,7 +71,7 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_64(
uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_lwes) {

host_fast_packing_keyswitch_lwe_list_to_glwe<uint64_t, ulonglong4>(
host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(glwe_array_out),
static_cast<const uint64_t *>(lwe_array_in),
Expand All @@ -90,3 +88,31 @@ void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
static_cast<cudaStream_t>(stream),
gpu_index, gpu_memory_allocated);
}

/* Set up the scratch buffer `fp_ks_buffer` for the 128-bit packing
 * keyswitch. Thin C entry point over the
 * scratch_packing_keyswitch_lwe_list_to_glwe template instantiated for
 * __uint128_t. When `allocate_gpu_memory` is false no device memory is
 * allocated — presumably only sizing is done; confirm in the template.
 */
void scratch_packing_keyswitch_lwe_list_to_glwe_128(
    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t num_lwes, bool allocate_gpu_memory) {
  auto cuda_stream = static_cast<cudaStream_t>(stream);
  scratch_packing_keyswitch_lwe_list_to_glwe<__uint128_t>(
      cuda_stream, gpu_index, fp_ks_buffer, lwe_dimension, glwe_dimension,
      polynomial_size, num_lwes, allocate_gpu_memory);
}

/* Perform functional packing keyswitch on a batch of 128-bit input LWE
 * ciphertexts, packing the list into a single GLWE ciphertext.
 *
 * Thin C entry point over host_packing_keyswitch_lwe_list_to_glwe
 * instantiated for __uint128_t. `fp_ks_buffer` must have been prepared with
 * scratch_packing_keyswitch_lwe_list_to_glwe_128 for the same dimensions.
 */

void cuda_packing_keyswitch_lwe_list_to_glwe_128(
    void *stream, uint32_t gpu_index, void *glwe_array_out,
    void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
    uint32_t input_lwe_dimension, uint32_t output_glwe_dimension,
    uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_lwes) {
  host_packing_keyswitch_lwe_list_to_glwe<__uint128_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<__uint128_t *>(glwe_array_out),
      static_cast<const __uint128_t *>(lwe_array_in),
      static_cast<const __uint128_t *>(fp_ksk_array), fp_ks_buffer,
      input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
      base_log, level_count, num_lwes);
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ template <typename Torus> uint64_t get_shared_mem_size_tgemm() {
// Initialize decomposition by performing rounding
// and decomposing one level of an array of Torus LWEs. Only
// decomposes the mask elements of the incoming LWEs.
template <typename Torus, typename TorusVec>
template <typename Torus>
__global__ void decompose_vectorize_init(Torus const *lwe_in, Torus *lwe_out,
uint32_t lwe_dimension,
uint32_t num_lwe, uint32_t base_log,
Expand Down Expand Up @@ -63,7 +63,7 @@ __global__ void decompose_vectorize_init(Torus const *lwe_in, Torus *lwe_out,
// Continue decomposiion of an array of Torus elements in place. Supposes
// that the array contains already decomposed elements and
// computes the new decomposed level in place.
template <typename Torus, typename TorusVec>
template <typename Torus>
__global__ void
decompose_vectorize_step_inplace(Torus *buffer_in, uint32_t lwe_dimension,
uint32_t num_lwe, uint32_t base_log,
Expand Down Expand Up @@ -101,7 +101,7 @@ decompose_vectorize_step_inplace(Torus *buffer_in, uint32_t lwe_dimension,
// This code is adapted by generalizing the 1d block-tiling
// kernel from https://github.com/siboehm/SGEMM_CUDA
// to any matrix dimension
template <typename Torus, typename TorusVec>
template <typename Torus>
__global__ void tgemm(int M, int N, int K, const Torus *A, const Torus *B,
int stride_B, Torus *C) {

Expand Down Expand Up @@ -251,8 +251,8 @@ __global__ void polynomial_accumulate_monic_monomial_mul_many_neg_and_add_C(
degree, coeffIdx, polynomial_size, 1, true);
}

template <typename Torus, typename TorusVec>
__host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(
template <typename Torus>
__host__ void host_packing_keyswitch_lwe_list_to_glwe(
cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
Torus const *lwe_array_in, Torus const *fp_ksk_array, int8_t *fp_ks_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
Expand Down Expand Up @@ -296,10 +296,8 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(
dim3 threads_decomp(BLOCK_SIZE_DECOMP, BLOCK_SIZE_DECOMP);

// decompose first level
decompose_vectorize_init<Torus, TorusVec>
<<<grid_decomp, threads_decomp, 0, stream>>>(lwe_array_in, d_mem_0,
lwe_dimension, num_lwes,
base_log, level_count);
decompose_vectorize_init<Torus><<<grid_decomp, threads_decomp, 0, stream>>>(
lwe_array_in, d_mem_0, lwe_dimension, num_lwes, base_log, level_count);
check_cuda_error(cudaGetLastError());

// gemm to ks the individual LWEs to GLWEs
Expand All @@ -310,23 +308,22 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(
auto stride_KSK_buffer = glwe_accumulator_size * level_count;

uint32_t shared_mem_size = get_shared_mem_size_tgemm<Torus>();
tgemm<Torus, TorusVec><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
tgemm<Torus><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0, fp_ksk_array,
stride_KSK_buffer, d_mem_1);
check_cuda_error(cudaGetLastError());

auto ksk_block_size = glwe_accumulator_size;

for (int li = 1; li < level_count; ++li) {
decompose_vectorize_step_inplace<Torus, TorusVec>
decompose_vectorize_step_inplace<Torus>
<<<grid_decomp, threads_decomp, 0, stream>>>(
d_mem_0, lwe_dimension, num_lwes, base_log, level_count);
check_cuda_error(cudaGetLastError());

tgemm<Torus, TorusVec>
<<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0,
fp_ksk_array + li * ksk_block_size, stride_KSK_buffer, d_mem_1);
tgemm<Torus><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0,
fp_ksk_array + li * ksk_block_size, stride_KSK_buffer, d_mem_1);
check_cuda_error(cudaGetLastError());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
#define CUDA_INTEGER_COMPRESSION_CUH

#include "ciphertext.h"
#include "crypto/fast_packing_keyswitch.cuh"
#include "crypto/keyswitch.cuh"
#include "crypto/packing_keyswitch.cuh"
#include "device.h"
#include "integer/compression/compression.h"
#include "integer/compression/compression_utilities.h"
Expand Down Expand Up @@ -116,7 +116,7 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
while (rem_lwes > 0) {
auto chunk_size = min(rem_lwes, mem_ptr->lwe_per_glwe);

host_fast_packing_keyswitch_lwe_list_to_glwe<Torus, ulonglong4>(
host_packing_keyswitch_lwe_list_to_glwe<Torus>(
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
compression_params.polynomial_size, compression_params.ks_base_log,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#ifndef SETUP_AND_TEARDOWN_H
#define SETUP_AND_TEARDOWN_H

#include "keyswitch/keyswitch.h"
#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include <device.h>
#include <keyswitch.h>
#include <utils.h>

void programmable_bootstrap_classical_setup(
Expand Down
41 changes: 41 additions & 0 deletions backends/tfhe-cuda-backend/src/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,19 @@ unsafe extern "C" {
log_modulus: u32,
);
}
unsafe extern "C" {
    // NOTE(review): bindings.rs looks bindgen-generated; hand-written
    // comments here may be lost on the next regeneration.
    /// FFI binding for the CUDA 128-bit GLWE sample-extract entry point
    /// (`cuda_glwe_sample_extract_128` in cuda/src/crypto/ciphertext.cu).
    pub fn cuda_glwe_sample_extract_128(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        lwe_array_out: *mut ffi::c_void,
        glwe_array_in: *const ffi::c_void,
        nth_array: *const u32,
        num_nths: u32,
        lwe_per_glwe: u32,
        glwe_dimension: u32,
        polynomial_size: u32,
    );
}
pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0;
pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1;
pub type PBS_TYPE = ffi::c_uint;
Expand Down Expand Up @@ -1429,6 +1442,34 @@ unsafe extern "C" {
num_lwes: u32,
);
}
unsafe extern "C" {
    // NOTE(review): bindings.rs looks bindgen-generated; hand-written
    // comments here may be lost on the next regeneration.
    /// FFI binding for the scratch-buffer setup of the CUDA 128-bit packing
    /// keyswitch (see cuda/src/crypto/keyswitch.cu).
    pub fn scratch_packing_keyswitch_lwe_list_to_glwe_128(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        fp_ks_buffer: *mut *mut i8,
        lwe_dimension: u32,
        glwe_dimension: u32,
        polynomial_size: u32,
        num_lwes: u32,
        allocate_gpu_memory: bool,
    );
}
unsafe extern "C" {
    // NOTE(review): bindings.rs looks bindgen-generated; hand-written
    // comments here may be lost on the next regeneration.
    /// FFI binding for the CUDA 128-bit packing keyswitch entry point;
    /// `fp_ks_buffer` must come from
    /// `scratch_packing_keyswitch_lwe_list_to_glwe_128`.
    pub fn cuda_packing_keyswitch_lwe_list_to_glwe_128(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        glwe_array_out: *mut ffi::c_void,
        lwe_array_in: *const ffi::c_void,
        fp_ksk_array: *const ffi::c_void,
        fp_ks_buffer: *mut i8,
        input_lwe_dimension: u32,
        output_glwe_dimension: u32,
        output_polynomial_size: u32,
        base_log: u32,
        level_count: u32,
        num_lwes: u32,
    );
}
unsafe extern "C" {
pub fn cleanup_packing_keyswitch_lwe_list_to_glwe(
stream: *mut ffi::c_void,
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#include "cuda/include/integer/compression/compression.h"
#include "cuda/include/integer/integer.h"
#include "cuda/include/zk/zk.h"
#include "cuda/include/keyswitch.h"
#include "cuda/include/keyswitch/keyswitch.h"
#include "cuda/include/keyswitch/ks_enums.h"
#include "cuda/include/linear_algebra.h"
#include "cuda/include/fft/fft128.h"
Expand Down
6 changes: 3 additions & 3 deletions tfhe/benches/core_crypto/ks_bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,7 @@ mod cuda {
use tfhe::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use tfhe::core_crypto::gpu::{
cuda_keyswitch_lwe_ciphertext, cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext,
cuda_keyswitch_lwe_ciphertext, cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64,
get_number_of_gpus, CudaStreams,
};
use tfhe::core_crypto::prelude::*;
Expand Down Expand Up @@ -796,7 +796,7 @@ mod cuda {
{
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext(
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64(
gpu_keys.pksk.as_ref().unwrap(),
&d_input_lwe_list,
&mut d_output_glwe,
Expand Down Expand Up @@ -879,7 +879,7 @@ mod cuda {
((i, input_lwe_list), output_glwe_list),
local_stream,
)| {
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext(
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64(
gpu_keys_vec[i].pksk.as_ref().unwrap(),
input_lwe_list,
output_glwe_list,
Expand Down
Loading
Loading