Skip to content

Commit a44db3a

Browse files
committed
chore(gpu): stf experiment
1 parent 11a2919 commit a44db3a

File tree

4 files changed

+154
-51
lines changed

4 files changed

+154
-51
lines changed

backends/tfhe-cuda-backend/cuda/CMakeLists.txt

+26-1
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,34 @@ else()
8888
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
8989
endif()
9090

91+
# Fetch CPM.cmake directly from GitHub if not already present
92+
include(FetchContent)
93+
FetchContent_Declare(
94+
CPM
95+
GIT_REPOSITORY https://github.com/cpm-cmake/CPM.cmake
96+
GIT_TAG v0.38.5 # replace with the desired version or main for latest
97+
)
98+
FetchContent_MakeAvailable(CPM)
99+
100+
include(${cpm_SOURCE_DIR}/cmake/CPM.cmake)
101+
102+
# This will automatically clone CCCL from GitHub and make the exported cmake targets available
103+
cpmaddpackage(
104+
NAME
105+
CCCL
106+
GITHUB_REPOSITORY
107+
"nvidia/cccl"
108+
GIT_TAG
109+
"main"
110+
# The following is required to make the `CCCL::cudax` target available:
111+
OPTIONS
112+
"CCCL_ENABLE_UNSTABLE ON")
113+
91114
# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
92115
set(CMAKE_CUDA_FLAGS
93116
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
94117
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
95-
--use_fast_math -Xcompiler -fPIC")
118+
--use_fast_math -Xcompiler -fPIC -DCCCL_DISABLE_EXCEPTIONS -DCUDASTF_DISABLE_CODE_GENERATION")
96119

97120
set(INCLUDE_DIR include)
98121

@@ -101,6 +124,8 @@ enable_testing()
101124
add_subdirectory(tests_and_benchmarks)
102125
target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})
103126

127+
target_link_libraries(tfhe_cuda_backend PRIVATE CCCL::CCCL CCCL::cudax cuda)
128+
104129
# This is required for rust cargo build
105130
install(TARGETS tfhe_cuda_backend DESTINATION .)
106131

backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh

+37-10
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,11 @@
1616
#include "programmable_bootstrap.cuh"
1717
#include "programmable_bootstrap_multibit.cuh"
1818
#include "types/complex/operations.cuh"
19+
#include <cuda/experimental/stf.cuh>
1920
#include <vector>
2021

22+
namespace cudastf = cuda::experimental::stf;
23+
2124
template <typename Torus, class params, sharedMemDegree SMD>
2225
__global__ void __launch_bounds__(params::degree / params::opt)
2326
device_multi_bit_programmable_bootstrap_cg_accumulate(
@@ -384,25 +387,49 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
384387
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
385388
uint32_t num_many_lut, uint32_t lut_stride) {
386389

390+
// Generate a CUDA graph if the USE_CUDA_GRAPH environment variable is set to a non-zero value
391+
const char *use_graph_env = getenv("USE_CUDA_GRAPH");
392+
393+
cudastf::context ctx(stream);
394+
if (use_graph_env && atoi(use_graph_env) != 0) {
395+
ctx = cudastf::graph_ctx(stream);
396+
}
397+
387398
auto lwe_chunk_size = buffer->lwe_chunk_size;
388399

400+
auto buffer_token = ctx.logical_token();
401+
389402
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
390403
lwe_offset += lwe_chunk_size) {
391404

405+
auto key_token = ctx.logical_token();
406+
auto result_token = ctx.logical_token();
407+
392408
// Compute a keybundle
393-
execute_compute_keybundle<Torus, params>(
394-
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
395-
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
396-
grouping_factor, level_count, lwe_offset);
409+
ctx.task(key_token.write(), buffer_token.write())
410+
.set_symbol("compute_keybundle")
411+
->*[&](cudaStream_t stf_stream) {
412+
execute_compute_keybundle<Torus, params>(
413+
stf_stream, gpu_index, lwe_array_in, lwe_input_indexes,
414+
bootstrapping_key, buffer, num_samples, lwe_dimension,
415+
glwe_dimension, polynomial_size, grouping_factor,
416+
level_count, lwe_offset);
417+
};
397418

398419
// Accumulate
399-
execute_cg_external_product_loop<Torus, params>(
400-
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
401-
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
402-
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
403-
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
404-
lut_stride);
420+
ctx.task(key_token.read(), buffer_token.rw(), result_token.write())
421+
.set_symbol("accumulate")
422+
->*
423+
[&](cudaStream_t stf_stream) {
424+
execute_cg_external_product_loop<Torus, params>(
425+
stf_stream, gpu_index, lut_vector, lut_vector_indexes,
426+
lwe_array_in, lwe_input_indexes, lwe_array_out,
427+
lwe_output_indexes, buffer, num_samples, lwe_dimension,
428+
glwe_dimension, polynomial_size, grouping_factor, base_log,
429+
level_count, lwe_offset, num_many_lut, lut_stride);
430+
};
405431
}
432+
ctx.finalize();
406433
}
407434

408435
// Verify if the grid size satisfies the cooperative group constraints

backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh

+57-30
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,11 @@
1616
#include "polynomial/polynomial_math.cuh"
1717
#include "programmable_bootstrap_cg_classic.cuh"
1818
#include "types/complex/operations.cuh"
19+
#include <cuda/experimental/stf.cuh>
1920
#include <vector>
2021

22+
namespace cudastf = cuda::experimental::stf;
23+
2124
template <typename Torus, class params>
2225
__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
2326
uint32_t ggsw_idx,
@@ -683,46 +686,70 @@ __host__ void host_multi_bit_programmable_bootstrap(
683686
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
684687
uint32_t num_many_lut, uint32_t lut_stride) {
685688

689+
// Generate a CUDA graph if the USE_CUDA_GRAPH environment variable is set to a non-zero value
690+
const char *use_graph_env = getenv("USE_CUDA_GRAPH");
691+
692+
cudastf::context ctx(stream);
693+
if (use_graph_env && atoi(use_graph_env) != 0) {
694+
ctx = cudastf::graph_ctx(stream);
695+
}
696+
697+
auto buffer_token = ctx.logical_token();
686698
auto lwe_chunk_size = buffer->lwe_chunk_size;
687699

688700
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
689701
lwe_offset += lwe_chunk_size) {
690702

703+
auto key_token = ctx.logical_token();
704+
auto result_token = ctx.logical_token();
705+
691706
// Compute a keybundle
692-
execute_compute_keybundle<Torus, params>(
693-
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
694-
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
695-
grouping_factor, level_count, lwe_offset);
707+
ctx.task(key_token.write(), buffer_token.write())
708+
.set_symbol("compute_keybundle")
709+
->*[&](cudaStream_t stf_stream) {
710+
execute_compute_keybundle<Torus, params>(
711+
stf_stream, gpu_index, lwe_array_in, lwe_input_indexes,
712+
bootstrapping_key, buffer, num_samples, lwe_dimension,
713+
glwe_dimension, polynomial_size, grouping_factor,
714+
level_count, lwe_offset);
715+
};
696716
// Accumulate
697717
uint32_t chunk_size = std::min(
698718
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
699719
for (uint32_t j = 0; j < chunk_size; j++) {
700-
bool is_first_iter = (j + lwe_offset) == 0;
701-
bool is_last_iter =
702-
(j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
703-
if (is_first_iter) {
704-
execute_step_one<Torus, params, true>(
705-
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
706-
lwe_input_indexes, buffer, num_samples, lwe_dimension,
707-
glwe_dimension, polynomial_size, base_log, level_count);
708-
} else {
709-
execute_step_one<Torus, params, false>(
710-
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
711-
lwe_input_indexes, buffer, num_samples, lwe_dimension,
712-
glwe_dimension, polynomial_size, base_log, level_count);
713-
}
714-
715-
if (is_last_iter) {
716-
execute_step_two<Torus, params, true>(
717-
stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
718-
num_samples, glwe_dimension, polynomial_size, level_count, j,
719-
num_many_lut, lut_stride);
720-
} else {
721-
execute_step_two<Torus, params, false>(
722-
stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
723-
num_samples, glwe_dimension, polynomial_size, level_count, j,
724-
num_many_lut, lut_stride);
725-
}
720+
ctx.task(key_token.read(), buffer_token.rw(), result_token.rw())
721+
.set_symbol("step_one_two")
722+
->*
723+
[&](cudaStream_t stf_stream) {
724+
bool is_first_iter = (j + lwe_offset) == 0;
725+
bool is_last_iter =
726+
(j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
727+
if (is_first_iter) {
728+
execute_step_one<Torus, params, true>(
729+
stf_stream, gpu_index, lut_vector, lut_vector_indexes,
730+
lwe_array_in, lwe_input_indexes, buffer, num_samples,
731+
lwe_dimension, glwe_dimension, polynomial_size, base_log,
732+
level_count);
733+
} else {
734+
execute_step_one<Torus, params, false>(
735+
stf_stream, gpu_index, lut_vector, lut_vector_indexes,
736+
lwe_array_in, lwe_input_indexes, buffer, num_samples,
737+
lwe_dimension, glwe_dimension, polynomial_size, base_log,
738+
level_count);
739+
}
740+
741+
if (is_last_iter) {
742+
execute_step_two<Torus, params, true>(
743+
stf_stream, gpu_index, lwe_array_out, lwe_output_indexes,
744+
buffer, num_samples, glwe_dimension, polynomial_size,
745+
level_count, j, num_many_lut, lut_stride);
746+
} else {
747+
execute_step_two<Torus, params, false>(
748+
stf_stream, gpu_index, lwe_array_out, lwe_output_indexes,
749+
buffer, num_samples, glwe_dimension, polynomial_size,
750+
level_count, j, num_many_lut, lut_stride);
751+
}
752+
};
726753
}
727754
}
728755
}

backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh

+34-10
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,11 @@
1616
#include "polynomial/polynomial_math.cuh"
1717
#include "programmable_bootstrap.cuh"
1818
#include "types/complex/operations.cuh"
19+
#include <cuda/experimental/stf.cuh>
1920
#include <vector>
2021

22+
namespace cudastf = cuda::experimental::stf;
23+
2124
template <typename Torus, class params, sharedMemDegree SMD>
2225
__global__ void __launch_bounds__(params::degree / params::opt)
2326
device_multi_bit_programmable_bootstrap_tbc_accumulate(
@@ -404,23 +407,44 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
404407
uint32_t num_many_lut, uint32_t lut_stride) {
405408
cuda_set_device(gpu_index);
406409

410+
// Generate a CUDA graph if the USE_CUDA_GRAPH environment variable is set to a non-zero value
411+
const char *use_graph_env = getenv("USE_CUDA_GRAPH");
412+
413+
cudastf::context ctx(stream);
414+
if (use_graph_env && atoi(use_graph_env) != 0) {
415+
ctx = cudastf::graph_ctx(stream);
416+
}
417+
407418
auto lwe_chunk_size = buffer->lwe_chunk_size;
419+
auto buffer_token = ctx.logical_token();
408420
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
409421
lwe_offset += lwe_chunk_size) {
410422

423+
auto key_token = ctx.logical_token();
424+
auto result_token = ctx.logical_token();
411425
// Compute a keybundle
412-
execute_compute_keybundle<Torus, params>(
413-
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
414-
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
415-
grouping_factor, level_count, lwe_offset);
426+
ctx.task(key_token.write(), buffer_token.write())
427+
.set_symbol("compute_keybundle")
428+
->*[&](cudaStream_t stf_stream) {
429+
execute_compute_keybundle<Torus, params>(
430+
stf_stream, gpu_index, lwe_array_in, lwe_input_indexes,
431+
bootstrapping_key, buffer, num_samples, lwe_dimension,
432+
glwe_dimension, polynomial_size, grouping_factor,
433+
level_count, lwe_offset);
434+
};
416435

417436
// Accumulate
418-
execute_tbc_external_product_loop<Torus, params>(
419-
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
420-
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
421-
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
422-
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
423-
lut_stride);
437+
ctx.task(key_token.read(), buffer_token.rw(), result_token.write())
438+
.set_symbol("accumulate")
439+
->*
440+
[&](cudaStream_t stf_stream) {
441+
execute_tbc_external_product_loop<Torus, params>(
442+
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
443+
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
444+
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
445+
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
446+
lut_stride);
447+
};
424448
}
425449
}
426450

0 commit comments

Comments
 (0)