From d6d4b67de1b3cd5c437df896083f928c37819fbf Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 26 Jun 2026 10:41:58 +0200 Subject: [PATCH 1/5] Add builds for MSVC cccl_c_parallel --- ci/matrix.yaml | 3 ++- ci/windows/build_cccl_c_parallel_v2.ps1 | 28 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 ci/windows/build_cccl_c_parallel_v2.ps1 diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 3653930095e..962463d8dc5 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -22,6 +22,7 @@ workflows: # # IMPORTANT: Do NOT delete or remove the `override:` key below, even when it is empty. override: + - {jobs: ['test'], project: 'cccl_c_parallel_v2', ctk: '13.X', cxx: ['gcc13', 'msvc'], gpu: 'rtx2080'} pull_request: # Old CTK: Oldest/newest supported host compilers: @@ -79,7 +80,7 @@ workflows: # Eventually v2 will replace v1 as the default and run across the # entire matrix. Currently blocked on libnvfatbin availability on # Windows containers, and for CUDA <12.4. - - {jobs: ['test'], project: 'cccl_c_parallel_v2', ctk: '13.X', cxx: ['gcc13'], gpu: 'rtx2080'} + - {jobs: ['test'], project: 'cccl_c_parallel_v2', ctk: '13.X', cxx: ['gcc13', 'msvc'], gpu: 'rtx2080'} # Python against c.parallel v2 (HostJIT-based). Single point of coverage # for the v2 Python path; the main `python` matrix continues to test # against v1 until v2 replaces it. diff --git a/ci/windows/build_cccl_c_parallel_v2.ps1 b/ci/windows/build_cccl_c_parallel_v2.ps1 new file mode 100644 index 00000000000..529a37b113c --- /dev/null +++ b/ci/windows/build_cccl_c_parallel_v2.ps1 @@ -0,0 +1,28 @@ +Param( + [Parameter(Mandatory = $false)] + [Alias("arch")] + [string]$CUDA_ARCH = "", + [Parameter(Mandatory = $false)] + [Alias("cmake-options")] + [string]$CMAKE_OPTIONS = "" +) + +$ErrorActionPreference = "Stop" + +$CURRENT_PATH = Split-Path $pwd -leaf +If($CURRENT_PATH -ne "ci") { + Write-Host "Moving to ci folder" + pushd "$PSScriptRoot/.." +} + +Remove-Module -Name build_common -ErrorAction SilentlyContinue +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList @(20, $CUDA_ARCH, $CMAKE_OPTIONS) + +$PRESET = "cccl-c-parallel-v2" +$LOCAL_CMAKE_OPTIONS = "" + +configure_and_build_preset "CCCL C Parallel" $PRESET $LOCAL_CMAKE_OPTIONS + +If($CURRENT_PATH -ne "ci") { + popd +} From 347ce7b3ea10e34534dd4869b9291294028fccd9 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Mon, 29 Jun 2026 18:32:47 +0200 Subject: [PATCH 2/5] Disable SEH exceptions for catch2 --- c/parallel.v2/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/c/parallel.v2/CMakeLists.txt b/c/parallel.v2/CMakeLists.txt index b82f51e1204..a7db647bd20 100644 --- a/c/parallel.v2/CMakeLists.txt +++ b/c/parallel.v2/CMakeLists.txt @@ -90,6 +90,7 @@ if (WIN32) cccl.c.parallel.v2 PRIVATE $<$:-Xcompiler=/wd4459> ) + target_compile_definitions(cccl.c.parallel.v2 PRIVATE CATCH_CONFIG_NO_WINDOWS_SEH) endif() target_compile_definitions( From 6ce607815353c5fdbf719064a2334896e7cfe446 Mon Sep 17 00:00:00 2001 From: romanso <3954220+romanso@users.noreply.github.com> Date: Tue, 30 Jun 2026 16:40:43 +0200 Subject: [PATCH 3/5] [c.parallel.v2] Run host-JIT Clang frontend on an 8 MB stack (fix Windows stack overflow) The host JIT invokes clang::CompilerInstance::ExecuteAction directly, which bypasses the clang driver's runWithSufficientStackSpace() guard. On Windows the default main-thread stack is 1 MB, so Clang's deep frontend recursion (recursive-descent parsing / template instantiation) overflows the stack while compiling heavier kernels such as radix_sort and segmented_reduce. On Linux the 8 MB default stack hides it. Run each ExecuteAction on a worker thread whose stack matches Clang's own DesiredStackSize (8 MB) -- the value already proven sufficient on Linux. --- c/parallel.v2/src/hostjit/compiler.cpp | 27 ++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/c/parallel.v2/src/hostjit/compiler.cpp b/c/parallel.v2/src/hostjit/compiler.cpp index f5697672f8b..19e4fba09ed 100644 --- a/c/parallel.v2/src/hostjit/compiler.cpp +++ b/c/parallel.v2/src/hostjit/compiler.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -85,6 +86,24 @@ static void initialize_llvm() llvm_initialized = true; } +// Embedding clang as a library bypasses the clang driver's +// runWithSufficientStackSpace guard, so the frontend runs on the caller's stack. +// On Windows the default main-thread stack is only 1 MB, which the deep +// (recursive-descent / template-instantiation) frontend overflows on heavier +// kernels such as radix_sort / segmented_reduce; Linux's 8 MB default hides it. +// Run the frontend on a worker thread sized to match clang's own +// DesiredStackSize (8 MB), which is the proven-sufficient value on Linux. +inline constexpr unsigned kFrontendStackSize = 8u << 20; + +template +static bool runWithLargeStack(Fn&& fn) +{ + bool result = false; + llvm::thread worker(std::optional(kFrontendStackSize), [&] { result = fn(); }); + worker.join(); + return result; +} + #ifdef _WIN32 // Generate a minimal COFF import library for a given DLL. // This allows linking without requiring the Windows SDK or MSVC .lib files. @@ -240,7 +259,7 @@ class CUDACompiler::Impl compiler.getFrontendOpts().OutputFile = pch_output_path; clang::GeneratePCHAction pch_action; - bool success = compiler.ExecuteAction(pch_action); + bool success = runWithLargeStack([&] { return compiler.ExecuteAction(pch_action); }); diag_stream.flush(); diagnostics += diag_output; @@ -480,7 +499,7 @@ class CUDACompiler::Impl llvm::LLVMContext llvm_context; clang::EmitLLVMOnlyAction emit_llvm_action(&llvm_context); - bool success = compiler.ExecuteAction(emit_llvm_action); + bool success = runWithLargeStack([&] { return compiler.ExecuteAction(emit_llvm_action); }); if (config.trace_includes && compiler.hasSourceManager()) { @@ -858,7 +877,7 @@ class CUDACompiler::Impl llvm::LLVMContext llvm_context; clang::EmitLLVMOnlyAction emit_llvm_action(&llvm_context); - bool success = compiler.ExecuteAction(emit_llvm_action); + bool success = runWithLargeStack([&] { return compiler.ExecuteAction(emit_llvm_action); }); if (success) { @@ -1085,7 +1104,7 @@ class CUDACompiler::Impl } clang::EmitObjAction emit_action; - bool success = compiler.ExecuteAction(emit_action); + bool success = runWithLargeStack([&] { return compiler.ExecuteAction(emit_action); }); if (config.trace_includes && compiler.hasSourceManager()) { From c5d167d161f7edf57d09bfd11b0b4ef3b65aa920 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 30 Jun 2026 19:02:44 +0200 Subject: [PATCH 4/5] Drop override --- ci/matrix.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 962463d8dc5..7ce64b079b8 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -22,7 +22,6 @@ workflows: # # IMPORTANT: Do NOT delete or remove the `override:` key below, even when it is empty. override: - - {jobs: ['test'], project: 'cccl_c_parallel_v2', ctk: '13.X', cxx: ['gcc13', 'msvc'], gpu: 'rtx2080'} pull_request: # Old CTK: Oldest/newest supported host compilers: From 1e302f5ba63216e4eb4a63654725ac564b5740fa Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 30 Jun 2026 19:32:32 +0200 Subject: [PATCH 5/5] Fix formatting --- c/parallel.v2/CMakeLists.txt | 5 ++++- c/parallel.v2/src/hostjit/compiler.cpp | 22 ++++++++++++++++------ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/c/parallel.v2/CMakeLists.txt b/c/parallel.v2/CMakeLists.txt index a7db647bd20..52004ebdc58 100644 --- a/c/parallel.v2/CMakeLists.txt +++ b/c/parallel.v2/CMakeLists.txt @@ -90,7 +90,10 @@ if (WIN32) cccl.c.parallel.v2 PRIVATE $<$:-Xcompiler=/wd4459> ) - target_compile_definitions(cccl.c.parallel.v2 PRIVATE CATCH_CONFIG_NO_WINDOWS_SEH) + target_compile_definitions( + cccl.c.parallel.v2 + PRIVATE CATCH_CONFIG_NO_WINDOWS_SEH + ) endif() target_compile_definitions( diff --git a/c/parallel.v2/src/hostjit/compiler.cpp b/c/parallel.v2/src/hostjit/compiler.cpp index 19e4fba09ed..1585dd5973c 100644 --- a/c/parallel.v2/src/hostjit/compiler.cpp +++ b/c/parallel.v2/src/hostjit/compiler.cpp @@ -24,8 +24,8 @@ #include #include #include -#include #include +#include #include #include @@ -99,7 +99,9 @@ template static bool runWithLargeStack(Fn&& fn) { bool result = false; - llvm::thread worker(std::optional(kFrontendStackSize), [&] { result = fn(); }); + llvm::thread worker(std::optional(kFrontendStackSize), [&] { + result = fn(); + }); worker.join(); return result; } @@ -259,7 +261,9 @@ class CUDACompiler::Impl compiler.getFrontendOpts().OutputFile = pch_output_path; clang::GeneratePCHAction pch_action; - bool success = runWithLargeStack([&] { return compiler.ExecuteAction(pch_action); }); + bool success = runWithLargeStack([&] { + return compiler.ExecuteAction(pch_action); + }); diag_stream.flush(); diagnostics += diag_output; @@ -499,7 +503,9 @@ class CUDACompiler::Impl llvm::LLVMContext llvm_context; clang::EmitLLVMOnlyAction emit_llvm_action(&llvm_context); - bool success = runWithLargeStack([&] { return compiler.ExecuteAction(emit_llvm_action); }); + bool success = runWithLargeStack([&] { + return compiler.ExecuteAction(emit_llvm_action); + }); if (config.trace_includes && compiler.hasSourceManager()) { @@ -877,7 +883,9 @@ class CUDACompiler::Impl llvm::LLVMContext llvm_context; clang::EmitLLVMOnlyAction emit_llvm_action(&llvm_context); - bool success = runWithLargeStack([&] { return compiler.ExecuteAction(emit_llvm_action); }); + bool success = runWithLargeStack([&] { + return compiler.ExecuteAction(emit_llvm_action); + }); if (success) { @@ -1104,7 +1112,9 @@ class CUDACompiler::Impl } clang::EmitObjAction emit_action; - bool success = runWithLargeStack([&] { return compiler.ExecuteAction(emit_action); }); + bool success = runWithLargeStack([&] { + return compiler.ExecuteAction(emit_action); + }); if (config.trace_includes && compiler.hasSourceManager()) {