diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh
index 5616a9019c..e66b4a7989 100644
--- a/cpp/include/raft/core/bitset.cuh
+++ b/cpp/include/raft/core/bitset.cuh
@@ -10,6 +10,7 @@
 #include <raft/core/device_container_policy.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/operators.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/map.cuh>
 #include <raft/linalg/reduce.cuh>
@@ -166,6 +167,8 @@ void bitset_view<bitset_t, index_t>::repeat(const raft::resources& res,
                                             index_t times,
                                             bitset_t* output_device_ptr) const
 {
+  // Only a copy and kernel run below this point.
+  if (resource::get_dry_run_flag(res)) { return; }
   constexpr index_t bits_per_element = sizeof(bitset_t) * 8;
 
   if (bitset_len_ % bits_per_element == 0) {
diff --git a/cpp/include/raft/core/bitset.hpp b/cpp/include/raft/core/bitset.hpp
index fe47557ce4..3a8a363c62 100644
--- a/cpp/include/raft/core/bitset.hpp
+++ b/cpp/include/raft/core/bitset.hpp
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_container_policy.hpp>
 #include <raft/core/device_mdarray.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/util/integer_utils.hpp>
@@ -133,9 +134,11 @@ struct bitset_view {
     auto count_gpu_scalar = raft::make_device_scalar<index_t>(res, 0.0);
     count(res, count_gpu_scalar.view());
     index_t count_cpu = 0;
-    raft::update_host(
-      &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res));
-    resource::sync_stream(res);
+    if (!resource::get_dry_run_flag(res)) {
+      raft::update_host(
+        &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res));
+      resource::sync_stream(res);
+    }
     return count_cpu;
   }
 
@@ -408,9 +411,11 @@ struct bitset {
     auto count_gpu_scalar = raft::make_device_scalar<index_t>(res, 0.0);
     count(res, count_gpu_scalar.view());
     index_t count_cpu = 0;
-    raft::update_host(
-      &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res));
-    resource::sync_stream(res);
+    if (!resource::get_dry_run_flag(res)) {
+      raft::update_host(
+        &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res));
+      resource::sync_stream(res);
+    }
     return count_cpu;
   }
   /**
diff --git a/cpp/include/raft/core/coo_matrix.hpp b/cpp/include/raft/core/coo_matrix.hpp
index 45bf3d3d54..f201b27afe 100644
--- a/cpp/include/raft/core/coo_matrix.hpp
+++ b/cpp/include/raft/core/coo_matrix.hpp
@@ -180,8 +180,8 @@ class coordinate_structure : public coordinate_structure_t<RowType, ColType, NZT
   void initialize_sparsity(nnz_type nnz)
   {
     sparse_structure_type::initialize_sparsity(nnz);
-    c_rows_.resize(nnz);
-    c_cols_.resize(nnz);
+    c_rows_.reallocate(nnz);  // NOTE(review): old contents are presumably not preserved
+    c_cols_.reallocate(nnz);
   }
 
  protected:
diff --git a/cpp/include/raft/core/csr_matrix.hpp b/cpp/include/raft/core/csr_matrix.hpp
index 0cc177dab0..d6892e139b 100644
--- a/cpp/include/raft/core/csr_matrix.hpp
+++ b/cpp/include/raft/core/csr_matrix.hpp
@@ -189,8 +189,8 @@ class compressed_structure
   void initialize_sparsity(NZType nnz) override
   {
     sparse_structure_type::initialize_sparsity(nnz);
-    c_indptr_.resize(this->get_n_rows() + 1);
-    c_indices_.resize(nnz);
+    c_indptr_.reallocate(this->get_n_rows() + 1);  // one entry per row plus one
+    c_indices_.reallocate(nnz);  // NOTE(review): old contents are presumably not preserved
   }
 
  protected:
diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp
index 785665a99a..354d619411 100644
--- a/cpp/include/raft/core/detail/copy.hpp
+++ b/cpp/include/raft/core/detail/copy.hpp
@@ -11,6 +11,7 @@
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/mdspan.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resource/stream_view.hpp>
 #include <raft/core/resources.hpp>
 
@@ -399,6 +400,10 @@ mdspan_copyable_t<DstType, SrcType> copy(resources const& res, DstType&& dst, Sr
     RAFT_EXPECTS(src.extent(i) == dst.extent(i), "Must copy between mdspans of the same shape");
   }
 
+  // Dry-run guard: raft::copy is a pure data-movement utility with no
+  // allocations that callers would need tracked.
+  if (resource::get_dry_run_flag(res)) { return; }
+
   if constexpr (config::use_intermediate_src) {
 #ifndef RAFT_DISABLE_CUDA
     // Copy to intermediate source on device, then perform necessary
diff --git a/cpp/include/raft/core/device_container_policy.hpp b/cpp/include/raft/core/device_container_policy.hpp
index 30233b69e6..acabec54ff 100644
--- a/cpp/include/raft/core/device_container_policy.hpp
+++ b/cpp/include/raft/core/device_container_policy.hpp
@@ -127,6 +127,29 @@ class device_uvector {
 
   void resize(size_type size) { data_.resize(size, data_.stream()); }
 
+  /**
+   * @brief Replace the internal buffer with a fresh one of `size` elements, discarding contents.
+   *
+   * The old buffer is released before the new one is allocated, so (unlike resize()) no data is
+   * copied and the two buffers never coexist, which lowers peak memory usage.
+   *
+   * The old buffer is also released when the new size is smaller, so memory is returned
+   * promptly. Requesting the current size is a no-op and keeps the existing contents.
+   * @param size new number of elements
+   */
+  void reallocate(size_type size)
+  {
+    if (size == data_.size()) { return; }
+    auto stream = data_.stream();
+    auto mr     = data_.memory_resource();
+    // Release the old buffer first: shrink to zero elements, then trim the capacity.
+    // Nothing is copied because no element survives the resize.
+    data_.resize(0, stream);
+    data_.shrink_to_fit(stream);
+    // Only now create the replacement, reusing the original stream and memory resource.
+    data_ = rmm::device_uvector<T>(size, stream, mr);
+  }
+
   [[nodiscard]] auto data() noexcept -> pointer { return data_.data(); }
   [[nodiscard]] auto data() const noexcept -> const_pointer { return data_.data(); }
 };
diff --git a/cpp/include/raft/core/device_mdarray.hpp b/cpp/include/raft/core/device_mdarray.hpp
index f7f564283c..28bae1ce1f 100644
--- a/cpp/include/raft/core/device_mdarray.hpp
+++ b/cpp/include/raft/core/device_mdarray.hpp
@@ -9,6 +9,7 @@
 #include <raft/core/device_container_policy.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/mdarray.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 #include <rmm/resource_ref.hpp>
@@ -164,7 +165,7 @@ auto make_device_scalar(raft::resources const& handle, ElementType const& v)
   using policy_t = typename device_scalar<ElementType, IndexType>::container_policy_type;
   policy_t policy{};
   auto scalar = device_scalar<ElementType, IndexType>{handle, extents, policy};
-  scalar(0)   = v;
+  if (!resource::get_dry_run_flag(handle)) { scalar(0) = v; }
   return scalar;
 }
 
diff --git a/cpp/include/raft/core/dry_run_resources.hpp b/cpp/include/raft/core/dry_run_resources.hpp
new file mode 100644
index 0000000000..50e06973e5
--- /dev/null
+++ b/cpp/include/raft/core/dry_run_resources.hpp
@@ -0,0 +1,253 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <raft/core/memory_stats_resources.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
+#include <raft/core/resource/managed_memory_resource.hpp>
+#include <raft/core/resource/pinned_memory_resource.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/mr/dry_run_resource.hpp>
+#include <raft/mr/host_device_resource.hpp>
+#include <raft/mr/host_memory_resource.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
+
+#include <cuda/stream_ref>
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+namespace raft {
+
+/**
+ * @defgroup dry_run_memory Dry-run memory resources
+ * @{
+ */
+
+/**
+ * @brief Resources handle that wraps all reachable memory resources with
+ *        dry-run adaptors and tracks peak allocation usage.
+ *
+ * Inherits from raft::resources, so it can be passed anywhere a
+ * raft::resources& is expected.  On construction the handle:
+ *   - If dry-run mode is already active, does nothing (no-op).
+ *   - Materializes all tracked resource types (host, device, pinned,
+ *     managed, workspace, large_workspace).
+ *   - Takes a snapshot of the original resources to keep them alive.
+ *   - Wraps each with dry_run_resource.
+ *   - Replaces global host and device resources with dry-run versions.
+ *   - Sets the dry-run flag.
+ *
+ * On destruction the handle resets the flag and restores global resources.
+ * Composable with memory_tracking_resources in either order.
+ */
+class dry_run_resources : public resources {
+ public:
+  explicit dry_run_resources(const resources& existing)
+    : resources(existing),
+      active_(!resource::get_dry_run_flag(existing)),  // inactive if already in dry-run mode
+      old_host_(raft::mr::get_default_host_resource()),
+      old_device_(rmm::mr::get_current_device_resource_ref())
+  {
+    if (active_) init();  // NOTE(review): a throw in init() skips the global restore
+  }
+
+  ~dry_run_resources() override
+  {
+    if (!active_) return;  // inactive handle installed nothing, so nothing to undo
+    resource::set_dry_run_flag(*this, false);
+    raft::mr::set_default_host_resource(old_host_);
+    rmm::mr::set_current_device_resource(old_device_);
+
+    // Drop all base-class entries so that probe container RAII cleanup runs
+    // while old_device_ and snapshot_ are still alive
+    resources_.clear();
+    factories_.clear();
+  }
+
+  dry_run_resources(dry_run_resources const&)            = delete;
+  dry_run_resources& operator=(dry_run_resources const&) = delete;
+  dry_run_resources(dry_run_resources&&)                 = delete;
+  dry_run_resources& operator=(dry_run_resources&&)      = delete;
+
+  [[nodiscard]] auto get_bytes_peak() const -> memory_stats
+  {
+    if (!active_) return {};  // inactive handle: report empty stats
+    return {
+      .device_workspace       = ws_stats_->get_peak_bytes(),
+      .device_large_workspace = lws_stats_->get_peak_bytes(),
+      .device_global          = device_stats_->get_peak_bytes(),
+      .device_managed         = managed_stats_->get_peak_bytes(),
+      .host                   = host_stats_->get_peak_bytes(),
+      .host_pinned            = pinned_stats_->get_peak_bytes(),
+    };
+  }
+
+  [[nodiscard]] auto get_bytes_current() const -> memory_stats
+  {
+    if (!active_) return {};  // inactive handle: report empty stats
+    return {
+      .device_workspace       = ws_stats_->get_allocated_bytes(),
+      .device_large_workspace = lws_stats_->get_allocated_bytes(),
+      .device_global          = device_stats_->get_allocated_bytes(),
+      .device_managed         = managed_stats_->get_allocated_bytes(),
+      .host                   = host_stats_->get_allocated_bytes(),
+      .host_pinned            = pinned_stats_->get_allocated_bytes(),
+    };
+  }
+
+ private:
+  // Declaration order determines destruction order.
+  // snapshot_ is destroyed last (keeps original resource shared_ptrs alive
+  // while dry-run adaptors hold non-owning refs into them).
+  // old_device_ is destroyed after device_adaptor_ so the probe can
+  // deallocate through it during device_adaptor_ destruction.
+  std::vector<pair_resource> snapshot_;  // NOTE(review): <vector> is not included directly here
+
+  bool active_;
+  raft::mr::host_resource old_host_;  // captured at construction, restored in the destructor
+  raft::mr::device_resource old_device_;  // captured at construction, restored in the destructor
+
+  using host_dry_run_t   = raft::mr::dry_run_resource<raft::mr::host_resource_ref>;
+  using device_dry_run_t = raft::mr::dry_run_resource<rmm::device_async_resource_ref>;
+  std::unique_ptr<host_dry_run_t> host_adaptor_;
+  std::unique_ptr<device_dry_run_t> device_adaptor_;
+
+  using counter_t = raft::mr::detail::dry_run_memory_counter;
+  std::shared_ptr<counter_t> host_stats_;
+  std::shared_ptr<counter_t> pinned_stats_;
+  std::shared_ptr<counter_t> managed_stats_;
+  std::shared_ptr<counter_t> ws_stats_;
+  std::shared_ptr<counter_t> lws_stats_;
+  std::shared_ptr<counter_t> device_stats_;
+
+  void init()
+  {
+    // Independent-counting invariant
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // 1. Force-initialize all lazily-created resources (workspace, large workspace,
+    //    pinned, managed) so that their factories resolve against the *original*
+    //    global device MR, not a tracking wrapper we install later.
+    // 2. Capture every upstream ref while it still points to the original resource.
+    // 3. Snapshot the resource map to keep the originals alive.
+    // 4. Only *then* replace the global device resource with the tracking bridge.
+    // 5. Wrap each captured upstream with a separate dry_run_resource adaptor.
+    //
+    // Because step 2 happens before step 4, workspace/lws allocations flow through
+    // their own adaptor directly to the original device MR, bypassing the device adaptor.
+    // Each allocation is therefore counted in exactly one category, and
+    // memory_stats::total() returns an accurate, non-overlapping sum.
+    auto* ws         = resource::get_workspace_resource(*this);
+    auto ws_free     = resource::get_workspace_free_bytes(*this);
+    auto ws_upstream = ws->get_upstream_resource();
+    auto lws_ref     = resource::get_large_workspace_resource_ref(*this);
+    auto pinned_ref  = resource::get_pinned_memory_resource_ref(*this);
+    auto managed_ref = resource::get_managed_memory_resource_ref(*this);
+
+    // Snapshot keeps original resource objects alive while dry-run
+    // adaptors hold non-owning refs into them.
+    snapshot_ = resources_;
+
+    // --- Host (global) ---
+    {
+      host_adaptor_ = std::make_unique<host_dry_run_t>(raft::mr::host_resource_ref{old_host_});
+      host_stats_   = host_adaptor_->get_counter();
+      mr::set_default_host_resource(mr::host_resource_ref{*host_adaptor_});
+    }
+
+    // --- Pinned ---
+    {
+      mr::dry_run_resource<mr::host_device_resource_ref> dr{pinned_ref};
+      pinned_stats_ = dr.get_counter();
+      resource::set_pinned_memory_resource(*this, std::move(dr));
+    }
+
+    // --- Managed ---
+    {
+      mr::dry_run_resource<mr::host_device_resource_ref> dr{managed_ref};
+      managed_stats_ = dr.get_counter();
+      resource::set_managed_memory_resource(*this, std::move(dr));
+    }
+
+    // --- Device (global) ---
+    // Invalidate the cached thrust policy (the resource_ref it captured
+    // will be stale once we replace the global device resource).
+    factories_.at(resource::resource_type::THRUST_POLICY) = std::make_pair(
+      resource::resource_type::LAST_KEY, std::make_shared<resource::empty_resource_factory>());
+    resources_.at(resource::resource_type::THRUST_POLICY) = std::make_pair(
+      resource::resource_type::LAST_KEY, std::make_shared<resource::empty_resource>());
+    {
+      device_dry_run_t dr{rmm::device_async_resource_ref{old_device_}};
+      device_stats_   = dr.get_counter();
+      device_adaptor_ = std::make_unique<device_dry_run_t>(std::move(dr));
+      rmm::mr::set_current_device_resource(*device_adaptor_);
+    }
+
+    // --- Workspace ---
+    {
+      mr::dry_run_resource<rmm::device_async_resource_ref> dr{ws_upstream};
+      ws_stats_ = dr.get_counter();
+      resource::set_workspace_resource(*this, std::move(dr), ws_free);
+    }
+
+    // --- Large workspace ---
+    {
+      mr::dry_run_resource<rmm::device_async_resource_ref> dr{lws_ref};
+      lws_stats_ = dr.get_counter();
+      resource::set_large_workspace_resource(*this, std::move(dr));
+    }
+
+    resource::set_dry_run_flag(*this, true);  // enabled only after every adaptor is installed
+  }
+};
+
+/** @} */
+
+}  // namespace raft
+
+namespace raft::util {
+
+/**
+ * @brief Execute an action in dry-run mode and return peak memory usage.
+ *
+ * Creates an independent copy of the resources handle with all memory resources
+ * replaced by dry-run versions, executes the action, and returns peak usage stats.
+ *
+ * The action receives the dry-run resources handle (as const raft::resources&)
+ * and can check the dry-run flag via raft::resource::get_dry_run_flag(res) to
+ * skip kernel execution.
+ *
+ * @tparam Action A callable with signature void(const raft::resources&, Args...).
+ * @tparam Args Additional argument types to forward to the action.
+ * @param res The raft resources handle.
+ * @param action The action to execute in dry-run mode.
+ * @param args Additional arguments to forward to the action.
+ * @return memory_stats with peak memory usage from the dry run.
+ *
+ * @code{.cpp}
+ * raft::resources res;
+ * auto stats = raft::util::dry_run_execute(res, [](const raft::resources& r) {
+ *   my_algorithm(r);
+ * });
+ * std::cout << "Peak workspace: " << stats.device_workspace << " bytes\n";
+ * @endcode
+ */
+template <typename Action, typename... Args>
+auto dry_run_execute(const raft::resources& res, Action&& action, Args&&... args)
+  -> raft::memory_stats
+{
+  raft::dry_run_resources probe{res};  // restores the previous state when it goes out of scope
+  const raft::resources& probe_view = probe;  // the action only sees the base-class interface
+  std::forward<Action>(action)(probe_view, std::forward<Args>(args)...);
+  return probe.get_bytes_peak();
+}
+
+}  // namespace raft::util
diff --git a/cpp/include/raft/core/host_container_policy.hpp b/cpp/include/raft/core/host_container_policy.hpp
index 6839431945..296b4d1710 100644
--- a/cpp/include/raft/core/host_container_policy.hpp
+++ b/cpp/include/raft/core/host_container_policy.hpp
@@ -105,6 +105,27 @@ requires cuda::mr::synchronous_resource_with<MR, cuda::mr::host_accessible>
     *this = std::move(new_container);
   }
 
+  /**
+   * @brief Resize the internal buffer without copying old data.
+   *
+   * Unlike resize(), this never copies old data.
+   * Thus, unlike in resize(), there's no point in time where the old and the new buffers are both
+   * alive, and the peak memory usage is lower.
+   *
+   * Unlike resize(), this deallocates the old buffer even if the new size is smaller.
+   * This ensures the memory is released promptly.
+   */
+  void reallocate(size_type count)
+  {
+    if (bytesize_ == sizeof(value_type) * count) { return; }  // same size: keep the buffer
+    if (data_ != nullptr) {
+      mr_.deallocate_sync(data_, bytesize_);
+      data_     = nullptr;
+      bytesize_ = 0;  // keep size and pointer consistent if the allocation below throws
+    }
+    *this = host_container{count, mr_};  // same move-assign path as resize()
+  }
+
   [[nodiscard]] auto data() noexcept -> pointer { return data_; }
   [[nodiscard]] auto data() const noexcept -> const_pointer { return data_; }
 };
diff --git a/cpp/include/raft/core/host_mdarray.hpp b/cpp/include/raft/core/host_mdarray.hpp
index 712170b00e..09857cd2c1 100644
--- a/cpp/include/raft/core/host_mdarray.hpp
+++ b/cpp/include/raft/core/host_mdarray.hpp
@@ -9,6 +9,7 @@
 #include <raft/core/host_container_policy.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/mdarray.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 #include <cstdint>
@@ -224,7 +225,7 @@ auto make_host_scalar(raft::resources const& res, ElementType const& v)
   using policy_t = typename host_scalar<ElementType, IndexType>::container_policy_type;
   policy_t policy;
   auto scalar = host_scalar<ElementType, IndexType>{res, extents, policy};
-  scalar(0)   = v;
+  if (!resource::get_dry_run_flag(res)) { scalar(0) = v; }
   return scalar;
 }
 
diff --git a/cpp/include/raft/core/managed_mdarray.hpp b/cpp/include/raft/core/managed_mdarray.hpp
index d6084a69ad..57e9eaf7bb 100644
--- a/cpp/include/raft/core/managed_mdarray.hpp
+++ b/cpp/include/raft/core/managed_mdarray.hpp
@@ -9,6 +9,7 @@
 #include <raft/core/managed_container_policy.hpp>
 #include <raft/core/managed_mdspan.hpp>
 #include <raft/core/mdarray.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 #include <cstdint>
@@ -118,7 +119,7 @@ auto make_managed_scalar(raft::resources const& handle, ElementType const& v)
   using policy_t = typename managed_scalar<ElementType>::container_policy_type;
   policy_t policy{};
   auto scalar = managed_scalar<ElementType>{handle, extents, policy};
-  scalar(0)   = v;
+  if (!resource::get_dry_run_flag(handle)) { scalar(0) = v; }
   return scalar;
 }
 
diff --git a/cpp/include/raft/core/pinned_mdarray.hpp b/cpp/include/raft/core/pinned_mdarray.hpp
index 287430b69a..0ad69ceb17 100644
--- a/cpp/include/raft/core/pinned_mdarray.hpp
+++ b/cpp/include/raft/core/pinned_mdarray.hpp
@@ -9,6 +9,7 @@
 #include <raft/core/mdarray.hpp>
 #include <raft/core/pinned_container_policy.hpp>
 #include <raft/core/pinned_mdspan.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 #include <cstdint>
@@ -118,7 +119,7 @@ auto make_pinned_scalar(raft::resources const& handle, ElementType const& v)
   using policy_t = typename pinned_scalar<ElementType>::container_policy_type;
   policy_t policy{};
   auto scalar = pinned_scalar<ElementType>{handle, extents, policy};
-  scalar(0)   = v;
+  if (!resource::get_dry_run_flag(handle)) { scalar(0) = v; }
   return scalar;
 }
 
diff --git a/cpp/include/raft/core/resource/cuda_stream.hpp b/cpp/include/raft/core/resource/cuda_stream.hpp
index b66c16f199..a20653db5f 100644
--- a/cpp/include/raft/core/resource/cuda_stream.hpp
+++ b/cpp/include/raft/core/resource/cuda_stream.hpp
@@ -6,6 +6,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/interruptible.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resource/resource_types.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/util/cudart_utils.hpp>
@@ -84,13 +85,18 @@ inline void set_cuda_stream(resources const& res, rmm::cuda_stream_view stream_v
  */
 inline void sync_stream(const resources& res, rmm::cuda_stream_view stream)
 {
+  if (raft::resource::get_dry_run_flag(res)) { return; }  // dry-run mode: skip the sync entirely
   interruptible::synchronize(stream);
 }
 
 /**
  * @brief synchronize main stream on the resources instance
  */
-inline void sync_stream(const resources& res) { sync_stream(res, get_cuda_stream(res)); }
+inline void sync_stream(const resources& res)
+{
+  // Dry-run mode: return before the main stream is even looked up.
+  if (!raft::resource::get_dry_run_flag(res)) { sync_stream(res, get_cuda_stream(res)); }
+}
 
 /**
  * @}
diff --git a/cpp/include/raft/core/resource/dry_run_flag.hpp b/cpp/include/raft/core/resource/dry_run_flag.hpp
new file mode 100644
index 0000000000..4d0c9e27b5
--- /dev/null
+++ b/cpp/include/raft/core/resource/dry_run_flag.hpp
@@ -0,0 +1,89 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+
+#include <memory>
+
+namespace raft::resource {
+
+/**
+ * @defgroup dry_run_flag Dry-run flag resource
+ * @{
+ */
+
+/**
+ * @brief Resource that holds a boolean dry-run flag.
+ *
+ * When the dry-run flag is set, algorithms should skip kernel execution
+ * and only perform allocations to measure memory usage.
+ */
+class dry_run_flag_resource : public resource {
+ public:
+  dry_run_flag_resource() = default;
+  explicit dry_run_flag_resource(bool value) : flag_{value} {}
+  ~dry_run_flag_resource() override = default;
+  // Type-erased accessor: hands out the address of the stored bool.
+  void* get_resource() override { return &flag_; }
+  // Typed accessors for code that holds the concrete resource.
+  void set(bool value) { flag_ = value; }
+  [[nodiscard]] bool get() const { return flag_; }
+
+ private:
+  bool flag_ = false;
+};
+
+/**
+ * @brief Factory that creates a dry_run_flag_resource.
+ */
+class dry_run_flag_resource_factory : public resource_factory {
+ public:
+  explicit dry_run_flag_resource_factory(bool initial_value = false) : initial_value_{initial_value}
+  {
+  }
+  // Reports the resource type this factory produces.
+  resource_type get_resource_type() override { return resource_type::DRY_RUN_FLAG; }
+  resource* make_resource() override { return new dry_run_flag_resource{initial_value_}; }
+
+ private:
+  bool initial_value_;
+};
+
+/**
+ * @brief Get the dry-run flag from a resources handle.
+ *
+ * @param res raft resources object
+ * @return true if dry-run mode is active
+ */
+inline auto get_dry_run_flag(resources const& res) -> bool
+{
+  constexpr auto key    = resource_type::DRY_RUN_FLAG;
+  bool const registered = res.has_resource_factory(key);  // factory is added lazily on first use
+  if (!registered) { res.add_resource_factory(std::make_shared<dry_run_flag_resource_factory>()); }
+  return *res.get_resource<bool>(key);
+}
+
+/**
+ * @brief Set the dry-run flag on a resources handle.
+ *
+ * @param res raft resources object
+ * @param value true to enable dry-run mode, false to disable
+ */
+inline void set_dry_run_flag(resources const& res, bool value)
+{
+  if (res.has_resource_factory(resource_type::DRY_RUN_FLAG)) {
+    // A factory exists, so the flag may already be instantiated: overwrite it in place.
+    *res.get_resource<bool>(resource_type::DRY_RUN_FLAG) = value;
+  } else {
+    // No factory yet: register one that creates the flag with the requested value.
+    res.add_resource_factory(std::make_shared<dry_run_flag_resource_factory>(value));
+  }
+}
+
+/** @} */
+
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp
index e3af719eda..ae2c9b21cf 100644
--- a/cpp/include/raft/core/resource/resource_types.hpp
+++ b/cpp/include/raft/core/resource/resource_types.hpp
@@ -42,6 +42,7 @@ enum resource_type {
   MULTI_GPU,                 // resource that tracks resource of each device in multi-gpu world
   PINNED_MEMORY_RESOURCE,    // memory resource for pinned (page-locked) host allocations
   MANAGED_MEMORY_RESOURCE,   // resource for managed (unified) allocations
+  DRY_RUN_FLAG,              // dry-run mode flag for allocation profiling
 
   LAST_KEY  // reserved for the last key
 };
diff --git a/cpp/include/raft/core/sparse_types.hpp b/cpp/include/raft/core/sparse_types.hpp
index 1657a8e494..3b7d9b9c59 100644
--- a/cpp/include/raft/core/sparse_types.hpp
+++ b/cpp/include/raft/core/sparse_types.hpp
@@ -178,7 +178,7 @@ class sparse_matrix {
 
   ~sparse_matrix() noexcept(std::is_nothrow_destructible<container_type>::value) = default;
 
-  void initialize_sparsity(nnz_type nnz) { c_elements_.resize(nnz); };
+  void initialize_sparsity(nnz_type nnz) { c_elements_.reallocate(nnz); };  // may drop old values
 
   raft::span<ElementType, is_device> get_elements()
   {
diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh
index d02bf8feaf..02b6f3cb93 100644
--- a/cpp/include/raft/label/classlabels.cuh
+++ b/cpp/include/raft/label/classlabels.cuh
@@ -8,11 +8,37 @@
 #pragma once
 
 #include <raft/core/detail/macros.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
+#include <raft/core/resources.hpp>
 #include <raft/label/detail/classlabels.cuh>
 
 namespace raft {
 namespace label {
 
+/**
+ * Get unique class labels.
+ *
+ * The y array is assumed to store class labels. The unique values are selected
+ * from this array.
+ *
+ * @tparam value_t numeric type of the arrays with class labels
+ * @param [in] handle raft resources handle (dry-run aware)
+ * @param [inout] unique output unique labels
+ * @param [in] y device array of labels, size [n]
+ * @param [in] n number of labels
+ * @returns number of unique labels (upper bound in dry-run mode)
+ */
+template <typename value_t>
+int getUniquelabels(
+  raft::resources const& handle, rmm::device_uvector<value_t>& unique, value_t* y, size_t n)
+{
+  // Unpack the handle, then defer to the flag/stream based implementation.
+  bool const dry_run = resource::get_dry_run_flag(handle);
+  auto const stream  = resource::get_cuda_stream(handle);
+  return detail::getUniquelabels<value_t>(dry_run, unique, y, n, stream);
+}
+
 /**
  * Get unique class labels.
  *
diff --git a/cpp/include/raft/label/detail/classlabels.cuh b/cpp/include/raft/label/detail/classlabels.cuh
index f0e9a14f69..2a3d7b50eb 100644
--- a/cpp/include/raft/label/detail/classlabels.cuh
+++ b/cpp/include/raft/label/detail/classlabels.cuh
@@ -30,15 +30,17 @@ namespace detail {
  * from this array.
  *
  * \tparam value_t numeric type of the arrays with class labels
- * \param [in] y device array of labels, size [n]
- * \param [in] n number of labels
+ * \param [in] dry_run if true, perform allocations but skip CUDA work
  * \param [out] unique device array of unique labels, unallocated on entry,
  *   on exit it has size [n_unique]
- * \param [out] n_unique number of unique labels
+ * \param [in] y device array of labels, size [n]
+ * \param [in] n number of labels
  * \param [in] stream cuda stream
+ * \return number of unique labels (upper bound when dry_run is true)
  */
 template <typename value_t>
-int getUniquelabels(rmm::device_uvector<value_t>& unique, value_t* y, size_t n, cudaStream_t stream)
+int getUniquelabels(
+  bool dry_run, rmm::device_uvector<value_t>& unique, value_t* y, size_t n, cudaStream_t stream)
 {
   rmm::device_scalar<int> d_num_selected(stream);
   rmm::device_uvector<value_t> workspace(n, stream);
@@ -54,6 +56,11 @@ int getUniquelabels(rmm::device_uvector<value_t>& unique, value_t* y, size_t n,
   bytes = std::max(bytes, bytes2);
   rmm::device_uvector<char> cub_storage(bytes, stream);
 
+  if (dry_run) {
+    if (unique.size() < n) { unique = rmm::device_uvector<value_t>(n, stream); }
+    return static_cast<int>(n);
+  }
+
   // Select Unique classes
   cub::DeviceRadixSort::SortKeys(
     cub_storage.data(), bytes, y, workspace.data(), n, 0, sizeof(value_t) * 8, stream);
@@ -73,6 +80,26 @@ int getUniquelabels(rmm::device_uvector<value_t>& unique, value_t* y, size_t n,
   return n_unique;
 }
 
+/**
+ * Get unique class labels.
+ *
+ * The y array is assumed to store class labels. The unique values are selected
+ * from this array.
+ *
+ * \tparam value_t numeric type of the arrays with class labels
+ * \param [out] unique device array of unique labels, unallocated on entry,
+ *   on exit it has size [n_unique]
+ * \param [in] y device array of labels, size [n]
+ * \param [in] n number of labels
+ * \param [in] stream cuda stream
+ * \return number of unique labels
+ */
+template <typename value_t>
+int getUniquelabels(rmm::device_uvector<value_t>& unique, value_t* y, size_t n, cudaStream_t stream)
+{
+  return getUniquelabels<value_t>(/*dry_run=*/false, unique, y, n, stream);
+}
+
 /**
  * Assign one versus rest labels.
  *
diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh
index 4171b53a27..d87d146a51 100644
--- a/cpp/include/raft/linalg/add.cuh
+++ b/cpp/include/raft/linalg/add.cuh
@@ -13,6 +13,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/util/input_validation.hpp>
 
 namespace raft {
@@ -103,6 +104,7 @@ template <typename InType,
           typename = raft::enable_if_output_device_mdspan<OutType>>
 void add(raft::resources const& handle, InType in1, InType in2, OutType out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
 
@@ -140,6 +142,7 @@ void add_scalar(raft::resources const& handle,
                 OutType out,
                 raft::device_scalar_view<const typename InType::value_type, ScalarIdxType> scalar)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
 
@@ -175,6 +178,7 @@ void add_scalar(raft::resources const& handle,
                 OutType out,
                 raft::host_scalar_view<const typename InType::value_type, ScalarIdxType> scalar)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
 
diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh
index 818eee0ec3..835c81bf7a 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/coalesced_reduction.cuh
@@ -13,6 +13,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 namespace raft {
@@ -63,7 +64,7 @@ void coalescedReduction(OutType* dots,
                         FinalLambda final_op   = raft::identity_op())
 {
   detail::coalescedReduction<InType, OutType, IdxType>(
-    dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+    false, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
 }
 
 /**
@@ -121,30 +122,32 @@ void coalesced_reduction(raft::resources const& handle,
     RAFT_EXPECTS(static_cast<IdxType>(dots.size()) == data.extent(0),
                  "Output should be equal to number of rows in Input");
 
-    coalescedReduction(dots.data_handle(),
-                       data.data_handle(),
-                       data.extent(1),
-                       data.extent(0),
-                       init,
-                       resource::get_cuda_stream(handle),
-                       inplace,
-                       main_op,
-                       reduce_op,
-                       final_op);
+    detail::coalescedReduction(resource::get_dry_run_flag(handle),
+                               dots.data_handle(),
+                               data.data_handle(),
+                               data.extent(1),
+                               data.extent(0),
+                               init,
+                               resource::get_cuda_stream(handle),
+                               inplace,
+                               main_op,
+                               reduce_op,
+                               final_op);
   } else if constexpr (std::is_same_v<LayoutPolicy, raft::col_major>) {
     RAFT_EXPECTS(static_cast<IdxType>(dots.size()) == data.extent(1),
                  "Output should be equal to number of columns in Input");
 
-    coalescedReduction(dots.data_handle(),
-                       data.data_handle(),
-                       data.extent(0),
-                       data.extent(1),
-                       init,
-                       resource::get_cuda_stream(handle),
-                       inplace,
-                       main_op,
-                       reduce_op,
-                       final_op);
+    detail::coalescedReduction(resource::get_dry_run_flag(handle),
+                               dots.data_handle(),
+                               data.data_handle(),
+                               data.extent(0),
+                               data.extent(1),
+                               init,
+                               resource::get_cuda_stream(handle),
+                               inplace,
+                               main_op,
+                               reduce_op,
+                               final_op);
   }
 }
 
diff --git a/cpp/include/raft/linalg/detail/axpy.cuh b/cpp/include/raft/linalg/detail/axpy.cuh
index 6347522138..488cad5bec 100644
--- a/cpp/include/raft/linalg/detail/axpy.cuh
+++ b/cpp/include/raft/linalg/detail/axpy.cuh
@@ -9,6 +9,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 #include <cublas_v2.h>
@@ -26,6 +27,7 @@ void axpy(raft::resources const& handle,
           const int incy,
           cudaStream_t stream)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   auto cublas_h = resource::get_cublas_handle(handle);
   cublas_device_pointer_mode<DevicePointerMode> pmode(cublas_h);
   RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, n, alpha, x, incx, y, incy, stream));
diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh
index ae1f82a74f..2c3131451c 100644
--- a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh
+++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/binary_op.cuh>
 
@@ -54,6 +55,7 @@ void choleskyRank1Update(raft::resources const& handle,
     *n_bytes = offset + 1 * sizeof(math_t);
     return;
   }
+  if (resource::get_dry_run_flag(handle)) { return; }
   math_t* s    = reinterpret_cast<math_t*>(((char*)workspace) + offset);
   math_t* L_22 = L + (n - 1) * ld + n - 1;
 
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
index 4cc549f79e..e1ec857b8a 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
@@ -499,7 +499,8 @@ template <typename ThickPolicy,
           typename MainLambda   = raft::identity_op,
           typename ReduceLambda = raft::add_op,
           typename FinalLambda  = raft::identity_op>
-void coalescedReductionThick(OutType* dots,
+void coalescedReductionThick(bool dry_run,
+                             OutType* dots,
                              const InType* data,
                              IdxType D,
                              IdxType N,
@@ -518,6 +519,8 @@ void coalescedReductionThick(OutType* dots,
 
   rmm::device_uvector<OutType> buffer(N * ThickPolicy::BlocksPerRow, stream);
 
+  if (dry_run) { return; }
+
   /* We apply a two-step reduction:
    *  1. coalescedReductionThickKernel reduces the [N x D] input data to [N x BlocksPerRow]. It
    *     applies the main_op but not the final op.
@@ -551,7 +554,8 @@ template <typename InType,
           typename MainLambda   = raft::identity_op,
           typename ReduceLambda = raft::add_op,
           typename FinalLambda  = raft::identity_op>
-void coalescedReductionThickDispatcher(OutType* dots,
+void coalescedReductionThickDispatcher(bool dry_run,
+                                       OutType* dots,
                                        const InType* data,
                                        IdxType D,
                                        IdxType N,
@@ -565,7 +569,7 @@ void coalescedReductionThickDispatcher(OutType* dots,
   // Note: multiple elements per thread to take advantage of the sequential reduction and loop
   // unrolling
   coalescedReductionThick<ReductionThickPolicy<256, 64>, ReductionThinPolicy<32, 128, 1>>(
-    dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+    dry_run, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
 }
 
 // Primitive to perform reductions along the coalesced dimension of the matrix, i.e. reduce along
@@ -580,7 +584,8 @@ template <typename InType,
           typename MainLambda   = raft::identity_op,
           typename ReduceLambda = raft::add_op,
           typename FinalLambda  = raft::identity_op>
-void coalescedReduction(OutType* dots,
+void coalescedReduction(bool dry_run,
+                        OutType* dots,
                         const InType* data,
                         IdxType D,
                         IdxType N,
@@ -601,12 +606,16 @@ void coalescedReduction(OutType* dots,
    */
   const IdxType numSMs = raft::getMultiProcessorCount();
   if (D <= IdxType(512) || (N >= IdxType(16) * numSMs && D < IdxType(2048))) {
+    if (dry_run) { return; }
     coalescedReductionThinDispatcher(
       dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   } else if (N < numSMs && D >= IdxType(1 << 17)) {
+    // Must call through to coalescedReductionThick even in dry-run so workspace
+    // allocations are recorded (coalescedReductionThick allocates before guarding).
     coalescedReductionThickDispatcher(
-      dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+      dry_run, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   } else {
+    if (dry_run) { return; }
     coalescedReductionMediumDispatcher(
       dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   }
diff --git a/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp b/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp
index 2337413fbd..06d087d755 100644
--- a/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp
+++ b/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp
@@ -10,6 +10,7 @@
 #include <raft/core/resource/cublaslt_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/custom_resource.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/util/cache.hpp>
 #include <raft/util/cuda_data_type.hpp>
@@ -284,6 +285,8 @@ template <bool DevicePointerMode = false, typename S, typename A, typename B, ty
                                   uint64_t ldc,
                                   cudaStream_t stream)
 {
+  // We pass nullptr to the workspace, so the extra memory usage should be zero.
+  if (resource::get_dry_run_flag(res)) { return; }
   common::nvtx::range<common::nvtx::domain::raft> batch_scope(
     "linalg::matmul(m = %d, n = %d, k = %d)", m, n, k);
   std::shared_ptr<matmul_desc> mm_desc{nullptr};
diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh
index 5dca01d87d..38f41deb21 100644
--- a/cpp/include/raft/linalg/detail/eig.cuh
+++ b/cpp/include/raft/linalg/detail/eig.cuh
@@ -10,6 +10,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cusolver_dn_handle.hpp>
 #include <raft/core/resource/detail/stream_sync_event.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/matrix/copy.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -45,9 +46,13 @@ void eigDC_legacy(raft::resources const& handle,
                                                eig_vals,
                                                &lwork));
 
+  // TODO(achirkin): Consider using the workspace resource for these temporary allocations.
   rmm::device_uvector<math_t> d_work(lwork, stream);
   rmm::device_scalar<int> d_dev_info(stream);
 
+  // The workspace is already allocated; no more allocations are foreseeable.
+  if (resource::get_dry_run_flag(handle)) { return; }
+
   raft::matrix::copy(handle,
                      make_device_matrix_view<const math_t>(in, n_rows, n_cols),
                      make_device_matrix_view<math_t>(eig_vectors, n_rows, n_cols));
@@ -122,6 +127,12 @@ void eigDC(raft::resources const& handle,
   rmm::device_scalar<int> d_dev_info(stream_new);
   std::vector<math_t> h_work(workspaceHost / sizeof(math_t));
 
+  if (resource::get_dry_run_flag(handle)) {
+    // No more allocations beyond this point, but we still need to clean up.
+    RAFT_CUSOLVER_TRY(cusolverDnDestroyParams(dn_params));
+    return;
+  }
+
   raft::copy(eig_vectors, in, n_rows * n_cols, stream_new);
 
   RAFT_CUSOLVER_TRY(cusolverDnxsyevd(cusolverH,
@@ -188,7 +199,9 @@ void eigSelDC(raft::resources const& handle,
 
   rmm::device_uvector<math_t> d_work(lwork, stream);
   rmm::device_scalar<int> d_dev_info(stream);
-  rmm::device_uvector<math_t> d_eig_vectors(0, stream);
+  rmm::device_uvector<math_t> d_eig_vectors(memUsage == COPY_INPUT ? n_rows * n_cols : 0, stream);
+
+  if (resource::get_dry_run_flag(handle)) { return; }
 
   if (memUsage == OVERWRITE_INPUT) {
     RAFT_CUSOLVER_TRY(cusolverDnsyevdx(cusolverH,
@@ -209,7 +222,6 @@ void eigSelDC(raft::resources const& handle,
                                        d_dev_info.data(),
                                        stream));
   } else if (memUsage == COPY_INPUT) {
-    d_eig_vectors.resize(n_rows * n_cols, stream);
     raft::matrix::copy(handle,
                        make_device_matrix_view<const math_t>(in, n_rows, n_cols),
                        make_device_matrix_view(eig_vectors, n_rows, n_cols));
@@ -286,6 +298,12 @@ void eigJacobi(raft::resources const& handle,
   rmm::device_uvector<math_t> d_work(lwork, stream);
   rmm::device_scalar<int> dev_info(stream);
 
+  if (resource::get_dry_run_flag(handle)) {
+    // No more allocations beyond this point, but we still need to clean up.
+    RAFT_CUSOLVER_TRY(cusolverDnDestroySyevjInfo(syevj_params));
+    return;
+  }
+
   raft::matrix::copy(handle,
                      make_device_matrix_view<const math_t>(in, n_rows, n_cols),
                      make_device_matrix_view(eig_vectors, n_rows, n_cols));
diff --git a/cpp/include/raft/linalg/detail/gemv.hpp b/cpp/include/raft/linalg/detail/gemv.hpp
index 8e5760f706..5ddcbf9ad9 100644
--- a/cpp/include/raft/linalg/detail/gemv.hpp
+++ b/cpp/include/raft/linalg/detail/gemv.hpp
@@ -9,6 +9,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 #include <cublas_v2.h>
@@ -32,6 +33,7 @@ void gemv(raft::resources const& handle,
           const int incy,
           cudaStream_t stream)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   cublasHandle_t cublas_h = resource::get_cublas_handle(handle);
   detail::cublas_device_pointer_mode<DevicePointerMode> pmode(cublas_h);
   RAFT_CUBLAS_TRY(detail::cublasgemv(cublas_h,
@@ -110,6 +112,7 @@ void gemv(raft::resources const& handle,
           const math_t beta,
           cudaStream_t stream)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   cublasHandle_t cublas_h = resource::get_cublas_handle(handle);
   cublasOperation_t op_a  = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
   RAFT_CUBLAS_TRY(
diff --git a/cpp/include/raft/linalg/detail/lstsq.cuh b/cpp/include/raft/linalg/detail/lstsq.cuh
index 2f0d2aa5c3..8df37527c9 100644
--- a/cpp/include/raft/linalg/detail/lstsq.cuh
+++ b/cpp/include/raft/linalg/detail/lstsq.cuh
@@ -10,6 +10,7 @@
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream_pool.hpp>
 #include <raft/core/resource/cusolver_dn_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/linalg/detail/cusolver_wrappers.hpp>
 #include <raft/linalg/eig.cuh>
@@ -131,6 +132,9 @@ void lstsqSvdQR(raft::resources const& handle,
                                         + 1                // devInfo
                                       ,
                                       stream);
+
+  if (resource::get_dry_run_flag(handle)) { return; }
+
   math_t* cusolverWorkSet = workset.data();
   math_t* U               = cusolverWorkSet + cusolverWorkSetSize;
   math_t* Vt              = U + n_rows * minmn;
@@ -205,6 +209,12 @@ void lstsqSvdJacobi(raft::resources const& handle,
                                         + 1                // devInfo
                                       ,
                                       stream);
+
+  if (resource::get_dry_run_flag(handle)) {
+    RAFT_CUSOLVER_TRY(cusolverDnDestroyGesvdjInfo(gesvdj_params));
+    return;
+  }
+
   math_t* cusolverWorkSet = workset.data();
   math_t* U               = cusolverWorkSet + cusolverWorkSetSize;
   math_t* V               = U + n_rows * minmn;
@@ -249,21 +259,27 @@ void lstsqEig(raft::resources const& handle,
 {
   rmm::cuda_stream_view mainStream   = rmm::cuda_stream_view(stream);
   rmm::cuda_stream_view multAbStream = resource::get_next_usable_stream(handle);
+  bool dry_run                       = resource::get_dry_run_flag(handle);
   bool concurrent;
-  // Check if the two streams can run concurrently. This is needed because a legacy default stream
-  // would synchronize with other blocking streams. To avoid synchronization in such case, we try to
-  // use an additional stream from the pool.
-  if (!are_implicitly_synchronized(mainStream, multAbStream)) {
-    concurrent = true;
-  } else if (resource::get_stream_pool_size(handle) > 1) {
-    mainStream = resource::get_next_usable_stream(handle);
-    concurrent = true;
+  if (dry_run) {
+    concurrent = false;
   } else {
-    multAbStream = mainStream;
-    concurrent   = false;
+    // Check if the two streams can run concurrently. This is needed because a legacy default stream
+    // would synchronize with other blocking streams. To avoid synchronization in such case, we try
+    // to use an additional stream from the pool.
+    if (!are_implicitly_synchronized(mainStream, multAbStream)) {
+      concurrent = true;
+    } else if (resource::get_stream_pool_size(handle) > 1) {
+      mainStream = resource::get_next_usable_stream(handle);
+      concurrent = true;
+    } else {
+      multAbStream = mainStream;
+      concurrent   = false;
+    }
   }
 
   rmm::device_uvector<math_t> workset(n_cols * n_cols * 3 + n_cols * 2, mainStream);
+
   // the event is created only if the given raft handle is capable of running
   // at least two CUDA streams without implicit synchronization.
   DeviceEvent worksetDone(concurrent);
@@ -303,8 +319,8 @@ void lstsqEig(raft::resources const& handle,
   raft::common::nvtx::pop_range();
 
   // QS  <- Q invS
-  raft::linalg::matrixVectorOp<false, true>(
-    QS, Q, S, n_cols, n_cols, DivideByNonZero<math_t>(), mainStream);
+  raft::linalg::detail::matrixVectorOp<false, true>(
+    dry_run, QS, Q, S, n_cols, n_cols, DivideByNonZero<math_t>(), mainStream);
   // covA <- QS Q* == Q invS Q* == inv(A* A)
   raft::linalg::gemm(handle,
                      QS,
@@ -393,6 +409,8 @@ void lstsqQR(raft::resources const& handle,
 
   rmm::device_uvector<math_t> d_work(lwork, stream);
 
+  if (resource::get_dry_run_flag(handle)) { return; }
+
   // #TODO: Call from public API when ready
   RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngeqrf(
     cusolverH, m, n, A, lda, d_tau.data(), d_work.data(), lwork, d_info.data(), stream));
diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh
index 714869aaa5..97df85a3ff 100644
--- a/cpp/include/raft/linalg/detail/map.cuh
+++ b/cpp/include/raft/linalg/detail/map.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/input_validation.hpp>
@@ -208,6 +209,7 @@ template <bool PassOffset,
           typename = raft::enable_if_input_device_mdspan<InTypes...>>
 void map(const raft::resources& res, OutType out, Func f, InTypes... ins)
 {
+  if (resource::get_dry_run_flag(res)) { return; }
   RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous");
   (map_check_shape(out, ins), ...);
 
diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh
index af9632a7da..c238d0961e 100644
--- a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh
@@ -7,6 +7,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/linewise_op.cuh>
 
 namespace raft {
@@ -20,7 +21,8 @@ template <bool rowMajor,
           typename VecT,
           typename IdxType = int,
           int TPB          = 256>
-void matrixVectorOp(MatT* out,
+void matrixVectorOp(bool dry_run,
+                    MatT* out,
                     const MatT* matrix,
                     const VecT* vec,
                     IdxType D,
@@ -28,6 +30,7 @@ void matrixVectorOp(MatT* out,
                     Lambda op,
                     cudaStream_t stream)
 {
+  if (dry_run) { return; }
   raft::resources handle;
   resource::set_cuda_stream(handle, stream);
   constexpr raft::Apply apply =
@@ -57,7 +60,8 @@ template <bool rowMajor,
           typename Vec2T,
           typename IdxType = int,
           int TPB          = 256>
-void matrixVectorOp(MatT* out,
+void matrixVectorOp(bool dry_run,
+                    MatT* out,
                     const MatT* matrix,
                     const Vec1T* vec1,
                     const Vec2T* vec2,
@@ -66,6 +70,7 @@ void matrixVectorOp(MatT* out,
                     Lambda op,
                     cudaStream_t stream)
 {
+  if (dry_run) { return; }
   raft::resources handle;
   resource::set_cuda_stream(handle, stream);
   constexpr raft::Apply apply =
diff --git a/cpp/include/raft/linalg/detail/norm.cuh b/cpp/include/raft/linalg/detail/norm.cuh
index 782438fdd2..9a563ee23d 100644
--- a/cpp/include/raft/linalg/detail/norm.cuh
+++ b/cpp/include/raft/linalg/detail/norm.cuh
@@ -20,18 +20,23 @@ template <NormType norm_type,
           typename IdxType,
           typename Lambda,
           typename OutType = Type>
-void rowNormCaller(
-  OutType* dots, const Type* data, IdxType D, IdxType N, cudaStream_t stream, Lambda fin_op)
+void rowNormCaller(bool dry_run,
+                   OutType* dots,
+                   const Type* data,
+                   IdxType D,
+                   IdxType N,
+                   cudaStream_t stream,
+                   Lambda fin_op)
 {
   if constexpr (norm_type == L1Norm) {
-    raft::linalg::reduce<rowMajor, true, Type, OutType, IdxType>(
-      dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::add_op(), fin_op);
+    reduce<rowMajor, true, Type, OutType, IdxType>(
+      dry_run, dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::add_op(), fin_op);
   } else if constexpr (norm_type == L2Norm) {
-    raft::linalg::reduce<rowMajor, true, Type, OutType, IdxType>(
-      dots, data, D, N, (OutType)0, stream, false, raft::sq_op(), raft::add_op(), fin_op);
+    reduce<rowMajor, true, Type, OutType, IdxType>(
+      dry_run, dots, data, D, N, (OutType)0, stream, false, raft::sq_op(), raft::add_op(), fin_op);
   } else if constexpr (norm_type == LinfNorm) {
-    raft::linalg::reduce<rowMajor, true, Type, OutType, IdxType>(
-      dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::max_op(), fin_op);
+    reduce<rowMajor, true, Type, OutType, IdxType>(
+      dry_run, dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::max_op(), fin_op);
   } else {
     THROW("Unsupported norm type: %d", norm_type);
   }
@@ -43,18 +48,23 @@ template <NormType norm_type,
           typename IdxType,
           typename Lambda,
           typename OutType = Type>
-void colNormCaller(
-  OutType* dots, const Type* data, IdxType D, IdxType N, cudaStream_t stream, Lambda fin_op)
+void colNormCaller(bool dry_run,
+                   OutType* dots,
+                   const Type* data,
+                   IdxType D,
+                   IdxType N,
+                   cudaStream_t stream,
+                   Lambda fin_op)
 {
   if constexpr (norm_type == L1Norm) {
-    raft::linalg::reduce<rowMajor, false, Type, OutType, IdxType>(
-      dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::add_op(), fin_op);
+    reduce<rowMajor, false, Type, OutType, IdxType>(
+      dry_run, dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::add_op(), fin_op);
   } else if constexpr (norm_type == L2Norm) {
-    raft::linalg::reduce<rowMajor, false, Type, OutType, IdxType>(
-      dots, data, D, N, (OutType)0, stream, false, raft::sq_op(), raft::add_op(), fin_op);
+    reduce<rowMajor, false, Type, OutType, IdxType>(
+      dry_run, dots, data, D, N, (OutType)0, stream, false, raft::sq_op(), raft::add_op(), fin_op);
   } else if constexpr (norm_type == LinfNorm) {
-    raft::linalg::reduce<rowMajor, false, Type, OutType, IdxType>(
-      dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::max_op(), fin_op);
+    reduce<rowMajor, false, Type, OutType, IdxType>(
+      dry_run, dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::max_op(), fin_op);
   } else {
     THROW("Unsupported norm type: %d", norm_type);
   }
diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh
index bf981ecae0..41e6ad87fd 100644
--- a/cpp/include/raft/linalg/detail/qr.cuh
+++ b/cpp/include/raft/linalg/detail/qr.cuh
@@ -10,6 +10,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cusolver_dn_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/matrix/triangular.cuh>
 
@@ -40,15 +41,26 @@ void qrGetQ_inplace(
 {
   RAFT_EXPECTS(n_rows >= n_cols, "QR decomposition expects n_rows >= n_cols.");
   cusolverDnHandle_t cusolver = resource::get_cusolver_dn_handle(handle);
+  auto is_dry_run             = resource::get_dry_run_flag(handle);
 
   rmm::device_uvector<math_t> tau(n_cols, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * n_cols, stream));
+  if (!is_dry_run) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * n_cols, stream));
+  }
 
   rmm::device_scalar<int> dev_info(stream);
-  int ws_size;
+  int ws_size_Dngeqrf;
+  int ws_size_Dnorgqr;
+
+  RAFT_CUSOLVER_TRY(
+    cusolverDngeqrf_bufferSize(cusolver, n_rows, n_cols, Q, n_rows, &ws_size_Dngeqrf));
+  RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize(
+    cusolver, n_rows, n_cols, n_cols, Q, n_rows, tau.data(), &ws_size_Dnorgqr));
+
+  rmm::device_uvector<math_t> workspace(std::max(ws_size_Dngeqrf, ws_size_Dnorgqr), stream);
+
+  if (is_dry_run) { return; }
 
-  RAFT_CUSOLVER_TRY(cusolverDngeqrf_bufferSize(cusolver, n_rows, n_cols, Q, n_rows, &ws_size));
-  rmm::device_uvector<math_t> workspace(ws_size, stream);
   RAFT_CUSOLVER_TRY(cusolverDngeqrf(cusolver,
                                     n_rows,
                                     n_cols,
@@ -56,13 +68,10 @@ void qrGetQ_inplace(
                                     n_rows,
                                     tau.data(),
                                     workspace.data(),
-                                    ws_size,
+                                    ws_size_Dngeqrf,
                                     dev_info.data(),
                                     stream));
 
-  RAFT_CUSOLVER_TRY(
-    cusolverDnorgqr_bufferSize(cusolver, n_rows, n_cols, n_cols, Q, n_rows, tau.data(), &ws_size));
-  workspace.resize(ws_size, stream);
   RAFT_CUSOLVER_TRY(cusolverDnorgqr(cusolver,
                                     n_rows,
                                     n_cols,
@@ -71,7 +80,7 @@ void qrGetQ_inplace(
                                     n_rows,
                                     tau.data(),
                                     workspace.data(),
-                                    ws_size,
+                                    ws_size_Dnorgqr,
                                     dev_info.data(),
                                     stream));
 }
@@ -84,7 +93,7 @@ void qrGetQ(raft::resources const& handle,
             int n_cols,
             cudaStream_t stream)
 {
-  raft::copy(Q, M, n_rows * n_cols, stream);
+  if (!resource::get_dry_run_flag(handle)) { raft::copy(Q, M, n_rows * n_cols, stream); }
   qrGetQ_inplace(handle, Q, n_rows, n_cols, stream);
 }
 
@@ -100,19 +109,32 @@ void qrGetQR(raft::resources const& handle,
   cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle);
 
   int m = n_rows, n = n_cols;
+  int R_full_nrows = m, R_full_ncols = n;
+  int Q_nrows = m, Q_ncols = n;
+  int Lwork_Dngeqrf, Lwork_Dnorgqr;
   rmm::device_uvector<math_t> R_full(m * n, stream);
   rmm::device_uvector<math_t> tau(std::min(m, n), stream);
+  rmm::device_scalar<int> devInfo(stream);
+
+  RAFT_CUSOLVER_TRY(cusolverDngeqrf_bufferSize(
+    cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork_Dngeqrf));
+  RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize(cusolverH,
+                                               Q_nrows,
+                                               Q_ncols,
+                                               std::min(Q_ncols, Q_nrows),
+                                               Q,
+                                               Q_nrows,
+                                               tau.data(),
+                                               &Lwork_Dnorgqr));
+
+  rmm::device_uvector<math_t> workspace(std::max(Lwork_Dngeqrf, Lwork_Dnorgqr), stream);
+
+  if (resource::get_dry_run_flag(handle)) { return; }
+
   RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * std::min(m, n), stream));
-  int R_full_nrows = m, R_full_ncols = n;
   RAFT_CUDA_TRY(
     cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
 
-  int Lwork;
-  rmm::device_scalar<int> devInfo(stream);
-
-  RAFT_CUSOLVER_TRY(cusolverDngeqrf_bufferSize(
-    cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork));
-  rmm::device_uvector<math_t> workspace(Lwork, stream);
   RAFT_CUSOLVER_TRY(cusolverDngeqrf(cusolverH,
                                     R_full_nrows,
                                     R_full_ncols,
@@ -120,7 +142,7 @@ void qrGetQR(raft::resources const& handle,
                                     R_full_nrows,
                                     tau.data(),
                                     workspace.data(),
-                                    Lwork,
+                                    Lwork_Dngeqrf,
                                     devInfo.data(),
                                     stream));
 
@@ -131,11 +153,7 @@ void qrGetQR(raft::resources const& handle,
 
   RAFT_CUDA_TRY(
     cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
-  int Q_nrows = m, Q_ncols = n;
 
-  RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize(
-    cusolverH, Q_nrows, Q_ncols, std::min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork));
-  workspace.resize(Lwork, stream);
   RAFT_CUSOLVER_TRY(cusolverDnorgqr(cusolverH,
                                     Q_nrows,
                                     Q_ncols,
@@ -144,7 +162,7 @@ void qrGetQR(raft::resources const& handle,
                                     Q_nrows,
                                     tau.data(),
                                     workspace.data(),
-                                    Lwork,
+                                    Lwork_Dnorgqr,
                                     devInfo.data(),
                                     stream));
 }
diff --git a/cpp/include/raft/linalg/detail/reduce.cuh b/cpp/include/raft/linalg/detail/reduce.cuh
index 2a689649b4..f58dc12f67 100644
--- a/cpp/include/raft/linalg/detail/reduce.cuh
+++ b/cpp/include/raft/linalg/detail/reduce.cuh
@@ -22,7 +22,8 @@ template <bool rowMajor,
           typename MainLambda   = raft::identity_op,
           typename ReduceLambda = raft::add_op,
           typename FinalLambda  = raft::identity_op>
-void reduce(OutType* dots,
+void reduce(bool dry_run,
+            OutType* dots,
             const InType* data,
             IdxType D,
             IdxType N,
@@ -34,17 +35,19 @@ void reduce(OutType* dots,
             FinalLambda final_op   = raft::identity_op())
 {
   if constexpr (rowMajor && alongRows) {
-    raft::linalg::coalescedReduction<InType, OutType, IdxType>(
-      dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+    coalescedReduction<InType, OutType, IdxType>(
+      dry_run, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   } else if constexpr (rowMajor && !alongRows) {
+    if (dry_run) { return; }  // no allocations in strided reduction
     raft::linalg::stridedReduction<InType, OutType, IdxType>(
       dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   } else if constexpr (!rowMajor && alongRows) {
+    if (dry_run) { return; }  // no allocations in strided reduction
     raft::linalg::stridedReduction<InType, OutType, IdxType>(
       dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op);
   } else {
-    raft::linalg::coalescedReduction<InType, OutType, IdxType>(
-      dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op);
+    coalescedReduction<InType, OutType, IdxType>(
+      dry_run, dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op);
   }
 }
 
diff --git a/cpp/include/raft/linalg/detail/rsvd.cuh b/cpp/include/raft/linalg/detail/rsvd.cuh
index 8adf3bfb48..7220feea6a 100644
--- a/cpp/include/raft/linalg/detail/rsvd.cuh
+++ b/cpp/include/raft/linalg/detail/rsvd.cuh
@@ -9,6 +9,7 @@
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/cusolver_dn_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/linalg/eig.cuh>
 #include <raft/linalg/gemm.cuh>
 #include <raft/linalg/qr.cuh>
@@ -86,6 +87,8 @@ void randomized_svd(const raft::resources& handle,
   auto h_workspace = raft::make_host_vector<char>(workspaceHost);
   auto devInfo     = raft::make_device_scalar<int>(handle, 0);
 
+  if (resource::get_dry_run_flag(handle)) { return; }  // buffers allocated above; skip the solve
+
   RAFT_CUSOLVER_TRY(cusolverDnxgesvdr(cusolverH,
                                       jobu,
                                       jobv,
@@ -155,6 +158,7 @@ void rsvdFixedRank(raft::resources const& handle,
                    int max_sweeps,
                    cudaStream_t stream)
 {
+  bool is_dry_run              = resource::get_dry_run_flag(handle);  // gates the memsets below
   cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle);
   cublasHandle_t cublasH       = resource::get_cublas_handle(handle);
 
@@ -172,7 +176,9 @@ void rsvdFixedRank(raft::resources const& handle,
 
   // Build temporary U, S, V matrices
   rmm::device_uvector<math_t> S_vec_tmp(l, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(S_vec_tmp.data(), 0, sizeof(math_t) * l, stream));
+  if (!is_dry_run) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(S_vec_tmp.data(), 0, sizeof(math_t) * l, stream));
+  }
 
   // build random matrix
   rmm::device_uvector<math_t> RN(n * l, stream);
@@ -188,9 +194,11 @@ void rsvdFixedRank(raft::resources const& handle,
   rmm::device_uvector<math_t> Z(n * l, stream);
   rmm::device_uvector<math_t> Yorth(m * l, stream);
   rmm::device_uvector<math_t> Zorth(n * l, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(Z.data(), 0, sizeof(math_t) * n * l, stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(Yorth.data(), 0, sizeof(math_t) * m * l, stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(Zorth.data(), 0, sizeof(math_t) * n * l, stream));
+  if (!is_dry_run) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(Z.data(), 0, sizeof(math_t) * n * l, stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(Yorth.data(), 0, sizeof(math_t) * m * l, stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(Zorth.data(), 0, sizeof(math_t) * n * l, stream));
+  }
 
   // power sampling scheme
   for (int j = 1; j < q; j++) {
@@ -237,30 +245,40 @@ void rsvdFixedRank(raft::resources const& handle,
 
   // orthogonalize on exit from loop to get Q
   rmm::device_uvector<math_t> Q(m * l, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(Q.data(), 0, sizeof(math_t) * m * l, stream));
+  if (!is_dry_run) { RAFT_CUDA_TRY(cudaMemsetAsync(Q.data(), 0, sizeof(math_t) * m * l, stream)); }
   raft::linalg::qrGetQ(handle, Y.data(), Q.data(), m, l, stream);
 
   // either QR of B^T method, or eigendecompose BB^T method
   if (!use_bbt) {
     // form Bt = Mt*Q : nxm * mxl = nxl
     rmm::device_uvector<math_t> Bt(n * l, stream);
-    RAFT_CUDA_TRY(cudaMemsetAsync(Bt.data(), 0, sizeof(math_t) * n * l, stream));
+    if (!is_dry_run) {
+      RAFT_CUDA_TRY(cudaMemsetAsync(Bt.data(), 0, sizeof(math_t) * n * l, stream));
+    }
     raft::linalg::gemm(
       handle, M, m, n, Q.data(), Bt.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream);
 
     // compute QR factorization of Bt
     // M is mxn ; Q is mxn ; R is min(m,n) x min(m,n) */
     rmm::device_uvector<math_t> Qhat(n * l, stream);
-    RAFT_CUDA_TRY(cudaMemsetAsync(Qhat.data(), 0, sizeof(math_t) * n * l, stream));
+    if (!is_dry_run) {
+      RAFT_CUDA_TRY(cudaMemsetAsync(Qhat.data(), 0, sizeof(math_t) * n * l, stream));
+    }
     rmm::device_uvector<math_t> Rhat(l * l, stream);
-    RAFT_CUDA_TRY(cudaMemsetAsync(Rhat.data(), 0, sizeof(math_t) * l * l, stream));
+    if (!is_dry_run) {
+      RAFT_CUDA_TRY(cudaMemsetAsync(Rhat.data(), 0, sizeof(math_t) * l * l, stream));
+    }
     raft::linalg::qrGetQR(handle, Bt.data(), Qhat.data(), Rhat.data(), n, l, stream);
 
     // compute SVD of Rhat (lxl)
     rmm::device_uvector<math_t> Uhat(l * l, stream);
-    RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream));
+    if (!is_dry_run) {
+      RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream));
+    }
     rmm::device_uvector<math_t> Vhat(l * l, stream);
-    RAFT_CUDA_TRY(cudaMemsetAsync(Vhat.data(), 0, sizeof(math_t) * l * l, stream));
+    if (!is_dry_run) {
+      RAFT_CUDA_TRY(cudaMemsetAsync(Vhat.data(), 0, sizeof(math_t) * l * l, stream));
+    }
     if (use_jacobi)
       raft::linalg::svdJacobi(handle,
                               Rhat.data(),
@@ -351,9 +369,13 @@ void rsvdFixedRank(raft::resources const& handle,
 
     // compute eigendecomposition of BBt
     rmm::device_uvector<math_t> Uhat(l * l, stream);
-    RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream));
+    if (!is_dry_run) {
+      RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream));
+    }
     rmm::device_uvector<math_t> Uhat_dup(l * l, stream);
-    RAFT_CUDA_TRY(cudaMemsetAsync(Uhat_dup.data(), 0, sizeof(math_t) * l * l, stream));
+    if (!is_dry_run) {
+      RAFT_CUDA_TRY(cudaMemsetAsync(Uhat_dup.data(), 0, sizeof(math_t) * l * l, stream));
+    }
 
     raft::matrix::upper_triangular(
       handle,
@@ -398,9 +420,13 @@ void rsvdFixedRank(raft::resources const& handle,
     // Sigma^{-1}[(p+1):l, (p+1):l] nxl * lxk * kxk = nxk
     if (gen_right_vec) {
       rmm::device_uvector<math_t> Sinv(k * k, stream);
-      RAFT_CUDA_TRY(cudaMemsetAsync(Sinv.data(), 0, sizeof(math_t) * k * k, stream));
+      if (!is_dry_run) {
+        RAFT_CUDA_TRY(cudaMemsetAsync(Sinv.data(), 0, sizeof(math_t) * k * k, stream));
+      }
       rmm::device_uvector<math_t> UhatSinv(l * k, stream);
-      RAFT_CUDA_TRY(cudaMemsetAsync(UhatSinv.data(), 0, sizeof(math_t) * l * k, stream));
+      if (!is_dry_run) {
+        RAFT_CUDA_TRY(cudaMemsetAsync(UhatSinv.data(), 0, sizeof(math_t) * l * k, stream));
+      }
       math_t scalar = 1.0;
       raft::matrix::reciprocal(
         handle,
diff --git a/cpp/include/raft/linalg/detail/svd.cuh b/cpp/include/raft/linalg/detail/svd.cuh
index 15396324cc..7589edd6f9 100644
--- a/cpp/include/raft/linalg/detail/svd.cuh
+++ b/cpp/include/raft/linalg/detail/svd.cuh
@@ -13,6 +13,7 @@
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/cusolver_dn_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/eig.cuh>
 #include <raft/linalg/gemm.cuh>
@@ -60,6 +61,8 @@ void svdQR(raft::resources const& handle,
   RAFT_CUSOLVER_TRY(cusolverDngesvd_bufferSize<T>(cusolverH, n_rows, n_cols, &lwork));
   rmm::device_uvector<T> d_work(lwork, stream);
 
+  if (resource::get_dry_run_flag(handle)) { return; }  // d_work allocated above; skip the rest
+
   char jobu  = 'S';
   char jobvt = 'A';
 
@@ -217,6 +220,11 @@ void svdJacobi(raft::resources const& handle,
 
   rmm::device_uvector<math_t> d_work(lwork, stream);
 
+  if (resource::get_dry_run_flag(handle)) {  // dry run: stop before the gesvdj solve
+    RAFT_CUSOLVER_TRY(cusolverDnDestroyGesvdjInfo(gesvdj_params));  // clean up before exit
+    return;
+  }
+
   RAFT_CUSOLVER_TRY(cusolverDngesvdj(cusolverH,
                                      CUSOLVER_EIG_MODE_VECTOR,
                                      econ,
@@ -281,16 +289,19 @@ bool evaluateSVDByL2Norm(raft::resources const& handle,
                          math_t tol,
                          cudaStream_t stream)
 {
-  cublasHandle_t cublasH = resource::get_cublas_handle(handle);
-
   int m = n_rows, n = n_cols;
+  bool is_dry_run = resource::get_dry_run_flag(handle);
 
   // form product matrix
   rmm::device_uvector<math_t> P_d(m * n, stream);
   rmm::device_uvector<math_t> S_mat(k * k, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream));
 
+  if (!is_dry_run) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream));
+  }
+
+  // These RAFT functions have their own dry-run guards at the leaf level
   raft::matrix::set_diagonal(handle,
                              make_device_vector_view<const math_t>(S_vec, k),
                              make_device_matrix_view<math_t>(S_mat.data(), k, k));
@@ -308,8 +319,12 @@ bool evaluateSVDByL2Norm(raft::resources const& handle,
   // calculate percent error
   const math_t alpha = 1.0, beta = -1.0;
   rmm::device_uvector<math_t> A_minus_P(m * n, stream);
+
+  if (is_dry_run) { return false; }  // buffers allocated; dry run returns false unevaluated
+
   RAFT_CUDA_TRY(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream));
 
+  cublasHandle_t cublasH = resource::get_cublas_handle(handle);
   RAFT_CUBLAS_TRY(cublasgeam(cublasH,
                              CUBLAS_OP_N,
                              CUBLAS_OP_N,
diff --git a/cpp/include/raft/linalg/detail/transpose.cuh b/cpp/include/raft/linalg/detail/transpose.cuh
index 82fdb1c6f7..bf068d7049 100644
--- a/cpp/include/raft/linalg/detail/transpose.cuh
+++ b/cpp/include/raft/linalg/detail/transpose.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 #include <rmm/exec_policy.hpp>
@@ -88,6 +89,7 @@ void transpose_half(raft::resources const& handle,
                     const IndexType stride_out = 1)
 {
   if (n_cols == 0 || n_rows == 0) return;
+  if (resource::get_dry_run_flag(handle)) { return; }  // checked after the empty-shape return
   auto stream = resource::get_cuda_stream(handle);
 
   int dev_id, sm_count;
@@ -135,6 +137,7 @@ void transpose(raft::resources const& handle,
                int n_cols,
                cudaStream_t stream)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   int out_n_rows = n_cols;
   int out_n_cols = n_rows;
 
@@ -189,6 +192,7 @@ void transpose_row_major_impl(
   raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> in,
   raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   auto out_n_rows   = in.extent(1);
   auto out_n_cols   = in.extent(0);
   T constexpr kOne  = 1;
@@ -231,6 +235,7 @@ void transpose_col_major_impl(
   raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> in,
   raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   auto out_n_rows   = in.extent(1);
   auto out_n_cols   = in.extent(0);
   T constexpr kOne  = 1;
diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh
index b5cbacbce3..cbe5aec0f3 100644
--- a/cpp/include/raft/linalg/divide.cuh
+++ b/cpp/include/raft/linalg/divide.cuh
@@ -12,6 +12,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/input_validation.hpp>
 
@@ -62,6 +63,7 @@ void divide_scalar(raft::resources const& handle,
                    OutType out,
                    raft::host_scalar_view<const typename InType::value_type, ScalarIdxType> scalar)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
 
diff --git a/cpp/include/raft/linalg/dot.cuh b/cpp/include/raft/linalg/dot.cuh
index c8684341a8..b0e4792338 100644
--- a/cpp/include/raft/linalg/dot.cuh
+++ b/cpp/include/raft/linalg/dot.cuh
@@ -12,6 +12,7 @@
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 
@@ -42,6 +43,7 @@ void dot(raft::resources const& handle,
 {
   RAFT_EXPECTS(x.size() == y.size(),
                "Size mismatch between x and y input vectors in raft::linalg::dot");
+  if (resource::get_dry_run_flag(handle)) { return; }
 
   RAFT_CUBLAS_TRY(detail::cublasdot(resource::get_cublas_handle(handle),
                                     x.size(),
@@ -72,6 +74,7 @@ void dot(raft::resources const& handle,
 {
   RAFT_EXPECTS(x.size() == y.size(),
                "Size mismatch between x and y input vectors in raft::linalg::dot");
+  if (resource::get_dry_run_flag(handle)) { return; }
 
   RAFT_CUBLAS_TRY(detail::cublasdot(resource::get_cublas_handle(handle),
                                     x.size(),
diff --git a/cpp/include/raft/linalg/map_reduce.cuh b/cpp/include/raft/linalg/map_reduce.cuh
index 66d8a1d6a2..2a678738ea 100644
--- a/cpp/include/raft/linalg/map_reduce.cuh
+++ b/cpp/include/raft/linalg/map_reduce.cuh
@@ -12,6 +12,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 
 namespace raft {
 namespace linalg {
@@ -91,6 +92,7 @@ void map_reduce(raft::resources const& handle,
                 ReduceLambda op,
                 Args... args)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   mapReduce<InValueType, MapOp, ReduceLambda, IndexType, 256, OutValueType, Args...>(
     out.data_handle(),
     in.extent(0),
diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index 6eca1ea9e8..766d2a433b 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -13,6 +13,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/core/types.hpp>
 #include <raft/util/input_validation.hpp>
@@ -57,7 +58,7 @@ void matrixVectorOp(MatT* out,
                     Lambda op,
                     cudaStream_t stream)
 {
-  detail::matrixVectorOp<rowMajor, bcastAlongRows>(out, matrix, vec, D, N, op, stream);
+  detail::matrixVectorOp<rowMajor, bcastAlongRows>(false, out, matrix, vec, D, N, op, stream);
 }
 
 /**
@@ -101,7 +102,8 @@ void matrixVectorOp(MatT* out,
                     Lambda op,
                     cudaStream_t stream)
 {
-  detail::matrixVectorOp<rowMajor, bcastAlongRows>(out, matrix, vec1, vec2, D, N, op, stream);
+  detail::matrixVectorOp<rowMajor, bcastAlongRows>(
+    false, out, matrix, vec1, vec2, D, N, op, stream);
 }
 
 /**
@@ -157,13 +159,14 @@ void matrix_vector_op(raft::resources const& handle,
                  "Size mismatch between matrix and vector");
   }
 
-  matrixVectorOp<rowMajor, bcastAlongRows>(out.data_handle(),
-                                           matrix.data_handle(),
-                                           vec.data_handle(),
-                                           out.extent(1),
-                                           out.extent(0),
-                                           op,
-                                           resource::get_cuda_stream(handle));
+  detail::matrixVectorOp<rowMajor, bcastAlongRows>(resource::get_dry_run_flag(handle),
+                                                   out.data_handle(),
+                                                   matrix.data_handle(),
+                                                   vec.data_handle(),
+                                                   out.extent(1),
+                                                   out.extent(0),
+                                                   op,
+                                                   resource::get_cuda_stream(handle));
 }
 
 /**
@@ -222,14 +225,15 @@ void matrix_vector_op(raft::resources const& handle,
                  "Size mismatch between matrix and vector");
   }
 
-  matrixVectorOp<rowMajor, bcastAlongRows>(out.data_handle(),
-                                           matrix.data_handle(),
-                                           vec1.data_handle(),
-                                           vec2.data_handle(),
-                                           out.extent(1),
-                                           out.extent(0),
-                                           op,
-                                           resource::get_cuda_stream(handle));
+  detail::matrixVectorOp<rowMajor, bcastAlongRows>(resource::get_dry_run_flag(handle),
+                                                   out.data_handle(),
+                                                   matrix.data_handle(),
+                                                   vec1.data_handle(),
+                                                   vec2.data_handle(),
+                                                   out.extent(1),
+                                                   out.extent(0),
+                                                   op,
+                                                   resource::get_cuda_stream(handle));
 }
 
 /** @} */  // end of group matrix_vector_op
diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh
index f14a64a7c8..b700e92495 100644
--- a/cpp/include/raft/linalg/mean_squared_error.cuh
+++ b/cpp/include/raft/linalg/mean_squared_error.cuh
@@ -12,6 +12,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 
 namespace raft {
 namespace linalg {
@@ -58,6 +59,7 @@ void mean_squared_error(raft::resources const& handle,
                         raft::device_scalar_view<OutValueType, IndexType> out,
                         OutValueType weight)
 {
   RAFT_EXPECTS(A.size() == B.size(), "Size mismatch between inputs");
+  if (resource::get_dry_run_flag(handle)) { return; }  // inputs validated; dry run stops here
 
   meanSquaredError(out.data_handle(),
diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh
index 30d9be2611..7a901500a0 100644
--- a/cpp/include/raft/linalg/multiply.cuh
+++ b/cpp/include/raft/linalg/multiply.cuh
@@ -13,6 +13,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/util/input_validation.hpp>
 
 namespace raft {
@@ -64,6 +65,7 @@ void multiply_scalar(
   OutType out,
   raft::host_scalar_view<const typename InType::value_type, ScalarIdxType> scalar)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
 
diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh
index 7395c41925..a98e61d72a 100644
--- a/cpp/include/raft/linalg/norm.cuh
+++ b/cpp/include/raft/linalg/norm.cuh
@@ -15,6 +15,7 @@
 #include <raft/core/mdspan.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/types.hpp>
 #include <raft/linalg/norm_types.hpp>
 #include <raft/util/input_validation.hpp>
@@ -55,7 +56,7 @@ void rowNorm(OutType* dots,
              cudaStream_t stream,
              Lambda fin_op = raft::identity_op())
 {
-  detail::rowNormCaller<norm_type, rowMajor>(dots, data, D, N, stream, fin_op);
+  detail::rowNormCaller<norm_type, rowMajor>(false, dots, data, D, N, stream, fin_op);
 }
 
 /**
@@ -86,7 +87,7 @@ void colNorm(OutType* dots,
              cudaStream_t stream,
              Lambda fin_op = raft::identity_op())
 {
-  detail::colNormCaller<norm_type, rowMajor>(dots, data, D, N, stream, fin_op);
+  detail::colNormCaller<norm_type, rowMajor>(false, dots, data, D, N, stream, fin_op);
 }
 
 /**
@@ -129,21 +130,23 @@ void norm(raft::resources const& handle,
   if constexpr (along_rows) {
     RAFT_EXPECTS(static_cast<IndexType>(out.size()) == in.extent(0),
                  "Output should be equal to number of rows in Input");
-    rowNorm<norm_type, row_major>(out.data_handle(),
-                                  in.data_handle(),
-                                  in.extent(1),
-                                  in.extent(0),
-                                  resource::get_cuda_stream(handle),
-                                  fin_op);
+    detail::rowNormCaller<norm_type, row_major>(resource::get_dry_run_flag(handle),
+                                                out.data_handle(),
+                                                in.data_handle(),
+                                                in.extent(1),
+                                                in.extent(0),
+                                                resource::get_cuda_stream(handle),
+                                                fin_op);
   } else {
     RAFT_EXPECTS(static_cast<IndexType>(out.size()) == in.extent(1),
                  "Output should be equal to number of columns in Input");
-    colNorm<norm_type, row_major>(out.data_handle(),
-                                  in.data_handle(),
-                                  in.extent(1),
-                                  in.extent(0),
-                                  resource::get_cuda_stream(handle),
-                                  fin_op);
+    detail::colNormCaller<norm_type, row_major>(resource::get_dry_run_flag(handle),
+                                                out.data_handle(),
+                                                in.data_handle(),
+                                                in.extent(1),
+                                                in.extent(0),
+                                                resource::get_cuda_stream(handle),
+                                                fin_op);
   }
 }
 
diff --git a/cpp/include/raft/linalg/normalize.cuh b/cpp/include/raft/linalg/normalize.cuh
index ca1f65b26c..6e9cde8bad 100644
--- a/cpp/include/raft/linalg/normalize.cuh
+++ b/cpp/include/raft/linalg/normalize.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/linalg/norm_types.hpp>
 #include <raft/util/input_validation.hpp>
 
@@ -54,6 +55,7 @@ void row_normalize(raft::resources const& handle,
                    FinalLambda fin_op,
                    ElementType eps = ElementType(1e-8))
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous");
   RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous");
   RAFT_EXPECTS(in.extent(0) == out.extent(0),
diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh
index f3ddc4037a..5f1cc2d2ac 100644
--- a/cpp/include/raft/linalg/power.cuh
+++ b/cpp/include/raft/linalg/power.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/linalg/binary_op.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/util/input_validation.hpp>
@@ -75,6 +76,7 @@ template <typename InType,
           typename = raft::enable_if_output_device_mdspan<OutType>>
 void power(raft::resources const& handle, InType in1, InType in2, OutType out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
 
@@ -113,6 +115,7 @@ void power_scalar(
   OutType out,
   const raft::host_scalar_view<const typename InType::value_type, ScalarIdxType> scalar)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
 
diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh
index 63db7d3ce6..6ae82d5a17 100644
--- a/cpp/include/raft/linalg/reduce.cuh
+++ b/cpp/include/raft/linalg/reduce.cuh
@@ -14,6 +14,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/types.hpp>
 #include <raft/util/input_validation.hpp>
 
@@ -72,7 +73,7 @@ void reduce(OutType* dots,
             FinalLambda final_op   = raft::identity_op())
 {
   detail::reduce<rowMajor, alongRows, InType, OutType, IdxType>(
-    dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+    false, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
 }
 
 /**
@@ -167,16 +168,18 @@ void reduce(raft::resources const& handle,
                  "Output should be equal to number of columns in Input");
   }
 
-  reduce<row_major, along_rows>(dots.data_handle(),
-                                data.data_handle(),
-                                data.extent(1),
-                                data.extent(0),
-                                init,
-                                resource::get_cuda_stream(handle),
-                                inplace,
-                                main_op,
-                                reduce_op,
-                                final_op);
+  detail::reduce<row_major, along_rows, InElementType, OutElementType, IdxType>(
+    resource::get_dry_run_flag(handle),
+    dots.data_handle(),
+    data.data_handle(),
+    data.extent(1),
+    data.extent(0),
+    init,
+    resource::get_cuda_stream(handle),
+    inplace,
+    main_op,
+    reduce_op,
+    final_op);
 }
 
 /** @} */  // end of group reduction
diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
index 07759ec206..3eda80c1a9 100644
--- a/cpp/include/raft/linalg/reduce_cols_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
@@ -12,6 +12,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 namespace raft {
@@ -82,6 +83,7 @@ void reduce_cols_by_key(
   IndexType nkeys = 0,
   bool reset_sums = true)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   if (nkeys > 0) {
     RAFT_EXPECTS(out.extent(1) == nkeys, "Output doesn't have nkeys columns");
   } else {
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
index dd2f54c7bc..61bce8bb03 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -12,6 +12,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 namespace raft {
@@ -148,6 +149,7 @@ void reduce_rows_by_key(
   std::optional<raft::device_vector_view<const WeightType, IndexType>> d_weights = std::nullopt,
   bool reset_sums                                                                = true)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(d_A.extent(0) == d_A.extent(0) && d_sums.extent(1) == n_unique_keys,
                "Output is not of size ncols * n_unique_keys");
   RAFT_EXPECTS(d_keys.extent(0) == d_A.extent(1), "Keys is not of size nrows");
diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh
index c571b68ae5..7bc1e2f4bd 100644
--- a/cpp/include/raft/linalg/sqrt.cuh
+++ b/cpp/include/raft/linalg/sqrt.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/linalg/unary_op.cuh>
 
 namespace raft {
@@ -52,6 +53,7 @@ template <typename InType,
           typename = raft::enable_if_output_device_mdspan<OutType>>
 void sqrt(raft::resources const& handle, InType in, OutType out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
 
diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh
index 9480eb9fa0..bd293aff36 100644
--- a/cpp/include/raft/linalg/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/strided_reduction.cuh
@@ -14,6 +14,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 #include <type_traits>
@@ -128,6 +129,7 @@ void strided_reduction(raft::resources const& handle,
                        ReduceLambda reduce_op = raft::add_op(),
                        FinalLambda final_op   = raft::identity_op())
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   if constexpr (std::is_same_v<LayoutPolicy, raft::row_major>) {
     RAFT_EXPECTS(static_cast<IndexType>(dots.size()) == data.extent(1),
                  "Output should be equal to number of columns in Input");
diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh
index 8e1b9ca9db..51b66ffbd2 100644
--- a/cpp/include/raft/linalg/subtract.cuh
+++ b/cpp/include/raft/linalg/subtract.cuh
@@ -14,6 +14,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/util/input_validation.hpp>
 
 namespace raft {
@@ -99,6 +100,7 @@ template <typename InType,
           typename = raft::enable_if_output_device_mdspan<OutType>>
 void subtract(raft::resources const& handle, InType in1, InType in2, OutType out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
 
@@ -137,6 +139,7 @@ void subtract_scalar(
   OutType out,
   raft::device_scalar_view<const typename InType::element_type, ScalarIdxType> scalar)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
 
@@ -173,6 +176,7 @@ void subtract_scalar(
   OutType out,
   raft::host_scalar_view<const typename InType::element_type, ScalarIdxType> scalar)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
 
diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh
index abba6113a1..efa3082b88 100644
--- a/cpp/include/raft/linalg/unary_op.cuh
+++ b/cpp/include/raft/linalg/unary_op.cuh
@@ -10,6 +10,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/map.cuh>
 
@@ -110,6 +111,7 @@ template <typename OutType,
           typename = raft::enable_if_output_device_mdspan<OutType>>
 void write_only_unary_op(const raft::resources& handle, OutType out, Lambda op)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   return writeOnlyUnaryOp(out.data_handle(), out.size(), op, resource::get_cuda_stream(handle));
 }
 
diff --git a/cpp/include/raft/matrix/argmax.cuh b/cpp/include/raft/matrix/argmax.cuh
index 83736ba2c0..0337edce02 100644
--- a/cpp/include/raft/matrix/argmax.cuh
+++ b/cpp/include/raft/matrix/argmax.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/math.cuh>
 
 namespace raft {
@@ -29,6 +30,7 @@ void argmax(raft::resources const& handle,
             raft::device_matrix_view<const math_t, matrix_idx_t, row_major> in,
             raft::device_vector_view<idx_t, matrix_idx_t> out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(out.extent(0) == in.extent(0),
                "Size of output vector must equal number of rows in input matrix.");
   detail::argmax(in.data_handle(),
diff --git a/cpp/include/raft/matrix/argmin.cuh b/cpp/include/raft/matrix/argmin.cuh
index c5d37e05cd..4e746b4305 100644
--- a/cpp/include/raft/matrix/argmin.cuh
+++ b/cpp/include/raft/matrix/argmin.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/math.cuh>
 
 namespace raft {
@@ -29,6 +30,7 @@ void argmin(raft::resources const& handle,
             raft::device_matrix_view<const math_t, matrix_idx_t, row_major> in,
             raft::device_vector_view<idx_t, matrix_idx_t> out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(out.extent(0) == in.extent(0),
                "Size of output vector must equal number of rows in input matrix.");
   detail::argmin(in.data_handle(),
diff --git a/cpp/include/raft/matrix/col_wise_sort.cuh b/cpp/include/raft/matrix/col_wise_sort.cuh
index fc0f3f1063..fed94e4511 100644
--- a/cpp/include/raft/matrix/col_wise_sort.cuh
+++ b/cpp/include/raft/matrix/col_wise_sort.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/columnWiseSort.cuh>
 
 namespace raft {
@@ -40,8 +41,16 @@ void sort_cols_per_row(const InType* in,
                        cudaStream_t stream,
                        InType* sortedKeys = nullptr)
 {
-  detail::sortColumnsPerRow<InType, OutType>(
-    in, out, n_rows, n_columns, bAllocWorkspace, workspacePtr, workspaceSize, stream, sortedKeys);
+  detail::sortColumnsPerRow<InType, OutType>(false,
+                                             in,
+                                             out,
+                                             n_rows,
+                                             n_columns,
+                                             bAllocWorkspace,
+                                             workspacePtr,
+                                             workspaceSize,
+                                             stream,
+                                             sortedKeys);
 }
 
 /**
@@ -80,12 +89,14 @@ void sort_cols_per_row(raft::resources const& handle,
                  "Input and `sorted_keys` matrices must have the same shape.");
   }
 
+  bool dry_run          = resource::get_dry_run_flag(handle);
   size_t workspace_size = 0;
   bool alloc_workspace  = false;
 
   in_t* keys = sorted_keys.has_value() ? sorted_keys.value().data_handle() : nullptr;
 
-  detail::sortColumnsPerRow<in_t, out_t>(in.data_handle(),
+  detail::sortColumnsPerRow<in_t, out_t>(dry_run,
+                                         in.data_handle(),
                                          out.data_handle(),
                                          in.extent(0),
                                          in.extent(1),
@@ -98,7 +109,10 @@ void sort_cols_per_row(raft::resources const& handle,
   if (alloc_workspace) {
     auto workspace = raft::make_device_vector<char>(handle, workspace_size);
 
-    detail::sortColumnsPerRow<in_t, out_t>(in.data_handle(),
+    if (dry_run) { return; }  // workspace is allocated above; skip the sort
+
+    detail::sortColumnsPerRow<in_t, out_t>(dry_run,
+                                           in.data_handle(),
                                            out.data_handle(),
                                            in.extent(0),
                                            in.extent(1),
diff --git a/cpp/include/raft/matrix/copy.cuh b/cpp/include/raft/matrix/copy.cuh
index f673835915..b5478113e8 100644
--- a/cpp/include/raft/matrix/copy.cuh
+++ b/cpp/include/raft/matrix/copy.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/matrix.cuh>
 #include <raft/util/input_validation.hpp>
 
@@ -36,6 +37,7 @@ void copy_rows(raft::resources const& handle,
                raft::device_matrix_view<m_t, idx_t, layout> out,
                raft::device_vector_view<const idx_t, idx_t> indices)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(in.extent(1) == out.extent(1),
                "Input and output matrices must have same number of columns");
   RAFT_EXPECTS(indices.extent(0) == out.extent(0),
@@ -61,6 +63,7 @@ void copy(raft::resources const& handle,
           raft::device_matrix_view<const m_t, matrix_idx_t, row_major> in,
           raft::device_matrix_view<m_t, matrix_idx_t, row_major> out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(in.extent(0) == out.extent(0) && in.extent(1) == out.extent(1),
                "Input and output matrix shapes must match.");
 
@@ -81,6 +84,7 @@ void copy(raft::resources const& handle,
           raft::device_matrix_view<const m_t, matrix_idx_t, col_major> in,
           raft::device_matrix_view<m_t, matrix_idx_t, col_major> out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(in.extent(0) == out.extent(0) && in.extent(1) == out.extent(1),
                "Input and output matrix shapes must match.");
 
@@ -102,6 +106,7 @@ void trunc_zero_origin(raft::resources const& handle,
                        raft::device_matrix_view<const m_t, idx_t, col_major> in,
                        raft::device_matrix_view<m_t, idx_t, col_major> out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(out.extent(0) <= in.extent(0) && out.extent(1) <= in.extent(1),
                "Output matrix must have less or equal number of rows and columns");
 
diff --git a/cpp/include/raft/matrix/detail/columnWiseSort.cuh b/cpp/include/raft/matrix/detail/columnWiseSort.cuh
index c8c8b9090d..2487ce8b8d 100644
--- a/cpp/include/raft/matrix/detail/columnWiseSort.cuh
+++ b/cpp/include/raft/matrix/detail/columnWiseSort.cuh
@@ -164,7 +164,8 @@ cudaError_t layoutSortOffset(T* in, T value, int n_times, cudaStream_t stream)
  * @param sortedKeys: Optional, output matrix for sorted keys (input)
  */
 template <typename InType, typename OutType>
-void sortColumnsPerRow(const InType* in,
+void sortColumnsPerRow(bool dry_run,
+                       const InType* in,
                        OutType* out,
                        int n_rows,
                        int n_columns,
@@ -204,6 +205,8 @@ void sortColumnsPerRow(const InType* in,
     // more elements per thread --> more register pressure
     // 512(blockSize) * 8 elements per thread = 71 register / thread
 
+    if (dry_run) { return; }
+
     // instantiate some kernel combinations
     if (n_columns <= 512)
       INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 128, 4, stream);
@@ -256,6 +259,8 @@ void sortColumnsPerRow(const InType* in,
       // for segment offsets (numOffsets = numSegments + 1, see above)
       workspaceSize += raft::alignTo(sizeof(int) * (size_t)numOffsets, memAlignWidth);
     } else {
+      if (dry_run) { return; }
+
       size_t workspaceOffset = 0;
 
       if (!sortedKeys) {
@@ -307,6 +312,8 @@ void sortColumnsPerRow(const InType* in,
 
       workspaceSize += raft::alignTo(sizeof(OutType) * (size_t)n_columns, memAlignWidth);
     } else {
+      if (dry_run) { return; }
+
       size_t workspaceOffset   = 0;
       bool userKeyOutputBuffer = true;
 
diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh
index c1686b2f55..08b2755710 100644
--- a/cpp/include/raft/matrix/detail/gather.cuh
+++ b/cpp/include/raft/matrix/detail/gather.cuh
@@ -14,6 +14,7 @@
 #include <raft/core/operators.hpp>
 #include <raft/core/pinned_mdarray.hpp>
 #include <raft/core/pinned_mdspan.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/util/cuda_dev_essentials.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/integer_utils.hpp>
@@ -551,13 +552,15 @@ void gather(raft::resources const& res,
             device_vector_view<const IdxT, MatIdxT> indices,
             raft::device_matrix_view<T, MatIdxT> output)
 {
+  auto dry_run = resource::get_dry_run_flag(res);
   raft::common::nvtx::range<common::nvtx::domain::raft> fun_scope("gather");
   IdxT n_dim        = output.extent(1);
   IdxT n_train      = output.extent(0);
   auto indices_host = raft::make_host_vector<IdxT, MatIdxT>(n_train);
-  raft::copy(
-    indices_host.data_handle(), indices.data_handle(), n_train, resource::get_cuda_stream(res));
-  resource::sync_stream(res);
+  if (!dry_run) {
+    raft::copy(
+      indices_host.data_handle(), indices.data_handle(), n_train, resource::get_cuda_stream(res));
+  }
 
   const size_t buffer_size = 32768 * 1024;  // bytes
   const size_t max_batch_size =
@@ -569,6 +572,10 @@ void gather(raft::resources const& res,
   auto out_tmp1 = raft::make_pinned_matrix<T, MatIdxT>(res, max_batch_size, n_dim);
   auto out_tmp2 = raft::make_pinned_matrix<T, MatIdxT>(res, max_batch_size, n_dim);
 
+  if (dry_run) { return; }  // host and pinned buffers are allocated above
+
+  resource::sync_stream(res);  // wait for the indices copy issued above
+
   // Usually a limited number of threads provide sufficient bandwidth for gathering data.
 #if defined(_OPENMP)
   int n_threads = std::min(omp_get_max_threads(), 32);
diff --git a/cpp/include/raft/matrix/detail/gather_inplace.cuh b/cpp/include/raft/matrix/detail/gather_inplace.cuh
index 1cfd7664ec..7eaf05539b 100644
--- a/cpp/include/raft/matrix/detail/gather_inplace.cuh
+++ b/cpp/include/raft/matrix/detail/gather_inplace.cuh
@@ -6,6 +6,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdarray.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft/linalg/map.cuh>
 #include <raft/util/fast_int_div.cuh>
@@ -39,12 +40,14 @@ void gatherInplaceImpl(raft::resources const& handle,
   // re-assign batch_size for default case
   if (batch_size == 0 || batch_size > n) batch_size = n;
 
+  auto scratch_space = raft::make_device_vector<MatrixT, IndexT>(handle, map_length * batch_size);
+
+  if (resource::get_dry_run_flag(handle)) { return; }  // scratch_space is already allocated
+
   auto exec_policy = resource::get_thrust_policy(handle);
 
   IndexT n_batches = raft::ceildiv(n, batch_size);
 
-  auto scratch_space = raft::make_device_vector<MatrixT, IndexT>(handle, map_length * batch_size);
-
   for (IndexT bid = 0; bid < n_batches; bid++) {
     IndexT batch_offset   = bid * batch_size;
     IndexT cols_per_batch = min(batch_size, n - batch_offset);
diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh
index 14a7846704..bd6a6a0144 100644
--- a/cpp/include/raft/matrix/detail/math.cuh
+++ b/cpp/include/raft/matrix/detail/math.cuh
@@ -7,6 +7,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/operators.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/binary_op.cuh>
 #include <raft/linalg/map_then_reduce.cuh>
@@ -187,10 +188,10 @@ template <typename math_t, typename IdxType = int>
 void ratio(
   raft::resources const& handle, const math_t* src, math_t* dest, IdxType len, cudaStream_t stream)
 {
-  auto d_src  = src;
-  auto d_dest = dest;
-
   rmm::device_scalar<math_t> d_sum(stream);
+  if (resource::get_dry_run_flag(handle)) { return; }  // d_sum is already allocated
+  auto d_src      = src;
+  auto d_dest     = dest;
   auto* d_sum_ptr = d_sum.data();
   raft::linalg::mapThenSumReduce(d_sum_ptr, len, raft::identity_op{}, stream, src);
   raft::linalg::unaryOp(
@@ -201,15 +202,16 @@ template <bool rowMajor, bool bcastAlongRows, typename Type, typename IdxType =
 void matrixVectorBinaryMult(
   Type* data, const Type* vec, IdxType n_row, IdxType n_col, cudaStream_t stream)
 {
-  raft::linalg::matrixVectorOp<rowMajor, bcastAlongRows>(
-    data, data, vec, n_col, n_row, raft::mul_op(), stream);
+  raft::linalg::detail::matrixVectorOp<rowMajor, bcastAlongRows>(
+    false, data, data, vec, n_col, n_row, raft::mul_op(), stream);
 }
 
 template <bool rowMajor, bool bcastAlongRows, typename Type, typename IdxType = int, int TPB = 256>
 void matrixVectorBinaryMultSkipZero(
   Type* data, const Type* vec, IdxType n_row, IdxType n_col, cudaStream_t stream)
 {
-  raft::linalg::matrixVectorOp<rowMajor, bcastAlongRows>(
+  raft::linalg::detail::matrixVectorOp<rowMajor, bcastAlongRows>(
+    false,
     data,
     data,
     vec,
@@ -228,8 +230,8 @@ template <bool rowMajor, bool bcastAlongRows, typename Type, typename IdxType =
 void matrixVectorBinaryDiv(
   Type* data, const Type* vec, IdxType n_row, IdxType n_col, cudaStream_t stream)
 {
-  raft::linalg::matrixVectorOp<rowMajor, bcastAlongRows>(
-    data, data, vec, n_col, n_row, raft::div_op(), stream);
+  raft::linalg::detail::matrixVectorOp<rowMajor, bcastAlongRows>(
+    false, data, data, vec, n_col, n_row, raft::div_op(), stream);
 }
 
 template <bool rowMajor, bool bcastAlongRows, typename Type, typename IdxType = int, int TPB = 256>
@@ -241,7 +243,8 @@ void matrixVectorBinaryDivSkipZero(Type* data,
                                    bool return_zero = false)
 {
   if (return_zero) {
-    raft::linalg::matrixVectorOp<rowMajor, bcastAlongRows>(
+    raft::linalg::detail::matrixVectorOp<rowMajor, bcastAlongRows>(
+      false,
       data,
       data,
       vec,
@@ -255,7 +258,8 @@ void matrixVectorBinaryDivSkipZero(Type* data,
       },
       stream);
   } else {
-    raft::linalg::matrixVectorOp<rowMajor, bcastAlongRows>(
+    raft::linalg::detail::matrixVectorOp<rowMajor, bcastAlongRows>(
+      false,
       data,
       data,
       vec,
@@ -275,16 +279,16 @@ template <bool rowMajor, bool bcastAlongRows, typename Type, typename IdxType =
 void matrixVectorBinaryAdd(
   Type* data, const Type* vec, IdxType n_row, IdxType n_col, cudaStream_t stream)
 {
-  raft::linalg::matrixVectorOp<rowMajor, bcastAlongRows>(
-    data, data, vec, n_col, n_row, raft::add_op(), stream);
+  raft::linalg::detail::matrixVectorOp<rowMajor, bcastAlongRows>(
+    false, data, data, vec, n_col, n_row, raft::add_op(), stream);
 }
 
 template <bool rowMajor, bool bcastAlongRows, typename Type, typename IdxType = int, int TPB = 256>
 void matrixVectorBinarySub(
   Type* data, const Type* vec, IdxType n_row, IdxType n_col, cudaStream_t stream)
 {
-  raft::linalg::matrixVectorOp<rowMajor, bcastAlongRows>(
-    data, data, vec, n_col, n_row, raft::sub_op(), stream);
+  raft::linalg::detail::matrixVectorOp<rowMajor, bcastAlongRows>(
+    false, data, data, vec, n_col, n_row, raft::sub_op(), stream);
 }
 
 // Computes an argmin/argmax column-wise in a DxN matrix
diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh
index f3545fb103..a7d41e19f8 100644
--- a/cpp/include/raft/matrix/detail/matrix.cuh
+++ b/cpp/include/raft/matrix/detail/matrix.cuh
@@ -7,6 +7,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/util/cache_util.cuh>
@@ -297,6 +298,7 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream)
 template <typename m_t, typename idx_t = int>
 m_t getL2Norm(raft::resources const& handle, const m_t* in, idx_t size, cudaStream_t stream)
 {
+  if (resource::get_dry_run_flag(handle)) { return m_t{0}; }  // dry run: placeholder value
   cublasHandle_t cublasH = resource::get_cublas_handle(handle);
   m_t normval            = 0;
   RAFT_EXPECTS(
diff --git a/cpp/include/raft/matrix/detail/scatter_inplace.cuh b/cpp/include/raft/matrix/detail/scatter_inplace.cuh
index 2c735e3fda..ecad4a0477 100644
--- a/cpp/include/raft/matrix/detail/scatter_inplace.cuh
+++ b/cpp/include/raft/matrix/detail/scatter_inplace.cuh
@@ -6,6 +6,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdarray.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft/linalg/map.cuh>
 #include <raft/util/cuda_dev_essentials.cuh>
@@ -64,12 +65,14 @@ void scatterInplaceImpl(
   // re-assign batch_size for default case
   if (batch_size == 0 || batch_size > n) batch_size = n;
 
+  auto scratch_space = raft::make_device_vector<MatrixT, IndexT>(handle, m * batch_size);
+
+  if (resource::get_dry_run_flag(handle)) { return; }  // scratch_space is already allocated
+
   auto exec_policy = resource::get_thrust_policy(handle);
 
   IndexT n_batches = raft::ceildiv(n, batch_size);
 
-  auto scratch_space = raft::make_device_vector<MatrixT, IndexT>(handle, m * batch_size);
-
   for (IndexT bid = 0; bid < n_batches; bid++) {
     IndexT batch_offset   = bid * batch_size;
     IndexT cols_per_batch = min(batch_size, n - batch_offset);
diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh
index f693f986c6..d22d0c24ce 100644
--- a/cpp/include/raft/matrix/detail/select_k-inl.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh
@@ -15,6 +15,7 @@
 #include <raft/core/nvtx.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/linalg/map.cuh>
 #include <raft/matrix/select_k_types.hpp>
 
@@ -127,6 +128,8 @@ void segmented_sort_by_key(raft::resources const& handle,
   auto d_temp_storage = raft::make_device_mdarray<char, size_t>(
     handle, mr, raft::make_extents<size_t>(temp_storage_bytes));
 
+  if (resource::get_dry_run_flag(handle)) { return; }  // d_temp_storage is already allocated
+
   if (asc) {
     // Run sorting operation
     cub::DeviceSegmentedRadixSort::SortPairs((void*)d_temp_storage.data_handle(),
diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh
index 718096c466..ea64a0f524 100644
--- a/cpp/include/raft/matrix/detail/select_radix.cuh
+++ b/cpp/include/raft/matrix/detail/select_radix.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/core/resource/device_properties.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/linalg/map.cuh>
 #include <raft/matrix/detail/select_k_layout.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -877,7 +878,8 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt)
 }
 
 template <typename T, typename IdxT, int BitsPerPass, int BlockSize, typename RowLayout>
-void radix_topk(const T* in,
+void radix_topk(bool dry_run,
+                const T* in,
                 const IdxT* in_idx,
                 int batch_size,
                 IdxT len,
@@ -911,6 +913,8 @@ void radix_topk(const T* in,
 
   rmm::device_buffer bufs(max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)), stream, mr);
 
+  if (dry_run) { return; }  // bufs is already allocated; skip the per-chunk work
+
   for (size_t offset = 0; offset < static_cast<size_t>(batch_size); offset += max_chunk_size) {
     int chunk_size = std::min(max_chunk_size, batch_size - offset);
     RAFT_CUDA_TRY(
@@ -1152,7 +1156,8 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
 // used. It's used when len is relatively small or when the number of blocks per row calculated by
 // `calc_grid_dim()` is 1.
 template <typename T, typename IdxT, int BitsPerPass, int BlockSize, typename RowLayout>
-void radix_topk_one_block(const T* in,
+void radix_topk_one_block(bool dry_run,
+                          const T* in,
                           const IdxT* in_idx,
                           int batch_size,
                           IdxT len,
@@ -1174,6 +1179,8 @@ void radix_topk_one_block(const T* in,
 
   rmm::device_buffer bufs(max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)), stream, mr);
 
+  if (dry_run) { return; }  // bufs is already allocated; skip the per-chunk work
+
   for (size_t offset = 0; offset < static_cast<size_t>(batch_size); offset += max_chunk_size) {
     int chunk_size          = std::min(max_chunk_size, batch_size - offset);
     const IdxT* chunk_len_i = len_i ? (len_i + offset) : nullptr;
@@ -1270,9 +1277,11 @@ void select_k(raft::resources const& res,
   RAFT_EXPECTS(RowLayout::is_uniform || len_i != nullptr,
                "CSR layout requires a non-null indptr array (len_i)!");
 
-  auto stream = resource::get_cuda_stream(res);
-  auto mr     = resource::get_workspace_resource_ref(res);
+  bool dry_run = resource::get_dry_run_flag(res);
+  auto stream  = resource::get_cuda_stream(res);
+  auto mr      = resource::get_workspace_resource_ref(res);
   if (k == len && RowLayout::is_uniform) {
+    if (dry_run) { return; }
     RAFT_CUDA_TRY(
       cudaMemcpyAsync(out, in, sizeof(T) * batch_size * len, cudaMemcpyDeviceToDevice, stream));
     if (in_idx) {
@@ -1292,15 +1301,27 @@ void select_k(raft::resources const& res,
 
   if (len <= BlockSize * items_per_thread) {
     impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize, RowLayout>(
-      in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr);
+      dry_run, in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr);
   } else {
     unsigned grid_dim =
       impl::calc_grid_dim<T, IdxT, BitsPerPass, BlockSize>(batch_size, len, sm_cnt);
     if (grid_dim == 1) {
-      impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize, RowLayout>(
-        in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr);
+      impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize, RowLayout>(dry_run,
+                                                                             in,
+                                                                             in_idx,
+                                                                             batch_size,
+                                                                             len,
+                                                                             k,
+                                                                             out,
+                                                                             out_idx,
+                                                                             select_min,
+                                                                             len_i,
+                                                                             sm_cnt,
+                                                                             stream,
+                                                                             mr);
     } else {
-      impl::radix_topk<T, IdxT, BitsPerPass, BlockSize, RowLayout>(in,
+      impl::radix_topk<T, IdxT, BitsPerPass, BlockSize, RowLayout>(dry_run,
+                                                                   in,
                                                                    in_idx,
                                                                    batch_size,
                                                                    len,
diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh
index b517ef8c10..830720c42d 100644
--- a/cpp/include/raft/matrix/detail/select_warpsort.cuh
+++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh
@@ -10,6 +10,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/custom_resource.hpp>
 #include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/select_k_layout.cuh>
 #include <raft/util/bitonic_sort.cuh>
 #include <raft/util/cache.hpp>
@@ -1043,7 +1044,8 @@ template <template <int, bool, typename, typename> class WarpSortClass,
           typename T,
           typename IdxT,
           typename RowLayout>
-void select_k_(int num_of_block,
+void select_k_(bool dry_run,
+               int num_of_block,
                int num_of_warp,
                const T* in,
                const IdxT* in_idx,
@@ -1059,6 +1061,7 @@ void select_k_(int num_of_block,
 {
   rmm::device_uvector<T> tmp_val(num_of_block * k * batch_size, stream, mr);
   rmm::device_uvector<IdxT> tmp_idx(num_of_block * k * batch_size, stream, mr);
+  if (dry_run) { return; }  // tmp_val and tmp_idx are already allocated
 
   int capacity   = bound_by_power_of_two(k);
   int warp_width = std::min(capacity, WarpSize);
@@ -1122,7 +1125,8 @@ void select_k_impl(raft::resources const& res,
   calc_launch_parameter<WarpSortClass, T, IdxT>(
     res, batch_size, len, k, &num_of_block, &num_of_warp);
 
-  select_k_<WarpSortClass, T, IdxT, RowLayout>(num_of_block,
+  select_k_<WarpSortClass, T, IdxT, RowLayout>(resource::get_dry_run_flag(res),
+                                               num_of_block,
                                                num_of_warp,
                                                in,
                                                in_idx,
@@ -1186,6 +1190,7 @@ void select_k(raft::resources const& res,
               bool select_min,
               const IdxT* in_indptr = nullptr)
 {
+  // Dry run: fall through so select_k_ still allocates (and counts) its temporaries.
   ASSERT(k <= kMaxCapacity, "Current max k is %d (requested %d)", kMaxCapacity, k);
   ASSERT(len <= size_t(std::numeric_limits<IdxT>::max()),
          "The `len` (%zu) does not fit the indexing type",
@@ -1199,7 +1204,8 @@ void select_k(raft::resources const& res,
   int len_per_thread = len / (num_of_block * num_of_warp * std::min(capacity, WarpSize));
 
   if (len_per_thread <= LaunchThreshold<warp_sort_immediate>::len_factor_for_choosing) {
-    select_k_<warp_sort_immediate, T, IdxT, RowLayout>(num_of_block,
+    select_k_<warp_sort_immediate, T, IdxT, RowLayout>(resource::get_dry_run_flag(res),
+                                                       num_of_block,
                                                        num_of_warp,
                                                        in,
                                                        in_idx,
@@ -1215,7 +1221,8 @@ void select_k(raft::resources const& res,
   } else {
     calc_launch_parameter<warp_sort_filtered, T, IdxT>(
       res, batch_size, len, k, &num_of_block, &num_of_warp);
-    select_k_<warp_sort_filtered, T, IdxT, RowLayout>(num_of_block,
+    select_k_<warp_sort_filtered, T, IdxT, RowLayout>(resource::get_dry_run_flag(res),
+                                                      num_of_block,
                                                       num_of_warp,
                                                       in,
                                                       in_idx,
diff --git a/cpp/include/raft/matrix/detail/shift.cuh b/cpp/include/raft/matrix/detail/shift.cuh
index f7e9f78a6f..a245c2cf2f 100644
--- a/cpp/include/raft/matrix/detail/shift.cuh
+++ b/cpp/include/raft/matrix/detail/shift.cuh
@@ -7,6 +7,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/matrix/shift_types.hpp>
 
@@ -135,6 +136,7 @@ void shift_dispatch(raft::resources const& handle,
                     ShiftDirection shift_direction = ShiftDirection::TOWARDS_END,
                     ShiftType shift_type           = ShiftType::COL)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   size_t n_rows = in_out.extent(0);
   size_t n_cols = in_out.extent(1);
   size_t TPB    = 256;
@@ -170,6 +172,7 @@ void shift(raft::resources const& handle,
            ShiftDirection shift_direction = ShiftDirection::TOWARDS_END,
            ShiftType shift_type           = ShiftType::COL)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   if (val.has_value()) {
     shift_dispatch<ValueT, IdxT, ValueT, CONSTANT>(
       handle, in_out, val.value(), k, shift_direction, shift_type);
@@ -187,6 +190,7 @@ void shift(raft::resources const& handle,
            ShiftDirection shift_direction = ShiftDirection::TOWARDS_END,
            ShiftType shift_type           = ShiftType::COL)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   size_t k = shift_type == ShiftType::COL ? values.extent(1) : values.extent(0);
   shift_dispatch<ValueT, IdxT, const ValueT*, MATRIX>(
     handle, in_out, values.data_handle(), k, shift_direction, shift_type);
diff --git a/cpp/include/raft/matrix/diagonal.cuh b/cpp/include/raft/matrix/diagonal.cuh
index 9936a3e5be..0363861797 100644
--- a/cpp/include/raft/matrix/diagonal.cuh
+++ b/cpp/include/raft/matrix/diagonal.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/matrix.cuh>
 #include <raft/matrix/init.cuh>
 #include <raft/util/input_validation.hpp>
@@ -31,6 +32,7 @@ void set_diagonal(raft::resources const& handle,
                   raft::device_vector_view<const m_t, idx_t> vec,
                   raft::device_matrix_view<m_t, idx_t, layout> matrix)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(vec.extent(0) == std::min(matrix.extent(0), matrix.extent(1)),
                "Diagonal vector must be min(matrix.n_rows, matrix.n_cols)");
   constexpr auto is_row_major = std::is_same_v<layout, layout_c_contiguous>;
@@ -54,6 +56,7 @@ void get_diagonal(raft::resources const& handle,
                   raft::device_matrix_view<const m_t, idx_t, layout> matrix,
                   raft::device_vector_view<m_t, idx_t> vec)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(vec.extent(0) == std::min(matrix.extent(0), matrix.extent(1)),
                "Diagonal vector must be min(matrix.n_rows, matrix.n_cols)");
   constexpr auto is_row_major = std::is_same_v<layout, layout_c_contiguous>;
@@ -74,6 +77,7 @@ template <typename m_t, typename idx_t, typename layout>
 void invert_diagonal(raft::resources const& handle,
                      raft::device_matrix_view<m_t, idx_t, layout> inout)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   // TODO: Use get_diagonal for this to support rectangular
   RAFT_EXPECTS(inout.extent(0) == inout.extent(1), "Matrix must be square.");
   detail::getDiagonalInverseMatrix(
@@ -94,6 +98,7 @@ void eye(const raft::resources& handle, raft::device_matrix_view<math_t, idx_t,
   RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous");
 
   auto diag = raft::make_device_vector<math_t, idx_t>(handle, min(out.extent(0), out.extent(1)));
+  if (resource::get_dry_run_flag(handle)) { return; }  // diag is already allocated
   RAFT_CUDA_TRY(cudaMemsetAsync(
     out.data_handle(), 0, out.size() * sizeof(math_t), resource::get_cuda_stream(handle)));
   raft::matrix::fill(handle, diag.view(), math_t(1));
diff --git a/cpp/include/raft/matrix/gather.cuh b/cpp/include/raft/matrix/gather.cuh
index 8b02d3827f..f39e58e483 100644
--- a/cpp/include/raft/matrix/gather.cuh
+++ b/cpp/include/raft/matrix/gather.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/matrix/detail/gather.cuh>
 #include <raft/matrix/detail/gather_inplace.cuh>
@@ -209,6 +210,7 @@ void gather(const raft::resources& handle,
             raft::device_matrix_view<matrix_t, idx_t, row_major> out,
             map_xform_t transform_op = raft::identity_op())
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(out.extent(0) == map.extent(0),
                "Number of rows in output matrix must equal the size of the map vector");
   RAFT_EXPECTS(out.extent(1) == in.extent(1),
@@ -254,6 +256,7 @@ void gather(
   raft::device_matrix_view<matrix_t, idx_t, row_major> out,
   map_xform_t transform_op = raft::identity_op())
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(out.extent(0) == map.extent(0),
                "Number of rows in output matrix must equal the size of the map vector");
   RAFT_EXPECTS(out.extent(1) == in.extent(1),
@@ -308,6 +311,7 @@ void gather_if(const raft::resources& handle,
                unary_pred_t pred_op,
                map_xform_t transform_op = raft::identity_op())
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(out.extent(0) == map.extent(0),
                "Number of rows in output matrix must equal the size of the map vector");
   RAFT_EXPECTS(out.extent(1) == in.extent(1),
@@ -365,6 +369,7 @@ void gather_if(const raft::resources& handle,
                unary_pred_t pred_op,
                map_xform_t transform_op = raft::identity_op())
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(out.extent(0) == map.extent(0),
                "Number of rows in output matrix must equal the size of the map vector");
   RAFT_EXPECTS(out.extent(1) == in.extent(1),
diff --git a/cpp/include/raft/matrix/init.cuh b/cpp/include/raft/matrix/init.cuh
index 58e0c4ed22..dcdb4c8f5f 100644
--- a/cpp/include/raft/matrix/init.cuh
+++ b/cpp/include/raft/matrix/init.cuh
@@ -9,6 +9,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/linalg/map.cuh>
 #include <raft/matrix/detail/math.cuh>
 
@@ -36,6 +37,7 @@ void fill(raft::resources const& handle,
           raft::device_mdspan<math_t, extents, layout> out,
           raft::host_scalar_view<math_t> scalar)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(raft::is_row_or_column_major(out), "Data layout not supported");
   RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must be the same size.");
   RAFT_EXPECTS(scalar.data_handle() != nullptr, "Empty scalar");
@@ -60,6 +62,7 @@ void fill(raft::resources const& handle,
           raft::device_mdspan<math_t, extents, layout> inout,
           math_t scalar)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   linalg::map(handle, inout, raft::const_op{scalar});
 }
 
diff --git a/cpp/include/raft/matrix/linewise_op.cuh b/cpp/include/raft/matrix/linewise_op.cuh
index 788d9e0044..ed428064fb 100644
--- a/cpp/include/raft/matrix/linewise_op.cuh
+++ b/cpp/include/raft/matrix/linewise_op.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/core/types.hpp>
 #include <raft/matrix/detail/linewise_op.cuh>
@@ -62,6 +63,7 @@ void linewise_op(raft::resources const& handle,
                  Lambda op,
                  vec_t... vecs)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   constexpr auto is_rowmajor = std::is_same_v<layout, row_major>;
   constexpr auto is_colmajor = std::is_same_v<layout, col_major>;
 
@@ -97,6 +99,7 @@ void linewise_op(raft::resources const& handle,
                  Lambda op,
                  vec_t... vecs)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   constexpr auto is_rowmajor = std::is_same_v<layout, raft::layout_right_padded<m_t>>;
   constexpr auto is_colmajor = std::is_same_v<layout, raft::layout_left_padded<m_t>>;
 
diff --git a/cpp/include/raft/matrix/norm.cuh b/cpp/include/raft/matrix/norm.cuh
index a04f1da1da..b573a9d7de 100644
--- a/cpp/include/raft/matrix/norm.cuh
+++ b/cpp/include/raft/matrix/norm.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/matrix.cuh>
 
 namespace raft {
@@ -27,6 +28,7 @@ namespace matrix {
 template <typename m_t, typename idx_t>
 m_t l2_norm(raft::resources const& handle, raft::device_mdspan<const m_t, idx_t> in)
 {
+  if (resource::get_dry_run_flag(handle)) { return {}; }
   return detail::getL2Norm(handle, in.data_handle(), in.size(), resource::get_cuda_stream(handle));
 }
 
diff --git a/cpp/include/raft/matrix/power.cuh b/cpp/include/raft/matrix/power.cuh
index 57160e39c0..fab14e4883 100644
--- a/cpp/include/raft/matrix/power.cuh
+++ b/cpp/include/raft/matrix/power.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/math.cuh>
 
 namespace raft {
@@ -34,6 +35,7 @@ void weighted_power(raft::resources const& handle,
                     raft::device_matrix_view<math_t, idx_t, layout> out,
                     math_t scalar)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(in.size() == out.size(), "Size of input and output matrices must be equal");
   detail::power(
     in.data_handle(), out.data_handle(), scalar, in.size(), resource::get_cuda_stream(handle));
@@ -53,6 +55,7 @@ void weighted_power(raft::resources const& handle,
                     raft::device_matrix_view<math_t, idx_t, layout> inout,
                     math_t scalar)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::power(inout.data_handle(), scalar, inout.size(), resource::get_cuda_stream(handle));
 }
 
@@ -67,6 +70,7 @@ void weighted_power(raft::resources const& handle,
 template <typename math_t, typename idx_t, typename layout>
 void power(raft::resources const& handle, raft::device_matrix_view<math_t, idx_t, layout> inout)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::power<math_t>(inout.data_handle(), inout.size(), resource::get_cuda_stream(handle));
 }
 
@@ -85,6 +89,7 @@ void power(raft::resources const& handle,
            raft::device_matrix_view<const math_t, idx_t, layout> in,
            raft::device_matrix_view<math_t, idx_t, layout> out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must be same size.");
   detail::power<math_t>(
     in.data_handle(), out.data_handle(), in.size(), resource::get_cuda_stream(handle));
diff --git a/cpp/include/raft/matrix/print.cuh b/cpp/include/raft/matrix/print.cuh
index 77a0440870..ce4532a642 100644
--- a/cpp/include/raft/matrix/print.cuh
+++ b/cpp/include/raft/matrix/print.cuh
@@ -9,6 +9,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/matrix.cuh>
 #include <raft/matrix/matrix_types.hpp>
 
@@ -33,6 +34,7 @@ void print(raft::resources const& handle,
            raft::device_matrix_view<const m_t, idx_t, col_major> in,
            print_separators& separators)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::print(in.data_handle(),
                 in.extent(0),
                 in.extent(1),
diff --git a/cpp/include/raft/matrix/reciprocal.cuh b/cpp/include/raft/matrix/reciprocal.cuh
index c4aa5be870..aba28d6eed 100644
--- a/cpp/include/raft/matrix/reciprocal.cuh
+++ b/cpp/include/raft/matrix/reciprocal.cuh
@@ -9,6 +9,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/math.cuh>
 
 namespace raft {
@@ -39,6 +40,7 @@ void reciprocal(raft::resources const& handle,
                 bool setzero = false,
                 math_t thres = 1e-15)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must have the same size.");
   detail::reciprocal<math_t>(in.data_handle(),
                              out.data_handle(),
@@ -68,6 +70,7 @@ void reciprocal(raft::resources const& handle,
                 bool setzero = false,
                 math_t thres = 1e-15)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::reciprocal<math_t>(inout.data_handle(),
                              *(scalar.data_handle()),
                              inout.size(),
diff --git a/cpp/include/raft/matrix/reverse.cuh b/cpp/include/raft/matrix/reverse.cuh
index 240a3c3eba..1e5a031566 100644
--- a/cpp/include/raft/matrix/reverse.cuh
+++ b/cpp/include/raft/matrix/reverse.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/matrix.cuh>
 #include <raft/util/input_validation.hpp>
 
@@ -29,6 +30,7 @@ template <typename m_t, typename idx_t, typename layout_t>
 void col_reverse(raft::resources const& handle,
                  raft::device_matrix_view<m_t, idx_t, layout_t> inout)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Unsupported matrix layout");
   if (raft::is_col_major(inout)) {
     detail::colReverse(
@@ -49,6 +51,7 @@ template <typename m_t, typename idx_t, typename layout_t>
 void row_reverse(raft::resources const& handle,
                  raft::device_matrix_view<m_t, idx_t, layout_t> inout)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Unsupported matrix layout");
   if (raft::is_col_major(inout)) {
     detail::rowReverse(
diff --git a/cpp/include/raft/matrix/sign_flip.cuh b/cpp/include/raft/matrix/sign_flip.cuh
index 4e680f63e7..519986a46d 100644
--- a/cpp/include/raft/matrix/sign_flip.cuh
+++ b/cpp/include/raft/matrix/sign_flip.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/matrix.cuh>
 
 namespace raft {
@@ -30,6 +31,7 @@ template <typename math_t, typename idx_t>
 void sign_flip(raft::resources const& handle,
                raft::device_matrix_view<math_t, idx_t, col_major> inout)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::signFlip(
     inout.data_handle(), inout.extent(0), inout.extent(1), resource::get_cuda_stream(handle));
 }
diff --git a/cpp/include/raft/matrix/slice.cuh b/cpp/include/raft/matrix/slice.cuh
index eb2f974f62..4ae69fb98b 100644
--- a/cpp/include/raft/matrix/slice.cuh
+++ b/cpp/include/raft/matrix/slice.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/matrix.cuh>
 #include <raft/util/input_validation.hpp>
 
@@ -48,6 +49,7 @@ void slice(raft::resources const& handle,
            raft::device_matrix_view<m_t, idx_t, layout_t> out,
            slice_coordinates<idx_t> coords)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(raft::is_row_or_column_major(in), "Matrix layout must be row- or column-major");
   RAFT_EXPECTS(coords.row2 > coords.row1, "row2 must be > row1");
   RAFT_EXPECTS(coords.col2 > coords.col1, "col2 must be > col1");
diff --git a/cpp/include/raft/matrix/sqrt.cuh b/cpp/include/raft/matrix/sqrt.cuh
index 468b0ba8c4..771a58e7e4 100644
--- a/cpp/include/raft/matrix/sqrt.cuh
+++ b/cpp/include/raft/matrix/sqrt.cuh
@@ -9,6 +9,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/math.cuh>
 
 namespace raft {
@@ -33,6 +34,7 @@ void sqrt(raft::resources const& handle,
           raft::device_matrix_view<const math_t, idx_t, layout> in,
           raft::device_matrix_view<math_t, idx_t, layout> out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must have same size.");
   detail::seqRoot(
     in.data_handle(), out.data_handle(), in.size(), resource::get_cuda_stream(handle));
@@ -49,6 +51,7 @@ void sqrt(raft::resources const& handle,
 template <typename math_t, typename idx_t, typename layout>
 void sqrt(raft::resources const& handle, raft::device_matrix_view<math_t, idx_t, layout> inout)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::seqRoot(inout.data_handle(), inout.size(), resource::get_cuda_stream(handle));
 }
 
@@ -70,6 +73,7 @@ void weighted_sqrt(raft::resources const& handle,
                    raft::host_scalar_view<math_t> scalar,
                    bool set_neg_zero = false)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must have same size.");
   detail::seqRoot(in.data_handle(),
                   out.data_handle(),
@@ -95,6 +99,7 @@ void weighted_sqrt(raft::resources const& handle,
                    raft::host_scalar_view<math_t> scalar,
                    bool set_neg_zero = false)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::seqRoot(inout.data_handle(),
                   *(scalar.data_handle()),
                   inout.size(),
diff --git a/cpp/include/raft/matrix/threshold.cuh b/cpp/include/raft/matrix/threshold.cuh
index dcf11388fe..d904d8d6d6 100644
--- a/cpp/include/raft/matrix/threshold.cuh
+++ b/cpp/include/raft/matrix/threshold.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/matrix.cuh>
 
 namespace raft {
@@ -34,6 +35,7 @@ void zero_small_values(raft::resources const& handle,
                        raft::device_matrix_view<math_t, idx_t, layout> out,
                        math_t thres = 1e-15)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must have same size");
   detail::setSmallValuesZero(
     out.data_handle(), in.data_handle(), in.size(), resource::get_cuda_stream(handle), thres);
@@ -53,6 +55,7 @@ void zero_small_values(raft::resources const& handle,
                        raft::device_matrix_view<math_t, idx_t, layout> inout,
                        math_t thres = 1e-15)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::setSmallValuesZero(
     inout.data_handle(), inout.size(), resource::get_cuda_stream(handle), thres);
 }
diff --git a/cpp/include/raft/matrix/triangular.cuh b/cpp/include/raft/matrix/triangular.cuh
index d7b4659c4b..c832237e9a 100644
--- a/cpp/include/raft/matrix/triangular.cuh
+++ b/cpp/include/raft/matrix/triangular.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/detail/matrix.cuh>
 
 namespace raft {
@@ -29,6 +30,7 @@ void upper_triangular(raft::resources const& handle,
                       raft::device_matrix_view<const m_t, idx_t, col_major> src,
                       raft::device_matrix_view<m_t, idx_t, col_major> dst)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   auto k = std::min(src.extent(0), src.extent(1));
   RAFT_EXPECTS(k == dst.extent(0) && k == dst.extent(1),
                "dst should be of size kxk, k = min(n_rows, n_cols)");
diff --git a/cpp/include/raft/mr/dry_run_resource.hpp b/cpp/include/raft/mr/dry_run_resource.hpp
new file mode 100644
index 0000000000..6a41c2c6f8
--- /dev/null
+++ b/cpp/include/raft/mr/dry_run_resource.hpp
@@ -0,0 +1,223 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <cuda/memory_resource>
+#include <cuda/stream_ref>
+#include <cuda_runtime_api.h>
+
+#include <atomic>
+#include <cstddef>
+#include <memory>
+#include <mutex>
+#include <type_traits>
+#include <utility>
+
+namespace raft::mr {
+
+namespace detail {
+
+/**
+ * @brief Counter of currently-allocated and peak bytes, built on relaxed std::atomic operations.
+ */
+struct dry_run_memory_counter {
+  // Adds `bytes` to the running total and lifts the stored peak to that total when it is larger.
+  void record_allocate(std::size_t bytes) noexcept
+  {
+    constexpr auto relaxed  = std::memory_order_relaxed;
+    std::size_t const total = allocated_bytes_.fetch_add(bytes, relaxed) + bytes;
+    std::size_t seen        = peak_bytes_.load(relaxed);
+    while (seen < total && !peak_bytes_.compare_exchange_weak(seen, total, relaxed, relaxed)) {}
+  }
+  // Subtracts `bytes` from the running total; the peak is not modified.
+  void record_deallocate(std::size_t bytes) noexcept
+  {
+    allocated_bytes_.fetch_sub(bytes, std::memory_order_relaxed);
+  }
+  // Current running total of recorded bytes.
+  [[nodiscard]] auto get_allocated_bytes() const noexcept -> std::size_t
+  {
+    return allocated_bytes_.load(std::memory_order_relaxed);
+  }
+  // Largest running total observed by record_allocate so far.
+  [[nodiscard]] auto get_peak_bytes() const noexcept -> std::size_t
+  {
+    return peak_bytes_.load(std::memory_order_relaxed);
+  }
+
+ private:
+  std::atomic<std::size_t> allocated_bytes_{0};
+  std::atomic<std::size_t> peak_bytes_{0};
+};
+
+/**
+ * @brief Minimal RAII container for a single allocation, released the same way it was acquired.
+ *
+ *   - Sync: (MR, size, alignment) -- calls allocate_sync, destructor calls deallocate_sync
+ *   - Async: (MR, stream, size, alignment) -- calls allocate, destructor calls deallocate
+ *     on cudaStreamPerThread (the allocating stream is not retained)
+ *
+ * @tparam MR  Memory resource type, stored by value (use a ref type for non-owning).
+ */
+template <typename MR>
+class probe_container {
+  MR mr_;
+  void* ptr_;
+  std::size_t size_;
+  std::size_t alignment_;
+  bool async_;  // true when ptr_ came from allocate(stream, ...) rather than allocate_sync(...)
+ public:
+  template <typename M = MR, std::enable_if_t<cuda::mr::synchronous_resource<M>, int> = 0>
+  probe_container(MR mr, std::size_t size, std::size_t alignment = alignof(std::max_align_t))
+    : mr_(std::move(mr)), ptr_(nullptr), size_(size), alignment_(alignment), async_(false)
+  {
+    ptr_ = mr_.allocate_sync(size_, alignment_);
+  }
+
+  template <typename M = MR, std::enable_if_t<cuda::mr::resource<M>, int> = 0>
+  probe_container(MR mr,
+                  cuda::stream_ref stream,
+                  std::size_t size,
+                  std::size_t alignment = alignof(std::max_align_t))
+    : mr_(std::move(mr)), ptr_(nullptr), size_(size), alignment_(alignment), async_(true)
+  {
+    ptr_ = mr_.allocate(stream, size_, alignment_);
+  }
+
+  ~probe_container()
+  {
+    if (ptr_ == nullptr) return;
+    if constexpr (cuda::mr::resource<MR>) {
+      if (async_) {
+        mr_.deallocate(cuda::stream_ref{cudaStreamPerThread}, ptr_, size_, alignment_);
+        return;
+      }
+    }
+    mr_.deallocate_sync(ptr_, size_, alignment_);
+  }
+
+  probe_container(probe_container const&)            = delete;
+  probe_container& operator=(probe_container const&) = delete;
+  probe_container(probe_container&&)                 = delete;
+  probe_container& operator=(probe_container&&)      = delete;
+  [[nodiscard]] auto data() const noexcept -> void* { return ptr_; }
+};
+
+}  // namespace detail
+
+inline constexpr std::size_t kDryRunProbeSize = 256;  // bytes requested for the one real probe
+
+/**
+ * @brief Resource adaptor that returns a single probed pointer for every allocation
+ *        and tracks peak usage while holding only one kDryRunProbeSize-byte upstream block.
+ *
+ * Modeled after raft::mr::statistics_adaptor: a single template handles host,
+ * device, pinned, and managed resources depending on the Upstream type.
+ *
+ * Properties are forwarded from Upstream via ADL friend get_property, so
+ * dry_run_resource<host_resource_ref> satisfies host_accessible,
+ * dry_run_resource<host_device_resource_ref> satisfies host + device accessible,
+ * and dry_run_resource<rmm::device_async_resource_ref> satisfies device_accessible.
+ *
+ * @tparam Upstream  Stored by value.  Use a ref type for non-owning semantics.
+ */
+template <typename Upstream>
+class dry_run_resource : public cuda::forward_property<dry_run_resource<Upstream>, Upstream> {
+  Upstream upstream_;
+  // Bookkeeping that lives behind a shared_ptr rather than inside the adaptor object itself.
+  struct shared_state {
+    detail::dry_run_memory_counter counter;  // requested bytes: current and peak
+    std::once_flag probe_flag;  // guards the one-time creation of `probe`
+    std::unique_ptr<detail::probe_container<Upstream>> probe;  // null until the first allocation
+  };
+  std::shared_ptr<shared_state> state_;  // shared by copies; empty after being moved from
+
+ public:
+  template <typename U, std::enable_if_t<std::is_same_v<std::decay_t<U>, Upstream>, int> = 0>
+  explicit dry_run_resource(U&& upstream)
+    : upstream_(std::forward<U>(upstream)), state_(std::make_shared<shared_state>())
+  {
+  }
+
+  // NVCC injects __host__ __device__ on std::shared_ptr special members,
+  // which makes the *implicit* or *defaulted* special members __host__
+  // __device__ too.  That conflicts with Upstream types whose special
+  // members are __host__ only (e.g. rmm::device_async_resource_ref).
+  // User-defined bodies (not = default) force plain __host__ execution space.
+  dry_run_resource(dry_run_resource&& other) noexcept
+    : upstream_(std::move(other.upstream_)), state_(std::move(other.state_))
+  {
+  }
+  dry_run_resource(dry_run_resource const& other) : upstream_(other.upstream_), state_(other.state_)
+  {
+  }
+  dry_run_resource& operator=(dry_run_resource&& other) noexcept
+  {
+    upstream_ = std::move(other.upstream_);
+    state_    = std::move(other.state_);
+    return *this;
+  }
+  dry_run_resource& operator=(dry_run_resource const& other)
+  {
+    upstream_ = other.upstream_;
+    state_    = other.state_;
+    return *this;
+  }
+  // Hands out the shared counter; the returned pointer also keeps the shared state alive.
+  [[nodiscard]] auto get_counter() const noexcept -> std::shared_ptr<detail::dry_run_memory_counter>
+  {
+    return {state_, &state_->counter};
+  }
+  // Records `bytes` and returns the probe pointer; the first allocation creates the probe.
+  void* allocate_sync(std::size_t bytes, std::size_t alignment = alignof(std::max_align_t))
+  {
+    std::call_once(state_->probe_flag, [&] {  // the same once_flag is used by allocate()
+      state_->probe =
+        std::make_unique<detail::probe_container<Upstream>>(upstream_, kDryRunProbeSize, alignment);
+    });
+    state_->counter.record_allocate(bytes);
+    return state_->probe->data();  // same pointer regardless of bytes/alignment
+  }
+  // Only subtracts `bytes` from the counter; the pointer and alignment arguments are ignored.
+  void deallocate_sync(void*, std::size_t bytes, std::size_t = alignof(std::max_align_t)) noexcept
+  {
+    state_->counter.record_deallocate(bytes);
+  }
+  // Like allocate_sync, but a probe created by this call is requested from Upstream on `stream`.
+  template <typename U = Upstream, std::enable_if_t<cuda::mr::resource<U>, int> = 0>
+  void* allocate(cuda::stream_ref stream,
+                 std::size_t bytes,
+                 std::size_t alignment = alignof(std::max_align_t))
+  {
+    std::call_once(state_->probe_flag, [&] {  // the same once_flag is used by allocate_sync()
+      state_->probe = std::make_unique<detail::probe_container<Upstream>>(
+        upstream_, stream, kDryRunProbeSize, alignment);
+    });
+    state_->counter.record_allocate(bytes);
+    return state_->probe->data();  // same pointer regardless of bytes/alignment
+  }
+  // Only subtracts `bytes` from the counter; the stream, pointer and alignment are ignored.
+  template <typename U = Upstream, std::enable_if_t<cuda::mr::resource<U>, int> = 0>
+  void deallocate(cuda::stream_ref,
+                  void*,
+                  std::size_t bytes,
+                  std::size_t = alignof(std::max_align_t)) noexcept
+  {
+    state_->counter.record_deallocate(bytes);
+  }
+  // Equality compares the upstream resources only; the shared state is not considered.
+  [[nodiscard]] bool operator==(dry_run_resource const& other) const noexcept
+  {
+    return upstream_ == other.upstream_;
+  }
+  // Access to the wrapped upstream resource.
+  [[nodiscard]] auto upstream_resource() noexcept -> Upstream& { return upstream_; }
+  [[nodiscard]] auto upstream_resource() const noexcept -> Upstream const& { return upstream_; }
+};
+
+template <typename Upstream>  // deduction guide: the by-value parameter deduces a decayed Upstream
+dry_run_resource(Upstream) -> dry_run_resource<Upstream>;
+
+}  // namespace raft::mr
diff --git a/cpp/include/raft/random/detail/make_blobs.cuh b/cpp/include/raft/random/detail/make_blobs.cuh
index 3132bb0bae..e483fc7575 100644
--- a/cpp/include/raft/random/detail/make_blobs.cuh
+++ b/cpp/include/raft/random/detail/make_blobs.cuh
@@ -204,7 +204,8 @@ void generate_data(DataT* out,
  * @param[in]  type               RNG type
  */
 template <typename DataT, typename IdxT>
-void make_blobs_caller(DataT* out,
+void make_blobs_caller(bool dry_run,
+                       DataT* out,
                        IdxT* labels,
                        IdxT n_rows,
                        IdxT n_cols,
@@ -222,16 +223,18 @@ void make_blobs_caller(DataT* out,
 {
   raft::random::RngState r(seed, type);
   // use the right centers buffer for data generation
-  rmm::device_uvector<DataT> rand_centers(0, stream);
+  rmm::device_uvector<DataT> rand_centers(centers == nullptr ? n_clusters * n_cols : 0, stream);
   const DataT* _centers;
   if (centers == nullptr) {
-    rand_centers.resize(n_clusters * n_cols, stream);
-    detail::uniform(
-      r, rand_centers.data(), n_clusters * n_cols, center_box_min, center_box_max, stream);
     _centers = rand_centers.data();
   } else {
     _centers = centers;
   }
+  if (dry_run) { return; }
+  if (centers == nullptr) {
+    detail::uniform(
+      r, rand_centers.data(), n_clusters * n_cols, center_box_min, center_box_max, stream);
+  }
   generate_labels(labels, n_rows, n_clusters, shuffle, r, stream);
   generate_data(out,
                 labels,
diff --git a/cpp/include/raft/random/detail/make_regression.cuh b/cpp/include/raft/random/detail/make_regression.cuh
index 773eae7b39..668d534f61 100644
--- a/cpp/include/raft/random/detail/make_regression.cuh
+++ b/cpp/include/raft/random/detail/make_regression.cuh
@@ -10,6 +10,7 @@
 #pragma once
 
 #include <raft/core/detail/macros.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/gemm.cuh>
@@ -53,25 +54,34 @@ static void _make_low_rank_matrix(raft::resources const& handle,
                                   raft::random::RngState& r,
                                   cudaStream_t stream)
 {
-  IdxT n = std::min(n_rows, n_cols);
+  bool is_dry_run = resource::get_dry_run_flag(handle);
+  IdxT n          = std::min(n_rows, n_cols);
 
   // Generate random (ortho normal) vectors with QR decomposition
   rmm::device_uvector<DataT> rd_mat_0(n_rows * n, stream);
   rmm::device_uvector<DataT> rd_mat_1(n_cols * n, stream);
-  normal(r, rd_mat_0.data(), n_rows * n, (DataT)0.0, (DataT)1.0, stream);
-  normal(r, rd_mat_1.data(), n_cols * n, (DataT)0.0, (DataT)1.0, stream);
+  if (!is_dry_run) {
+    normal(r, rd_mat_0.data(), n_rows * n, (DataT)0.0, (DataT)1.0, stream);
+    normal(r, rd_mat_1.data(), n_cols * n, (DataT)0.0, (DataT)1.0, stream);
+  }
   rmm::device_uvector<DataT> q0(n_rows * n, stream);
   rmm::device_uvector<DataT> q1(n_cols * n, stream);
-  raft::linalg::qrGetQ(handle, rd_mat_0.data(), q0.data(), n_rows, n, stream);
-  raft::linalg::qrGetQ(handle, rd_mat_1.data(), q1.data(), n_cols, n, stream);
+  if (!is_dry_run) {
+    raft::linalg::qrGetQ(handle, rd_mat_0.data(), q0.data(), n_rows, n, stream);
+    raft::linalg::qrGetQ(handle, rd_mat_1.data(), q1.data(), n_cols, n, stream);
+  }
 
   // Build the singular profile by assembling signal and noise components
   rmm::device_uvector<DataT> singular_vec(n, stream);
-  _singular_profile_kernel<<<raft::ceildiv<IdxT>(n, 256), 256, 0, stream>>>(
-    singular_vec.data(), n, tail_strength, effective_rank);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  if (!is_dry_run) {
+    _singular_profile_kernel<<<raft::ceildiv<IdxT>(n, 256), 256, 0, stream>>>(
+      singular_vec.data(), n, tail_strength, effective_rank);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
   rmm::device_uvector<DataT> singular_mat(n * n, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(singular_mat.data(), 0, n * n * sizeof(DataT), stream));
+  if (!is_dry_run) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(singular_mat.data(), 0, n * n * sizeof(DataT), stream));
+  }
 
   raft::matrix::set_diagonal(handle,
                              make_device_vector_view<const DataT, IdxT>(singular_vec.data(), n),
@@ -152,55 +162,41 @@ void make_regression_caller(raft::resources const& handle,
                             uint64_t seed                    = 0ULL,
                             raft::random::GeneratorType type = raft::random::GenPC)
 {
-  n_informative = std::min(n_informative, n_cols);
+  bool is_dry_run = resource::get_dry_run_flag(handle);
+  n_informative   = std::min(n_informative, n_cols);
 
   raft::random::RngState r(seed, type);
 
   if (effective_rank < 0) {
     // Randomly generate a well conditioned input set
-    normal(r, out, n_rows * n_cols, (DataT)0.0, (DataT)1.0, stream);
+    if (!is_dry_run) { normal(r, out, n_rows * n_cols, (DataT)0.0, (DataT)1.0, stream); }
   } else {
     // Randomly generate a low rank, fat tail input set
     _make_low_rank_matrix(handle, out, n_rows, n_cols, effective_rank, tail_strength, r, stream);
   }
 
   // Use the right output buffer for the values
-  rmm::device_uvector<DataT> tmp_values(0, stream);
-  DataT* _values;
-  if (shuffle) {
-    tmp_values.resize(n_rows * n_targets, stream);
-    _values = tmp_values.data();
-  } else {
-    _values = values;
-  }
+  rmm::device_uvector<DataT> tmp_values(shuffle ? n_rows * n_targets : 0, stream);
+  DataT* _values = shuffle ? tmp_values.data() : values;
   // Create a column-major matrix of output values only if it has more
   // than 1 column
-  rmm::device_uvector<DataT> values_col(0, stream);
-  DataT* _values_col;
-  if (n_targets > 1) {
-    values_col.resize(n_rows * n_targets, stream);
-    _values_col = values_col.data();
-  } else {
-    _values_col = _values;
-  }
+  rmm::device_uvector<DataT> values_col(n_targets > 1 ? n_rows * n_targets : 0, stream);
+  DataT* _values_col = n_targets > 1 ? values_col.data() : _values;
 
   // Use the right buffer for the coefficients
-  rmm::device_uvector<DataT> tmp_coef(0, stream);
-  DataT* _coef;
-  if (coef != nullptr && !shuffle) {
-    _coef = coef;
-  } else {
-    tmp_coef.resize(n_cols * n_targets, stream);
-    _coef = tmp_coef.data();
-  }
+  rmm::device_uvector<DataT> tmp_coef((coef != nullptr && !shuffle) ? 0 : n_cols * n_targets,
+                                      stream);
+  DataT* _coef = tmp_coef.size() == 0 ? coef : tmp_coef.data();
 
   // Generate a ground truth model with only n_informative features
-  uniform(r, _coef, n_informative * n_targets, (DataT)1.0, (DataT)100.0, stream);
-  if (coef && n_informative != n_cols) {
-    RAFT_CUDA_TRY(cudaMemsetAsync(_coef + n_informative * n_targets,
-                                  0,
-                                  (n_cols - n_informative) * n_targets * sizeof(DataT),
-                                  stream));
+  if (!is_dry_run) {
+    uniform(r, _coef, n_informative * n_targets, (DataT)1.0, (DataT)100.0, stream);
+    if (coef && n_informative != n_cols) {
+      RAFT_CUDA_TRY(cudaMemsetAsync(_coef + n_informative * n_targets,
+                                    0,
+                                    (n_cols - n_informative) * n_targets * sizeof(DataT),
+                                    stream));
+    }
   }
 
   // Compute the output values
@@ -226,15 +222,16 @@ void make_regression_caller(raft::resources const& handle,
     raft::linalg::transpose(handle, _values_col, _values, n_rows, n_targets, stream);
   }
 
-  if (bias != 0.0) {
-    // Add bias
-    raft::linalg::addScalar(_values, _values, bias, n_rows * n_targets, stream);
+  if (!is_dry_run) {
+    if (bias != 0.0) {
+      // Add bias
+      raft::linalg::addScalar(_values, _values, bias, n_rows * n_targets, stream);
+    }
   }
 
-  rmm::device_uvector<DataT> white_noise(0, stream);
-  if (noise != 0.0) {
+  rmm::device_uvector<DataT> white_noise(noise != 0.0 ? n_rows * n_targets : 0, stream);
+  if (noise != 0.0 && !is_dry_run) {
     // Add white noise
-    white_noise.resize(n_rows * n_targets, stream);
     normal(r, white_noise.data(), n_rows * n_targets, (DataT)0.0, noise, stream);
     raft::linalg::add(_values, _values, white_noise.data(), n_rows * n_targets, stream);
   }
@@ -244,26 +241,28 @@ void make_regression_caller(raft::resources const& handle,
     rmm::device_uvector<IdxT> perms_samples(n_rows, stream);
     rmm::device_uvector<IdxT> perms_features(n_cols, stream);
 
-    constexpr IdxT Nthreads = 256;
+    if (!is_dry_run) {
+      constexpr IdxT Nthreads = 256;
 
-    // Shuffle the samples from out to tmp_out
-    raft::random::permute<DataT, IdxT, IdxT>(
-      perms_samples.data(), tmp_out.data(), out, n_cols, n_rows, true, stream);
-    IdxT nblks_rows = raft::ceildiv<IdxT>(n_rows, Nthreads);
-    _gather2d_kernel<<<nblks_rows, Nthreads, 0, stream>>>(
-      values, _values, perms_samples.data(), n_rows, n_targets);
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-    // Shuffle the features from tmp_out to out
-    raft::random::permute<DataT, IdxT, IdxT>(
-      perms_features.data(), out, tmp_out.data(), n_rows, n_cols, false, stream);
-
-    // Shuffle the coefficients accordingly
-    if (coef != nullptr) {
-      IdxT nblks_cols = raft::ceildiv<IdxT>(n_cols, Nthreads);
-      _gather2d_kernel<<<nblks_cols, Nthreads, 0, stream>>>(
-        coef, _coef, perms_features.data(), n_cols, n_targets);
+      // Shuffle the samples from out to tmp_out
+      raft::random::permute<DataT, IdxT, IdxT>(
+        perms_samples.data(), tmp_out.data(), out, n_cols, n_rows, true, stream);
+      IdxT nblks_rows = raft::ceildiv<IdxT>(n_rows, Nthreads);
+      _gather2d_kernel<<<nblks_rows, Nthreads, 0, stream>>>(
+        values, _values, perms_samples.data(), n_rows, n_targets);
       RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+      // Shuffle the features from tmp_out to out
+      raft::random::permute<DataT, IdxT, IdxT>(
+        perms_features.data(), out, tmp_out.data(), n_rows, n_cols, false, stream);
+
+      // Shuffle the coefficients accordingly
+      if (coef != nullptr) {
+        IdxT nblks_cols = raft::ceildiv<IdxT>(n_cols, Nthreads);
+        _gather2d_kernel<<<nblks_cols, Nthreads, 0, stream>>>(
+          coef, _coef, perms_features.data(), n_cols, n_targets);
+        RAFT_CUDA_TRY(cudaPeekAtLastError());
+      }
     }
   }
 }
diff --git a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh
index b4291b950a..0b75cb88fa 100644
--- a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh
+++ b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh
@@ -10,6 +10,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/cusolver_dn_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/linalg/detail/cusolver_wrappers.hpp>
@@ -75,7 +76,8 @@ template <typename T>
 void matVecAdd(
   T* out, const T* in_m, const T* in_v, T scalar, int rows, int cols, cudaStream_t stream)
 {
-  raft::linalg::matrixVectorOp<true, true>(
+  raft::linalg::detail::matrixVectorOp<true, true>(
+    false,
     out,
     in_m,
     in_v,
@@ -185,6 +187,7 @@ class multi_variable_gaussian_impl {
 
   void give_gaussian(const int nPoints, T* P, T* X, const T* x = 0)
   {
+    if (resource::get_dry_run_flag(handle)) { return; }
     auto cusolverHandle = resource::get_cusolver_dn_handle(handle);
     auto cudaStream     = resource::get_cuda_stream(handle);
     if (method == chol_decomp) {
diff --git a/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh b/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
index 044ef84171..77c541ee21 100644
--- a/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
+++ b/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
@@ -9,6 +9,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/random/rng_device.cuh>
 #include <raft/random/rng_state.hpp>
@@ -205,6 +206,7 @@ void rmat_rectangular_gen_impl(raft::resources const& handle,
                                IdxT r_scale,
                                IdxT c_scale)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   static_assert(std::is_integral_v<IdxT>,
                 "rmat_rectangular_gen: "
                 "Template parameter IdxT must be an integral type");
@@ -260,6 +262,7 @@ void rmat_rectangular_gen_impl(raft::resources const& handle,
                                IdxT r_scale,
                                IdxT c_scale)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   static_assert(std::is_integral_v<IdxT>,
                 "rmat_rectangular_gen: "
                 "Template parameter IdxT must be an integral type");
diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh
index 683114318a..3c3ad9963d 100644
--- a/cpp/include/raft/random/detail/rng_impl.cuh
+++ b/cpp/include/raft/random/detail/rng_impl.cuh
@@ -13,12 +13,12 @@
 #include <raft/random/rng_device.cuh>
 #include <raft/random/rng_state.hpp>
 #include <raft/util/cudart_utils.hpp>
-#include <raft/util/detail/cub_wrappers.cuh>
 #include <raft/util/scatter.cuh>
 
 #include <rmm/device_scalar.hpp>
 
 #include <cub/device/device_merge_sort.cuh>
+#include <cub/device/device_radix_sort.cuh>
 #include <cub/device/device_scan.cuh>
 #include <cub/device/device_select.cuh>
 #include <cuda_fp16.h>
@@ -251,7 +251,8 @@ void call_sample_with_replacement_kernel(DeviceState<GenType> const& dev_state,
 }
 
 template <typename OutType, typename WeightType, typename IndexType = OutType>
-std::enable_if_t<std::is_integral_v<OutType>> discrete(RngState& rng_state,
+std::enable_if_t<std::is_integral_v<OutType>> discrete(bool dry_run,
+                                                       RngState& rng_state,
                                                        OutType* ptr,
                                                        const WeightType* weights,
                                                        IndexType sampledLen,
@@ -264,6 +265,9 @@ std::enable_if_t<std::is_integral_v<OutType>> discrete(RngState& rng_state,
   cub::DeviceScan::InclusiveSum(
     nullptr, temp_storage_bytes, weights, weights_csum.data(), len, stream);
   rmm::device_uvector<uint8_t> temp_storage(temp_storage_bytes, stream);
+
+  if (dry_run) { return; }
+
   cub::DeviceScan::InclusiveSum(
     temp_storage.data(), temp_storage_bytes, weights, weights_csum.data(), len, stream);
 
@@ -280,7 +284,8 @@ std::enable_if_t<std::is_integral_v<OutType>> discrete(RngState& rng_state,
 
 /** Note the memory space requirements are O(4*len) */
 template <typename DataT, typename WeightsT, typename IdxT = int>
-void sampleWithoutReplacement(RngState& rng_state,
+void sampleWithoutReplacement(bool dry_run,
+                              RngState& rng_state,
                               DataT* out,
                               IdxT* outIdx,
                               const DataT* in,
@@ -301,13 +306,37 @@ void sampleWithoutReplacement(RngState& rng_state,
   params.inIdxPtr = inIdxPtr;
   params.wts      = wts;
 
+  // Size and allocate the SortPairs workspace before the dry-run return so it is tracked
+  size_t workspace_size = 0;
+  cub::DeviceRadixSort::SortPairs(nullptr,
+                                  workspace_size,
+                                  expWts.data(),
+                                  sortedWts.data(),
+                                  inIdxPtr,
+                                  outIdxBuff.data(),
+                                  (int)len,
+                                  0,
+                                  sizeof(WeightsT) * 8,
+                                  stream);
+  rmm::device_uvector<char> workspace(workspace_size, stream);
+
+  if (dry_run) { return; }
+
   RAFT_CALL_RNG_FUNC(rng_state, call_rng_kernel<1>, rng_state, stream, expWts.data(), len, params);
 
   ///@todo: use a more efficient partitioning scheme instead of full sort
   // sort the array and pick the top sampledLen items
   IdxT* outIdxPtr = outIdxBuff.data();
-  rmm::device_uvector<char> workspace(0, stream);
-  sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream);
+  cub::DeviceRadixSort::SortPairs(workspace.data(),
+                                  workspace_size,
+                                  expWts.data(),
+                                  sortedWts.data(),
+                                  inIdxPtr,
+                                  outIdxPtr,
+                                  (int)len,
+                                  0,
+                                  sizeof(WeightsT) * 8,
+                                  stream);
   if (outIdx != nullptr) {
     RAFT_CUDA_TRY(cudaMemcpyAsync(
       outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream));
@@ -364,20 +393,18 @@ auto excess_subsample(raft::resources const& res, RngState& state, IdxT N, IdxT
   // There is a variance of n_excess_samples, we take 10% more elements.
   n_excess_samples += std::max<IdxT>(0.1 * n_samples, 100);
 
+  bool dry_run = resource::get_dry_run_flag(res);
+  auto stream  = resource::get_cuda_stream(res);
+
   while (true) {
     // n_excess_sampless will be larger than N around k = 0.64*N. When we reach N, then instead of
     // doing rejection sampling, we simply shuffle the range [0..N-1] using N random numbers.
     n_excess_samples = std::min<IdxT>(n_excess_samples, N);
     auto rnd_idx     = raft::make_device_vector<IdxT, IdxT>(res, n_excess_samples);
+    auto linear_idx  = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
 
-    auto linear_idx = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
-    raft::linalg::map_offset(res, linear_idx.view(), identity_op());
-
-    uniformInt(res, state, rnd_idx.data_handle(), rnd_idx.size(), IdxT(0), IdxT(N));
-
-    // Sort indices according to rnd keys
+    // Workspace size queries (safe with nullptr)
     size_t workspace_size = 0;
-    auto stream           = resource::get_cuda_stream(res);
     cub::DeviceMergeSort::SortPairs(nullptr,
                                     workspace_size,
                                     rnd_idx.data_handle(),
@@ -385,7 +412,30 @@ auto excess_subsample(raft::resources const& res, RngState& state, IdxT N, IdxT
                                     rnd_idx.size(),
                                     raft::less_op{},
                                     stream);
+
+    auto keys_out   = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
+    auto values_out = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
+    rmm::device_scalar<IdxT> num_selected(stream);
+    size_t worksize2 = 0;
+    cub::DeviceSelect::UniqueByKey(nullptr,
+                                   worksize2,
+                                   rnd_idx.data_handle(),
+                                   linear_idx.data_handle(),
+                                   keys_out.data_handle(),
+                                   values_out.data_handle(),
+                                   num_selected.data(),
+                                   rnd_idx.size(),
+                                   stream);
+
+    workspace_size = std::max(workspace_size, worksize2);
     auto workspace = raft::make_device_vector<char, IdxT>(res, workspace_size);
+
+    if (dry_run) { return raft::make_device_vector<IdxT, IdxT>(res, n_samples); }
+
+    raft::linalg::map_offset(res, linear_idx.view(), identity_op());
+    uniformInt(res, state, rnd_idx.data_handle(), rnd_idx.size(), IdxT(0), IdxT(N));
+
+    // Sort indices according to rnd keys
     cub::DeviceMergeSort::SortPairs(workspace.data_handle(),
                                     workspace_size,
                                     rnd_idx.data_handle(),
@@ -404,25 +454,6 @@ auto excess_subsample(raft::resources const& res, RngState& state, IdxT N, IdxT
     }
     // Else we do a rejection sampling (or excess sampling): we generated more random indices than
     // needed and reject the duplicates.
-    auto keys_out   = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
-    auto values_out = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
-    rmm::device_scalar<IdxT> num_selected(stream);
-    size_t worksize2 = 0;
-    cub::DeviceSelect::UniqueByKey(nullptr,
-                                   worksize2,
-                                   rnd_idx.data_handle(),
-                                   linear_idx.data_handle(),
-                                   keys_out.data_handle(),
-                                   values_out.data_handle(),
-                                   num_selected.data(),
-                                   rnd_idx.size(),
-                                   stream);
-
-    if (worksize2 > workspace.size()) {
-      workspace      = raft::make_device_vector<char, IdxT>(res, worksize2);
-      workspace_size = workspace.size();
-    }
-
     cub::DeviceSelect::UniqueByKey(workspace.data_handle(),
                                    workspace_size,
                                    rnd_idx.data_handle(),
diff --git a/cpp/include/raft/random/detail/rng_impl_deprecated.cuh b/cpp/include/raft/random/detail/rng_impl_deprecated.cuh
index 3284f6d9a8..aa89e1bb06 100644
--- a/cpp/include/raft/random/detail/rng_impl_deprecated.cuh
+++ b/cpp/include/raft/random/detail/rng_impl_deprecated.cuh
@@ -12,14 +12,16 @@
 #include "rng_device.cuh"
 
 #include <raft/core/detail/macros.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/random/rng_state.hpp>
 #include <raft/util/cuda_utils.cuh>
-#include <raft/util/detail/cub_wrappers.cuh>
 #include <raft/util/scatter.cuh>
 
 #include <rmm/device_uvector.hpp>
 
+#include <cub/device/device_radix_sort.cuh>
+
 #include <curand_kernel.h>
 
 #include <random>
@@ -272,13 +274,37 @@ class RngImpl {
     SamplingParams<WeightsT, IdxT> params;
     params.inIdxPtr = inIdxPtr;
     params.wts      = wts;
+
+    // Size and allocate the SortPairs workspace before the dry-run return so it is tracked
+    size_t workspace_size = 0;
+    cub::DeviceRadixSort::SortPairs(nullptr,
+                                    workspace_size,
+                                    expWts.data(),
+                                    sortedWts.data(),
+                                    inIdxPtr,
+                                    outIdxBuff.data(),
+                                    (int)len,
+                                    0,
+                                    sizeof(WeightsT) * 8,
+                                    stream);
+    rmm::device_uvector<char> workspace(workspace_size, stream);
+
+    if (resource::get_dry_run_flag(handle)) { return; }
     kernel_dispatch<WeightsT, IdxT, 1, SamplingParams<WeightsT, IdxT>>(
       expWts.data(), len, stream, params);
     ///@todo: use a more efficient partitioning scheme instead of full sort
     // sort the array and pick the top sampledLen items
     IdxT* outIdxPtr = outIdxBuff.data();
-    rmm::device_uvector<char> workspace(0, stream);
-    sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream);
+    cub::DeviceRadixSort::SortPairs(workspace.data(),
+                                    workspace_size,
+                                    expWts.data(),
+                                    sortedWts.data(),
+                                    inIdxPtr,
+                                    outIdxPtr,
+                                    (int)len,
+                                    0,
+                                    sizeof(WeightsT) * 8,
+                                    stream);
     if (outIdx != nullptr) {
       RAFT_CUDA_TRY(cudaMemcpyAsync(
         outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream));
diff --git a/cpp/include/raft/random/make_blobs.cuh b/cpp/include/raft/random/make_blobs.cuh
index ec5e1db870..6ad7f79cb5 100644
--- a/cpp/include/raft/random/make_blobs.cuh
+++ b/cpp/include/raft/random/make_blobs.cuh
@@ -13,6 +13,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 #include <optional>
@@ -71,7 +72,8 @@ void make_blobs(DataT* out,
                 uint64_t seed                  = 0ULL,
                 GeneratorType type             = GenPC)
 {
-  detail::make_blobs_caller(out,
+  detail::make_blobs_caller(false,
+                            out,
                             labels,
                             n_rows,
                             n_cols,
@@ -157,7 +159,8 @@ void make_blobs(
   auto prm_centers     = centers.has_value() ? centers.value().data_handle() : nullptr;
   auto prm_cluster_std = cluster_std.has_value() ? cluster_std.value().data_handle() : nullptr;
 
-  detail::make_blobs_caller(out.data_handle(),
+  detail::make_blobs_caller(resource::get_dry_run_flag(handle),
+                            out.data_handle(),
                             labels.data_handle(),
                             (IdxT)out.extent(0),
                             (IdxT)out.extent(1),
diff --git a/cpp/include/raft/random/permute.cuh b/cpp/include/raft/random/permute.cuh
index 6a308d1fd4..01b9316e47 100644
--- a/cpp/include/raft/random/permute.cuh
+++ b/cpp/include/raft/random/permute.cuh
@@ -13,6 +13,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 #include <optional>
@@ -92,6 +93,7 @@ void permute(raft::resources const& handle,
              std::optional<raft::device_vector_view<IntType, IdxType>> permsOut,
              std::optional<raft::device_matrix_view<InputOutputValueType, IdxType, Layout>> out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   static_assert(std::is_integral_v<IntType>,
                 "permute: The type of each element "
                 "of permsOut (if provided) must be an integral type.");
@@ -144,6 +146,7 @@ void permute(raft::resources const& handle,
              PermsOutType&& permsOut,
              OutType&& out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   // If PermsOutType is std::optional<device_vector_view<T, IdxType>>
   // for some T, then that type T need not be related to any of the
   // other template parameters.  Thus, we have to deduce it specially.
diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh
index d013eae07a..8def069032 100644
--- a/cpp/include/raft/random/rng.cuh
+++ b/cpp/include/raft/random/rng.cuh
@@ -12,6 +12,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 #include <cassert>
@@ -46,6 +47,7 @@ void uniform(raft::resources const& handle,
              OutputValueType start,
              OutputValueType end)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::uniform(
     rng_state, out.data_handle(), out.extent(0), start, end, resource::get_cuda_stream(handle));
 }
@@ -74,6 +76,7 @@ void uniform(raft::resources const& handle,
              OutType start,
              OutType end)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::uniform(rng_state, ptr, len, start, end, resource::get_cuda_stream(handle));
 }
 
@@ -97,6 +100,7 @@ void uniformInt(raft::resources const& handle,
                 OutputValueType start,
                 OutputValueType end)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   static_assert(
     std::is_same<OutputValueType, typename std::remove_cv<OutputValueType>::type>::value,
     "uniformInt: The output vector must be a view of nonconst, "
@@ -127,6 +131,7 @@ void uniformInt(raft::resources const& handle,
                 OutType start,
                 OutType end)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::uniformInt(rng_state, ptr, len, start, end, resource::get_cuda_stream(handle));
 }
 
@@ -151,6 +156,7 @@ void normal(raft::resources const& handle,
             OutputValueType mu,
             OutputValueType sigma)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::normal(
     rng_state, out.data_handle(), out.extent(0), mu, sigma, resource::get_cuda_stream(handle));
 }
@@ -175,6 +181,7 @@ void normal(raft::resources const& handle,
             OutType mu,
             OutType sigma)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::normal(rng_state, ptr, len, mu, sigma, resource::get_cuda_stream(handle));
 }
 
@@ -198,6 +205,7 @@ void normalInt(raft::resources const& handle,
                OutputValueType mu,
                OutputValueType sigma)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   static_assert(
     std::is_same<OutputValueType, typename std::remove_cv<OutputValueType>::type>::value,
     "normalInt: The output vector must be a view of nonconst, "
@@ -229,6 +237,7 @@ void normalInt(raft::resources const& handle,
                IntType mu,
                IntType sigma)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::normalInt(rng_state, ptr, len, mu, sigma, resource::get_cuda_stream(handle));
 }
 
@@ -261,6 +270,7 @@ void normalTable(
   std::variant<raft::device_vector_view<const OutputValueType, IndexType>, OutputValueType> sigma,
   raft::device_matrix_view<OutputValueType, IndexType, raft::row_major> out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   const OutputValueType* sigma_vec_ptr = nullptr;
   OutputValueType sigma_value{};
 
@@ -327,6 +337,7 @@ void normalTable(raft::resources const& handle,
                  const OutType* sigma_vec,
                  OutType sigma)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::normalTable(
     rng_state, ptr, n_rows, n_cols, mu_vec, sigma_vec, sigma, resource::get_cuda_stream(handle));
 }
@@ -349,6 +360,7 @@ void fill(raft::resources const& handle,
           OutputValueType val,
           raft::device_vector_view<OutputValueType, IndexType> out)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::fill(rng_state, out.data_handle(), out.extent(0), val, resource::get_cuda_stream(handle));
 }
 
@@ -367,6 +379,7 @@ template <typename OutType, typename LenType = int>
 void fill(
   raft::resources const& handle, RngState& rng_state, OutType* ptr, LenType len, OutType val)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::fill(rng_state, ptr, len, val, resource::get_cuda_stream(handle));
 }
 
@@ -390,6 +403,7 @@ void bernoulli(raft::resources const& handle,
                raft::device_vector_view<OutputValueType, IndexType> out,
                Type prob)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::bernoulli(
     rng_state, out.data_handle(), out.extent(0), prob, resource::get_cuda_stream(handle));
 }
@@ -411,6 +425,7 @@ template <typename Type, typename OutType = bool, typename LenType = int>
 void bernoulli(
   raft::resources const& handle, RngState& rng_state, OutType* ptr, LenType len, Type prob)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::bernoulli(rng_state, ptr, len, prob, resource::get_cuda_stream(handle));
 }
 
@@ -434,6 +449,7 @@ void scaled_bernoulli(raft::resources const& handle,
                       OutputValueType prob,
                       OutputValueType scale)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::scaled_bernoulli(
     rng_state, out.data_handle(), out.extent(0), prob, scale, resource::get_cuda_stream(handle));
 }
@@ -458,6 +474,7 @@ void scaled_bernoulli(raft::resources const& handle,
                       OutType prob,
                       OutType scale)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::scaled_bernoulli(rng_state, ptr, len, prob, scale, resource::get_cuda_stream(handle));
 }
 
@@ -482,6 +499,7 @@ void gumbel(raft::resources const& handle,
             OutputValueType mu,
             OutputValueType beta)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::gumbel(
     rng_state, out.data_handle(), out.extent(0), mu, beta, resource::get_cuda_stream(handle));
 }
@@ -507,6 +525,7 @@ void gumbel(raft::resources const& handle,
             OutType mu,
             OutType beta)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::gumbel(rng_state, ptr, len, mu, beta, resource::get_cuda_stream(handle));
 }
 
@@ -530,6 +549,7 @@ void lognormal(raft::resources const& handle,
                OutputValueType mu,
                OutputValueType sigma)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::lognormal(
     rng_state, out.data_handle(), out.extent(0), mu, sigma, resource::get_cuda_stream(handle));
 }
@@ -554,6 +574,7 @@ void lognormal(raft::resources const& handle,
                OutType mu,
                OutType sigma)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::lognormal(rng_state, ptr, len, mu, sigma, resource::get_cuda_stream(handle));
 }
 
@@ -577,6 +598,7 @@ void logistic(raft::resources const& handle,
               OutputValueType mu,
               OutputValueType scale)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::logistic(
     rng_state, out.data_handle(), out.extent(0), mu, scale, resource::get_cuda_stream(handle));
 }
@@ -601,6 +623,7 @@ void logistic(raft::resources const& handle,
               OutType mu,
               OutType scale)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::logistic(rng_state, ptr, len, mu, scale, resource::get_cuda_stream(handle));
 }
 
@@ -622,6 +645,7 @@ void exponential(raft::resources const& handle,
                  raft::device_vector_view<OutputValueType, IndexType> out,
                  OutputValueType lambda)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::exponential(
     rng_state, out.data_handle(), out.extent(0), lambda, resource::get_cuda_stream(handle));
 }
@@ -641,6 +665,7 @@ template <typename OutType, typename LenType = int>
 void exponential(
   raft::resources const& handle, RngState& rng_state, OutType* ptr, LenType len, OutType lambda)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::exponential(rng_state, ptr, len, lambda, resource::get_cuda_stream(handle));
 }
 
@@ -662,6 +687,7 @@ void rayleigh(raft::resources const& handle,
               raft::device_vector_view<OutputValueType, IndexType> out,
               OutputValueType sigma)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::rayleigh(
     rng_state, out.data_handle(), out.extent(0), sigma, resource::get_cuda_stream(handle));
 }
@@ -681,6 +707,7 @@ template <typename OutType, typename LenType = int>
 void rayleigh(
   raft::resources const& handle, RngState& rng_state, OutType* ptr, LenType len, OutType sigma)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::rayleigh(rng_state, ptr, len, sigma, resource::get_cuda_stream(handle));
 }
 /**
@@ -703,6 +730,7 @@ void laplace(raft::resources const& handle,
              OutputValueType mu,
              OutputValueType scale)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::laplace(
     rng_state, out.data_handle(), out.extent(0), mu, scale, resource::get_cuda_stream(handle));
 }
@@ -727,6 +755,7 @@ void laplace(raft::resources const& handle,
              OutType mu,
              OutType scale)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   detail::laplace(rng_state, ptr, len, mu, scale, resource::get_cuda_stream(handle));
 }
 
@@ -763,7 +792,8 @@ std::enable_if_t<std::is_integral_v<OutType>> discrete(
   raft::device_vector_view<OutType, IndexType> out,
   raft::device_vector_view<const WeightType, IndexType> weights)
 {
-  detail::discrete(rng_state,
+  detail::discrete(resource::get_dry_run_flag(handle),
+                   rng_state,
                    out.data_handle(),
                    weights.data_handle(),
                    out.extent(0),
@@ -800,8 +830,15 @@ void sampleWithoutReplacement(raft::resources const& handle,
                               IdxT sampledLen,
                               IdxT len)
 {
-  detail::sampleWithoutReplacement(
-    rng_state, out, outIdx, in, wts, sampledLen, len, resource::get_cuda_stream(handle));
+  detail::sampleWithoutReplacement(resource::get_dry_run_flag(handle),
+                                   rng_state,
+                                   out,
+                                   outIdx,
+                                   in,
+                                   wts,
+                                   sampledLen,
+                                   len,
+                                   resource::get_cuda_stream(handle));
 }
 
 /** @brief Sample from range 0..N-1.
diff --git a/cpp/include/raft/random/sample_without_replacement.cuh b/cpp/include/raft/random/sample_without_replacement.cuh
index 7b3c99d5f6..fcd58dd437 100644
--- a/cpp/include/raft/random/sample_without_replacement.cuh
+++ b/cpp/include/raft/random/sample_without_replacement.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 #include <cassert>
@@ -131,7 +132,8 @@ void sample_without_replacement(raft::resources const& handle,
   }
   const weight_type* wts_ptr = wts_has_value ? (*wts).data_handle() : nullptr;
 
-  detail::sampleWithoutReplacement(rng_state,
+  detail::sampleWithoutReplacement(resource::get_dry_run_flag(handle),
+                                   rng_state,
                                    out.data_handle(),
                                    outIdx_ptr,
                                    in.data_handle(),
diff --git a/cpp/include/raft/solver/detail/lap_functions.cuh b/cpp/include/raft/solver/detail/lap_functions.cuh
index 64536ce86a..585942e3a8 100644
--- a/cpp/include/raft/solver/detail/lap_functions.cuh
+++ b/cpp/include/raft/solver/detail/lap_functions.cuh
@@ -30,6 +30,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/solver/detail/lap_kernels.cuh>
 #include <raft/solver/linear_assignment_types.hpp>
@@ -111,6 +112,7 @@ inline void initialReduction(raft::resources const& handle,
                              int SP,
                              vertex_t N)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
@@ -151,6 +153,8 @@ inline void computeInitialAssignments(raft::resources const& handle,
   rmm::device_uvector<int> row_lock_v(size, resource::get_cuda_stream(handle));
   rmm::device_uvector<int> col_lock_v(size, resource::get_cuda_stream(handle));
 
+  if (resource::get_dry_run_flag(handle)) { return; }
+
   thrust::fill_n(thrust::device, d_vertices.row_assignments, size, -1);
   thrust::fill_n(thrust::device, d_vertices.col_assignments, size, -1);
   thrust::fill_n(thrust::device, row_lock_v.data(), size, 0);
@@ -184,6 +188,7 @@ inline int computeRowCovers(raft::resources const& handle,
                             int SP,
                             vertex_t N)
 {
+  if (resource::get_dry_run_flag(handle)) { return {}; }
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
@@ -226,6 +231,7 @@ inline void coverZeroAndExpand(raft::resources const& handle,
                                vertex_t N,
                                weight_t epsilon)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   int total_blocks = 0;
   dim3 blocks_per_grid;
   dim3 threads_per_block;
@@ -258,11 +264,18 @@ inline vertex_t zeroCoverIteration(raft::resources const& handle,
                                    vertex_t N,
                                    weight_t epsilon)
 {
-  vertex_t M;
-
-  rmm::device_uvector<vertex_t> csr_ptrs_v(0, resource::get_cuda_stream(handle));
+  bool is_dry_run = resource::get_dry_run_flag(handle);
+  vertex_t M      = 0;
   rmm::device_uvector<vertex_t> csr_neighbors_v(0, resource::get_cuda_stream(handle));
 
+  // Allocate csr_ptrs_v before the dry-run check so that its allocation is tracked
+  rmm::device_uvector<vertex_t> csr_ptrs_v(SP + 1, resource::get_cuda_stream(handle));
+  if (is_dry_run) {
+    // Upper bound for csr_neighbors_v: at most SP * N elements (one per matrix element)
+    csr_neighbors_v = rmm::device_uvector<vertex_t>(SP * N, resource::get_cuda_stream(handle));
+    return vertex_t{0};
+  }
+
   {
     dim3 blocks_per_grid;
     dim3 threads_per_block;
@@ -273,9 +286,6 @@ inline vertex_t zeroCoverIteration(raft::resources const& handle,
 
     thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false);
     thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0});
-
-    csr_ptrs_v.resize(SP + 1, resource::get_cuda_stream(handle));
-
     thrust::fill_n(thrust::device, csr_ptrs_v.data(), (SP + 1), vertex_t{-1});
 
     detail::calculateRectangularDims(blocks_per_grid, threads_per_block, total_blocks, N, SP);
@@ -340,6 +350,7 @@ inline void executeZeroCover(raft::resources const& handle,
                              vertex_t N,
                              weight_t epsilon)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   vertex_t M = 1;
   while (M > 0) {
     M = zeroCoverIteration(
@@ -361,10 +372,15 @@ inline void reversePass(raft::resources const& handle,
 
   std::size_t size = SP * N;
 
-  detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, size);
-
   rmm::device_uvector<bool> predicates_v(size, resource::get_cuda_stream(handle));
   rmm::device_uvector<vertex_t> addresses_v(size, resource::get_cuda_stream(handle));
+  if (resource::get_dry_run_flag(handle)) {
+    // Upper bound for elements_v: at most size elements (one per matrix element)
+    rmm::device_uvector<vertex_t> elements_v(size, resource::get_cuda_stream(handle));
+    return;
+  }
+
+  detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, size);
 
   thrust::fill_n(thrust::device, predicates_v.data(), size, false);
   thrust::fill_n(thrust::device, addresses_v.data(), size, vertex_t{0});
@@ -418,14 +434,19 @@ inline void augmentationPass(raft::resources const& handle,
                              int SP,
                              int N)
 {
+  rmm::device_uvector<bool> predicates_v(SP * N, resource::get_cuda_stream(handle));
+  rmm::device_uvector<vertex_t> addresses_v(SP * N, resource::get_cuda_stream(handle));
+  if (resource::get_dry_run_flag(handle)) {
+    // Upper bound for elements_v: at most SP * N elements (one per matrix element)
+    rmm::device_uvector<vertex_t> elements_v(SP * N, resource::get_cuda_stream(handle));
+    return;
+  }
+
   int total_blocks = 0;
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP * N);
 
-  rmm::device_uvector<bool> predicates_v(SP * N, resource::get_cuda_stream(handle));
-  rmm::device_uvector<vertex_t> addresses_v(SP * N, resource::get_cuda_stream(handle));
-
   thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false);
   thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0});
 
@@ -487,12 +508,14 @@ inline void dualUpdate(raft::resources const& handle,
                        vertex_t N,
                        weight_t epsilon)
 {
+  rmm::device_uvector<weight_t> sp_min_v(SP, resource::get_cuda_stream(handle));
+
+  if (resource::get_dry_run_flag(handle)) { return; }
+
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks;
 
-  rmm::device_uvector<weight_t> sp_min_v(SP, resource::get_cuda_stream(handle));
-
   detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP);
   kernel_dualUpdate_1<<<blocks_per_grid, threads_per_block, 0, resource::get_cuda_stream(handle)>>>(
     sp_min_v.data(),
@@ -530,6 +553,7 @@ inline void calcObjValDual(raft::resources const& handle,
                            int SP,
                            int N)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
@@ -554,6 +578,7 @@ inline void calcObjValPrimal(raft::resources const& handle,
                              int SP,
                              vertex_t N)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
diff --git a/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh
index b937195ba9..b6669d9c65 100644
--- a/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh
@@ -7,6 +7,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/device_atomics.cuh>
@@ -130,6 +131,8 @@ void adj_to_csr(raft::resources const& handle,
                 index_t* out_col_ind     // output column indices
 )
 {
+  if (resource::get_dry_run_flag(handle)) { return; }  // No allocations below
+
   auto stream = resource::get_cuda_stream(handle);
 
   // Check inputs and return early if possible.
diff --git a/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
index 39c13412e6..0f1a68a0e5 100644
--- a/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/detail/mdspan_util.cuh>  // detail::popc
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/sparse/convert/detail/adj_to_csr.cuh>
@@ -16,10 +17,12 @@
 #include <rmm/device_uvector.hpp>
 
 #include <cub/block/block_reduce.cuh>
+#include <cub/device/device_scan.cuh>
 #include <cuda/std/cassert>
 #include <cuda/std/functional>
 #include <thrust/fill.h>
-#include <thrust/scan.h>
+
+#include <limits>
 
 namespace raft {
 namespace sparse {
@@ -105,6 +108,8 @@ void calc_nnz_by_rows(raft::resources const& handle,
     sub_nnz_size     = num_rows * ((num_cols + bits_per_sub_col - 1) / bits_per_sub_col);
     return;
   }
+  if (resource::get_dry_run_flag(handle)) { return; }
+
   auto stream        = resource::get_cuda_stream(handle);
   const size_t total = num_rows * num_cols;
   const size_t bitmap_num =
@@ -244,6 +249,7 @@ void fill_indices_by_rows(raft::resources const& handle,
                           index_t bits_per_sub_col,
                           size_t sub_nnz_size)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   auto stream  = resource::get_cuda_stream(handle);
   auto block_x = num_rows;
   auto block_y = sub_nnz_size / num_rows;
@@ -283,7 +289,10 @@ void bitmap_to_csr(raft::resources const& handle,
   index_t* indptr  = csr_view.get_indptr().data();
   index_t* indices = csr_view.get_indices().data();
 
-  RAFT_CUDA_TRY(cudaMemsetAsync(indptr, 0, (csr_view.get_n_rows() + 1) * sizeof(index_t), stream));
+  if (!resource::get_dry_run_flag(handle)) {
+    RAFT_CUDA_TRY(
+      cudaMemsetAsync(indptr, 0, (csr_view.get_n_rows() + 1) * sizeof(index_t), stream));
+  }
 
   size_t sub_nnz_size      = 0;
   index_t bits_per_sub_col = 0;
@@ -300,6 +309,21 @@ void bitmap_to_csr(raft::resources const& handle,
   rmm::device_async_resource_ref device_memory = resource::get_workspace_resource_ref(handle);
   rmm::device_uvector<nnz_t> sub_nnz(sub_nnz_size + 1, stream, device_memory);
 
+  size_t scan_ws_bytes = 0;
+  cub::DeviceScan::ExclusiveSum(
+    nullptr, scan_ws_bytes, sub_nnz.data(), sub_nnz.data(), sub_nnz_size + 1, stream);
+  rmm::device_uvector<char> scan_ws(scan_ws_bytes, stream);
+
+  if (resource::get_dry_run_flag(handle)) {
+    if constexpr (is_device_csr_sparsity_owning_v<csr_matrix_t>) {
+      auto safe_nnz = std::min(
+        static_cast<uint64_t>(csr_view.get_n_rows()) * static_cast<uint64_t>(csr_view.get_n_cols()),
+        static_cast<uint64_t>(std::numeric_limits<nnz_t>::max()));
+      csr.initialize_sparsity(static_cast<nnz_t>(safe_nnz));
+    }
+    return;
+  }
+
   calc_nnz_by_rows(handle,
                    bitmap.data(),
                    csr_view.get_n_rows(),
@@ -308,8 +332,8 @@ void bitmap_to_csr(raft::resources const& handle,
                    sub_nnz_size,
                    bits_per_sub_col);
 
-  thrust::exclusive_scan(
-    thrust_policy, sub_nnz.data(), sub_nnz.data() + sub_nnz_size + 1, sub_nnz.data());
+  cub::DeviceScan::ExclusiveSum(
+    scan_ws.data(), scan_ws_bytes, sub_nnz.data(), sub_nnz.data(), sub_nnz_size + 1, stream);
 
   if constexpr (is_device_csr_sparsity_owning_v<csr_matrix_t>) {
     nnz_t nnz = 0;
diff --git a/cpp/include/raft/sparse/convert/detail/bitset_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/bitset_to_csr.cuh
index 20e7200c14..b77970d418 100644
--- a/cpp/include/raft/sparse/convert/detail/bitset_to_csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/bitset_to_csr.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/detail/mdspan_util.cuh>  // detail::popc
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/sparse/convert/detail/adj_to_csr.cuh>
@@ -15,9 +16,11 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <cub/device/device_scan.cuh>
 #include <cuda/std/cassert>
 #include <thrust/fill.h>
-#include <thrust/scan.h>
+
+#include <limits>
 
 namespace raft {
 namespace sparse {
@@ -63,6 +66,7 @@ void gpu_repeat_csr(raft::resources const& handle,
                     index_t* d_repeated_indices)
 {
   if (nnz == 0) return;
+  if (resource::get_dry_run_flag(handle)) { return; }
 
   auto stream            = resource::get_cuda_stream(handle);
   index_t repeat_csr_tpb = 256;
@@ -96,7 +100,10 @@ void bitset_to_csr(raft::resources const& handle,
   index_t* indptr  = csr_view.get_indptr().data();
   index_t* indices = csr_view.get_indices().data();
 
-  RAFT_CUDA_TRY(cudaMemsetAsync(indptr, 0, (csr_view.get_n_rows() + 1) * sizeof(index_t), stream));
+  if (!resource::get_dry_run_flag(handle)) {
+    RAFT_CUDA_TRY(
+      cudaMemsetAsync(indptr, 0, (csr_view.get_n_rows() + 1) * sizeof(index_t), stream));
+  }
 
   size_t sub_nnz_size      = 0;
   index_t bits_per_sub_col = 0;
@@ -113,6 +120,21 @@ void bitset_to_csr(raft::resources const& handle,
   rmm::device_async_resource_ref device_memory = resource::get_workspace_resource_ref(handle);
   rmm::device_uvector<nnz_t> sub_nnz(sub_nnz_size + 1, stream, device_memory);
 
+  size_t scan_ws_bytes = 0;
+  cub::DeviceScan::ExclusiveSum(
+    nullptr, scan_ws_bytes, sub_nnz.data(), sub_nnz.data(), sub_nnz_size + 1, stream);
+  rmm::device_uvector<char> scan_ws(scan_ws_bytes, stream);
+
+  if (resource::get_dry_run_flag(handle)) {
+    if constexpr (is_device_csr_sparsity_owning_v<csr_matrix_t>) {
+      auto safe_nnz = std::min(
+        static_cast<uint64_t>(csr_view.get_n_rows()) * static_cast<uint64_t>(csr_view.get_n_cols()),
+        static_cast<uint64_t>(std::numeric_limits<nnz_t>::max()));
+      csr.initialize_sparsity(static_cast<nnz_t>(safe_nnz));
+    }
+    return;
+  }
+
   calc_nnz_by_rows(handle,
                    bitset.data(),
                    row_t(1),
@@ -121,8 +143,8 @@ void bitset_to_csr(raft::resources const& handle,
                    sub_nnz_size,
                    bits_per_sub_col);
 
-  thrust::exclusive_scan(
-    thrust_policy, sub_nnz.data(), sub_nnz.data() + sub_nnz_size + 1, sub_nnz.data());
+  cub::DeviceScan::ExclusiveSum(
+    scan_ws.data(), scan_ws_bytes, sub_nnz.data(), sub_nnz.data(), sub_nnz_size + 1, stream);
 
   nnz_t bitset_nnz = 0;
   if constexpr (is_device_csr_sparsity_owning_v<csr_matrix_t>) {
diff --git a/cpp/include/raft/sparse/convert/detail/csr.cuh b/cpp/include/raft/sparse/convert/detail/csr.cuh
index e5309bb05e..0804f6d523 100644
--- a/cpp/include/raft/sparse/convert/detail/csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/csr.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/cusparse_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/sparse/coo.hpp>
 #include <raft/sparse/detail/cusparse_wrappers.h>
@@ -47,15 +48,19 @@ void coo_to_csr(raft::resources const& handle,
 {
   auto stream         = resource::get_cuda_stream(handle);
   auto cusparseHandle = resource::get_cusparse_handle(handle);
+
   rmm::device_uvector<int> dstRows(nnz, stream);
-  RAFT_CUDA_TRY(
-    cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream));
-  RAFT_CUDA_TRY(
-    cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream));
   auto buffSize = raft::sparse::detail::cusparsecoosort_bufferSizeExt(
     cusparseHandle, m, m, nnz, srcRows, srcCols, stream);
   rmm::device_uvector<char> pBuffer(buffSize, stream);
   rmm::device_uvector<int> P(nnz, stream);
+
+  if (resource::get_dry_run_flag(handle)) { return; }
+
+  RAFT_CUDA_TRY(
+    cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream));
+  RAFT_CUDA_TRY(
+    cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream));
   RAFT_CUSPARSE_TRY(cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data()));
   raft::sparse::detail::cusparsecoosortByRow(
     cusparseHandle, m, m, nnz, dstRows.data(), dstCols, P.data(), pBuffer.data(), stream);
diff --git a/cpp/include/raft/sparse/linalg/detail/laplacian.cuh b/cpp/include/raft/sparse/linalg/detail/laplacian.cuh
index 40662a5484..0421eae1e0 100644
--- a/cpp/include/raft/sparse/linalg/detail/laplacian.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/laplacian.cuh
@@ -195,13 +195,13 @@ device_coo_matrix<ElementType, RowType, ColType, NZType> compute_graph_laplacian
                            });
 
   raft::sparse::op::coo_sort<ElementType, RowType, NZType>(
+    res,
     dim,
     dim,
     result.structure_view().get_nnz(),
     result.structure_view().get_rows().data(),
     result.structure_view().get_cols().data(),
-    result.get_elements().data(),
-    raft::resource::get_cuda_stream(res));
+    result.get_elements().data());
 
   auto result_nnz = result.structure_view().get_nnz();
   auto degrees    = raft::make_device_vector<ElementType, RowType>(res, dim);
diff --git a/cpp/include/raft/sparse/linalg/detail/sddmm.hpp b/cpp/include/raft/sparse/linalg/detail/sddmm.hpp
index d4ff1eef53..ebbfeba9b4 100644
--- a/cpp/include/raft/sparse/linalg/detail/sddmm.hpp
+++ b/cpp/include/raft/sparse/linalg/detail/sddmm.hpp
@@ -10,6 +10,7 @@
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/cusparse_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/linalg_types.hpp>
 #include <raft/sparse/detail/cusparse_wrappers.h>
@@ -66,6 +67,8 @@ void sddmm(raft::resources const& handle,
 
   rmm::device_uvector<uint8_t> tmp(bufferSize, resource::get_cuda_stream(handle));
 
+  if (resource::get_dry_run_flag(handle)) { return; }
+
   RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesddmm(resource::get_cusparse_handle(handle),
                                                         op_a,
                                                         op_b,
diff --git a/cpp/include/raft/sparse/linalg/detail/spmm.hpp b/cpp/include/raft/sparse/linalg/detail/spmm.hpp
index bbfb0ffbd5..ef279fdb51 100644
--- a/cpp/include/raft/sparse/linalg/detail/spmm.hpp
+++ b/cpp/include/raft/sparse/linalg/detail/spmm.hpp
@@ -10,6 +10,7 @@
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/cusparse_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/sparse/detail/cusparse_wrappers.h>
 
@@ -86,10 +87,12 @@ void spmm(raft::resources const& handle,
                                                   &bufferSize,
                                                   resource::get_cuda_stream(handle)));
 
-  raft::interruptible::synchronize(resource::get_cuda_stream(handle));
+  resource::sync_stream(handle);
 
   rmm::device_uvector<ValueType> tmp(bufferSize, resource::get_cuda_stream(handle));
 
+  if (resource::get_dry_run_flag(handle)) { return; }
+
   RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm(resource::get_cusparse_handle(handle),
                                                        opX,
                                                        opY,
diff --git a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
index ddbb93d84f..426affd94a 100644
--- a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
@@ -9,6 +9,7 @@
 #include <raft/core/device_coo_matrix.hpp>
 #include <raft/core/device_resources.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/matrix/init.cuh>
 #include <raft/sparse/convert/csr.cuh>
 #include <raft/sparse/coo.hpp>
@@ -401,22 +402,23 @@ void symmetrize(raft::resources const& handle,
   rmm::device_uvector<value_idx> symm_cols(nnz * 2, stream);
   rmm::device_uvector<value_t> symm_vals(nnz * 2, stream);
 
-  raft::copy_async(symm_rows.data(), rows, nnz, stream);
-  raft::copy_async(symm_rows.data() + nnz, cols, nnz, stream);
-  raft::copy_async(symm_cols.data(), cols, nnz, stream);
-  raft::copy_async(symm_cols.data() + nnz, rows, nnz, stream);
+  if (!resource::get_dry_run_flag(handle)) {
+    raft::copy_async(symm_rows.data(), rows, nnz, stream);
+    raft::copy_async(symm_rows.data() + nnz, cols, nnz, stream);
+    raft::copy_async(symm_cols.data(), cols, nnz, stream);
+    raft::copy_async(symm_cols.data() + nnz, rows, nnz, stream);
 
-  raft::copy_async(symm_vals.data(), vals, nnz, stream);
-  raft::copy_async(symm_vals.data() + nnz, vals, nnz, stream);
+    raft::copy_async(symm_vals.data(), vals, nnz, stream);
+    raft::copy_async(symm_vals.data() + nnz, vals, nnz, stream);
+  }
 
-  // sort COO
-  raft::sparse::op::coo_sort((value_idx)m,
+  raft::sparse::op::coo_sort(handle,
+                             (value_idx)m,
                              (value_idx)n,
                              static_cast<nnz_t>(nnz) * 2,
                              symm_rows.data(),
                              symm_cols.data(),
-                             symm_vals.data(),
-                             stream);
+                             symm_vals.data());
 
   raft::sparse::op::max_duplicates(
     handle, out, symm_rows.data(), symm_cols.data(), symm_vals.data(), nnz * 2, m, n);
diff --git a/cpp/include/raft/sparse/linalg/detail/utils.cuh b/cpp/include/raft/sparse/linalg/detail/utils.cuh
index f4f9385ea5..69b9b33cda 100644
--- a/cpp/include/raft/sparse/linalg/detail/utils.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/utils.cuh
@@ -7,6 +7,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/math.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 
 #include <cub/warp/warp_reduce.cuh>
 #include <cuda_fp16.h>
@@ -95,6 +96,7 @@ void faster_dot_on_csr(raft::resources const& handle,
                        const value_idx dim)
 {
   if (nnz == 0 || n_rows == 0) return;
+  if (resource::get_dry_run_flag(handle)) { return; }  // No allocations below
 
   auto stream = resource::get_cuda_stream(handle);
 
diff --git a/cpp/include/raft/sparse/linalg/spmm.hpp b/cpp/include/raft/sparse/linalg/spmm.hpp
index cc27544a4e..3e7b7a599e 100644
--- a/cpp/include/raft/sparse/linalg/spmm.hpp
+++ b/cpp/include/raft/sparse/linalg/spmm.hpp
@@ -7,7 +7,10 @@
 
 #pragma once
 
+#include <raft/core/copy.hpp>
 #include <raft/core/detail/macros.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/sparse/linalg/detail/cusparse_utils.hpp>
 #include <raft/sparse/linalg/detail/spmm.hpp>
 
@@ -53,16 +56,22 @@ void spmm(raft::resources const& handle,
   // WARNING: The following copy is working around a bug in cusparse which causes an alignment issue
   // and incorrect results. This bug is fixed in CUDA 12.5+ so this workaround shouldn't be removed
   // until that version is supported.
-  auto size = is_row_major ? (z.extent(0) - 1) * z.stride(0) + z.extent(1)
-                           : (z.extent(1) - 1) * z.stride(1) + z.extent(0);
-  rmm::device_uvector<ValueType> z_tmp(size, raft::resource::get_cuda_stream(handle));
-  raft::copy(z_tmp.data(), z.data_handle(), z_tmp.size(), raft::resource::get_cuda_stream(handle));
+  auto size  = is_row_major ? (z.extent(0) - 1) * z.stride(0) + z.extent(1)
+                            : (z.extent(1) - 1) * z.stride(1) + z.extent(0);
+  auto z_tmp = raft::make_device_mdarray<ValueType, IndexType>(
+    handle,
+    raft::resource::get_workspace_resource_ref(handle),
+    raft::make_extents<IndexType>(size));
+
+  raft::copy(handle,
+             z_tmp.view(),
+             raft::make_device_vector_view<const ValueType, IndexType>(z.data_handle(), size));
 
   auto z_tmp_view =
     is_row_major ? raft::make_device_strided_matrix_view<ValueType, IndexType, layout_c_contiguous>(
-                     z_tmp.data(), z.extent(0), z.extent(1), z.stride(0))
+                     z_tmp.data_handle(), z.extent(0), z.extent(1), z.stride(0))
                  : raft::make_device_strided_matrix_view<ValueType, IndexType, layout_f_contiguous>(
-                     z_tmp.data(), z.extent(0), z.extent(1), z.stride(1));
+                     z_tmp.data_handle(), z.extent(0), z.extent(1), z.stride(1));
 
   auto descr_x = detail::create_descriptor(x);
   auto descr_y = detail::create_descriptor(y);
@@ -72,7 +81,9 @@ void spmm(raft::resources const& handle,
 
   // WARNING: Do not remove the following copy unless you can, with certainty, say that
   // the underlying cuSPARSE issue affecting CUDA 12.2+ has been resolved.
-  raft::copy(z.data_handle(), z_tmp.data(), z_tmp.size(), raft::resource::get_cuda_stream(handle));
+  raft::copy(handle,
+             raft::make_device_vector_view<ValueType, IndexType>(z.data_handle(), size),
+             raft::make_const_mdspan(z_tmp.view()));
   RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroySpMat(descr_x));
   RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descr_y));
   RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroyDnMat(descr_z));
diff --git a/cpp/include/raft/sparse/linalg/transpose.cuh b/cpp/include/raft/sparse/linalg/transpose.cuh
index 24b0758014..ff84339024 100644
--- a/cpp/include/raft/sparse/linalg/transpose.cuh
+++ b/cpp/include/raft/sparse/linalg/transpose.cuh
@@ -7,8 +7,11 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cusparse_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
-#include <raft/sparse/linalg/detail/transpose.h>
+#include <raft/sparse/detail/cusparse_wrappers.h>
+
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace sparse {
@@ -43,17 +46,44 @@ void csr_transpose(raft::resources const& handle,
                    value_idx nnz,
                    cudaStream_t stream)
 {
-  detail::csr_transpose(resource::get_cusparse_handle(handle),
-                        csr_indptr,
-                        csr_indices,
-                        csr_data,
-                        csc_indptr,
-                        csc_indices,
-                        csc_data,
-                        csr_nrows,
-                        csr_ncols,
-                        nnz,
-                        stream);
+  auto cusparse_h = resource::get_cusparse_handle(handle);
+
+  size_t convert_csc_workspace_size = 0;
+  RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecsr2csc_bufferSize(cusparse_h,
+                                                                     csr_nrows,
+                                                                     csr_ncols,
+                                                                     nnz,
+                                                                     csr_data,
+                                                                     csr_indptr,
+                                                                     csr_indices,
+                                                                     csc_data,
+                                                                     csc_indptr,
+                                                                     csc_indices,
+                                                                     CUSPARSE_ACTION_NUMERIC,
+                                                                     CUSPARSE_INDEX_BASE_ZERO,
+                                                                     CUSPARSE_CSR2CSC_ALG1,
+                                                                     &convert_csc_workspace_size,
+                                                                     stream));
+
+  rmm::device_uvector<char> convert_csc_workspace(convert_csc_workspace_size, stream);
+
+  if (resource::get_dry_run_flag(handle)) { return; }
+
+  RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecsr2csc(cusparse_h,
+                                                          csr_nrows,
+                                                          csr_ncols,
+                                                          nnz,
+                                                          csr_data,
+                                                          csr_indptr,
+                                                          csr_indices,
+                                                          csc_data,
+                                                          csc_indptr,
+                                                          csc_indices,
+                                                          CUSPARSE_ACTION_NUMERIC,
+                                                          CUSPARSE_INDEX_BASE_ZERO,
+                                                          CUSPARSE_CSR2CSC_ALG1,
+                                                          convert_csc_workspace.data(),
+                                                          stream));
 }
 
 };  // end NAMESPACE linalg
diff --git a/cpp/include/raft/sparse/matrix/detail/preprocessing.cuh b/cpp/include/raft/sparse/matrix/detail/preprocessing.cuh
index 76c51d3814..a2d4b23f8d 100644
--- a/cpp/include/raft/sparse/matrix/detail/preprocessing.cuh
+++ b/cpp/include/raft/sparse/matrix/detail/preprocessing.cuh
@@ -4,14 +4,19 @@
  */
 #pragma once
 
+#include <raft/core/copy.cuh>
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/host_mdarray.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft/label/classlabels.cuh>
 #include <raft/linalg/map_reduce.cuh>
 #include <raft/stats/histogram.cuh>
 
+#include <rmm/device_uvector.hpp>
+
 #include <thrust/reduce.h>
 
 namespace raft {
@@ -36,7 +41,12 @@ void get_uniques_counts(raft::resources const& handle,
                         raft::device_vector_view<IndexType, int64_t> keys_out,
                         raft::device_vector_view<ValueType, int64_t> counts_out)
 {
-  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+  if (resource::get_dry_run_flag(handle)) {
+    // Upper bound for thrust::reduce_by_key internal workspace
+    rmm::device_uvector<char> reduce_ws(static_cast<size_t>(nnz) * sizeof(IndexType) + 4096,
+                                        resource::get_cuda_stream(handle));
+    return;
+  }
   thrust::reduce_by_key(raft::resource::get_thrust_policy(handle),
                         rows,
                         rows + nnz,
@@ -66,27 +76,26 @@ void fit_tfidf(raft::resources const& handle,
                raft::device_vector_view<IndexType, int64_t> idFeatCount,
                int& fullFeatCount)
 {
-  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+  auto batchIdLen = raft::make_host_scalar<ValueType>(handle, 0);
+  auto values_mat = raft::make_device_scalar<ValueType>(handle, 0);
 
-  // Use RAFT's histogram function to count occurrences of each column index
-  // This replaces the countLabels function from kmeans_common.cuh
   raft::stats::histogram(
-    raft::stats::HistTypeAuto,                           // Let RAFT choose the best algorithm
-    idFeatCount.data_handle(),                           // output bins (counts per feature)
-    num_cols,                                            // number of bins (one per column/feature)
-    columns,                                             // input data (column indices)
-    nnz,                                                 // number of data points
-    1,                                                   // single batch
-    stream,                                              // CUDA stream
-    raft::stats::IdentityBinner<IndexType, IndexType>()  // column indices map directly to bins
-  );
-
-  // get total number of words
-  auto batchIdLen = raft::make_host_scalar<ValueType>(0);
-  auto values_mat = raft::make_device_scalar<ValueType>(handle, 0);
-  raft::linalg::mapReduce<ValueType>(
-    values_mat.data_handle(), nnz, 0.0f, raft::identity_op(), raft::add_op(), stream, values);
-  raft::copy(batchIdLen.data_handle(), values_mat.data_handle(), values_mat.size(), stream);
+    handle,
+    raft::stats::HistTypeAuto,
+    raft::make_device_matrix_view<const IndexType, IndexType, raft::col_major>(columns, nnz, 1),
+    raft::make_device_matrix_view<int, IndexType, raft::col_major>(
+      idFeatCount.data_handle(), num_cols, 1),
+    raft::stats::IdentityBinner<IndexType, IndexType>());
+
+  raft::linalg::map_reduce(handle,
+                           raft::make_device_vector_view<const ValueType, IndexType>(values, nnz),
+                           values_mat.view(),
+                           ValueType{0},
+                           raft::identity_op(),
+                           raft::add_op());
+
+  raft::copy(handle, batchIdLen.view(), raft::make_const_mdspan(values_mat.view()));
+  if (resource::get_dry_run_flag(handle)) { return; }
   fullFeatCount += (int)batchIdLen(0);
 }
 
@@ -120,16 +129,15 @@ void fit_bm25(raft::resources const& handle,
 {
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
 
-  // Count unique row indices using raft::label::getUniquelabels
-  // This replaces the get_n_components function from cross_component_nn.cuh
   rmm::device_uvector<IndexType> temp_unique_rows(0, stream);
-  int uniq_cnt  = raft::label::getUniquelabels(temp_unique_rows, rows, nnz, stream);
-  auto row_keys = raft::make_device_vector<IndexType>(handle, uniq_cnt);
-  auto row_cnts = raft::make_device_vector<ValueType>(handle, uniq_cnt);
+  int uniq_cnt   = raft::label::getUniquelabels(handle, temp_unique_rows, rows, nnz);
+  auto row_keys  = raft::make_device_vector<IndexType>(handle, uniq_cnt);
+  auto row_cnts  = raft::make_device_vector<ValueType>(handle, uniq_cnt);
+  auto dummy_vec = raft::make_device_vector<IndexType>(handle, uniq_cnt);
+
   get_uniques_counts<IndexType, ValueType, int64_t>(
     handle, rows, columns, values, nnz, row_keys.view(), row_cnts.view());
 
-  auto dummy_vec = raft::make_device_vector<IndexType>(handle, uniq_cnt);
   raft::linalg::map(
     handle,
     dummy_vec.view(),
diff --git a/cpp/include/raft/sparse/matrix/preprocessing.cuh b/cpp/include/raft/sparse/matrix/preprocessing.cuh
index 28b375ec44..c7e9ee25e3 100644
--- a/cpp/include/raft/sparse/matrix/preprocessing.cuh
+++ b/cpp/include/raft/sparse/matrix/preprocessing.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_coo_matrix.hpp>
 #include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/sparse/convert/coo.cuh>
 #include <raft/sparse/matrix/detail/preprocessing.cuh>
 
@@ -118,13 +119,15 @@ void encode_bm25(raft::resources const& handle,
   auto nnz      = csr_in.structure_view().get_nnz();
   auto indptr   = csr_in.structure_view().get_indptr();
 
-  auto rows = raft::make_device_vector<IndexType, int64_t>(handle, nnz);
-  raft::sparse::convert::csr_to_coo(
-    indptr.data(), (int)indptr.size(), rows.data_handle(), (int)nnz, stream);
-
+  auto rows         = raft::make_device_vector<IndexType, int64_t>(handle, nnz);
   int fullFeatCount = 0;
   auto featIdCount  = raft::make_device_vector<IndexType, int64_t>(handle, num_cols);
   auto rowFeatCnts  = raft::make_device_vector<IndexType, int64_t>(handle, num_rows);
+
+  if (!resource::get_dry_run_flag(handle)) {
+    raft::sparse::convert::csr_to_coo(
+      indptr.data(), (int)indptr.size(), rows.data_handle(), (int)nnz, stream);
+  }
   detail::fit_bm25<ValueType, IndexType>(handle,
                                          rows.data_handle(),
                                          columns.data(),
diff --git a/cpp/include/raft/sparse/op/detail/filter.cuh b/cpp/include/raft/sparse/op/detail/filter.cuh
index 631e17ec9b..f4c427a409 100644
--- a/cpp/include/raft/sparse/op/detail/filter.cuh
+++ b/cpp/include/raft/sparse/op/detail/filter.cuh
@@ -9,6 +9,7 @@
 #include <raft/core/device_coo_matrix.hpp>
 #include <raft/core/device_resources.hpp>
 #include <raft/core/host_mdspan.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/sparse/coo.hpp>
 #include <raft/sparse/detail/cusparse_wrappers.h>
 #include <raft/sparse/detail/utils.h>
@@ -214,6 +215,18 @@ void coo_remove_scalar(raft::resources const& handle,
   rmm::device_uvector<nnz_t> row_count_nz(in_n_rows, stream);
   rmm::device_uvector<nnz_t> row_count(in_n_rows, stream);
 
+  if (resource::get_dry_run_flag(handle)) {
+    // The code below is data-dependent (out_nnz <= in_nnz is unknown up front) and uses thrust
+    // calls that are not dry-run compliant, so allocate upper bounds for it here instead.
+    out.initialize_sparsity(in_nnz);
+    rmm::device_uvector<nnz_t> ex_scan(in_n_rows, stream);
+    rmm::device_uvector<nnz_t> cur_ex_scan(in_n_rows, stream);
+    // Upper bound for thrust workspace (reduce + 2x exclusive_scan)
+    rmm::device_uvector<char> thrust_ws(3 * (static_cast<size_t>(in_n_rows) * sizeof(nnz_t) + 4096),
+                                        stream);
+    return;
+  }
+
   RAFT_CUDA_TRY(
     cudaMemsetAsync(row_count_nz.data(), 0, static_cast<nnz_t>(in_n_rows) * sizeof(nnz_t), stream));
   RAFT_CUDA_TRY(
diff --git a/cpp/include/raft/sparse/op/detail/reduce.cuh b/cpp/include/raft/sparse/op/detail/reduce.cuh
index cc1ebbfc3c..64f46cb9b9 100644
--- a/cpp/include/raft/sparse/op/detail/reduce.cuh
+++ b/cpp/include/raft/sparse/op/detail/reduce.cuh
@@ -7,6 +7,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft/sparse/convert/csr.cuh>
 #include <raft/sparse/coo.hpp>
@@ -19,6 +20,7 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <cub/device/device_scan.cuh>
 #include <cuda_runtime.h>
 #include <thrust/device_ptr.h>
 #include <thrust/scan.h>
@@ -130,9 +132,24 @@ void max_duplicates(raft::resources const& handle,
   // compute diffs & take exclusive scan
   rmm::device_uvector<value_idx> diff(nnz + 1, stream);
 
+  // Size the CUB scan workspace first: a call with a null workspace pointer only reports the
+  // required byte count, and the buffer is allocated before the dry-run exit below.
+  auto const scan_len  = static_cast<int>(diff.size());
+  size_t scan_ws_bytes = 0;
+  RAFT_CUDA_TRY(cub::DeviceScan::ExclusiveSum(
+    nullptr, scan_ws_bytes, diff.data(), diff.data(), scan_len, stream));
+  rmm::device_uvector<char> scan_ws(scan_ws_bytes, stream);
+
+  if (resource::get_dry_run_flag(handle)) {
+    // Upper bound: at most nnz unique entries (no duplicates removed).
+    out.allocate(nnz, m, n, false, stream);
+    return;
+  }
+
   compute_duplicates_mask(diff.data(), rows, cols, nnz, stream);
 
-  thrust::exclusive_scan(thrust_policy, diff.data(), diff.data() + diff.size(), diff.data());
+  RAFT_CUDA_TRY(cub::DeviceScan::ExclusiveSum(
+    scan_ws.data(), scan_ws_bytes, diff.data(), diff.data(), scan_len, stream));
 
   // compute final size
   value_idx size = 0;
diff --git a/cpp/include/raft/sparse/op/detail/sort.h b/cpp/include/raft/sparse/op/detail/sort.h
index ffdde947db..a98dc35e86 100644
--- a/cpp/include/raft/sparse/op/detail/sort.h
+++ b/cpp/include/raft/sparse/op/detail/sort.h
@@ -12,6 +12,7 @@
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <cuda/std/tuple>
@@ -50,6 +51,8 @@ struct TupleComp {
  * @brief Sorts the arrays that comprise the coo matrix
  * by row and then by column.
  *
+ * @param dry_run when true, allocates a best-effort workspace estimate
+ *                without launching kernels (for memory tracking)
  * @param m number of rows in coo matrix
  * @param n number of cols in coo matrix
  * @param nnz number of non-zeros
@@ -59,8 +62,18 @@ struct TupleComp {
  * @param stream: cuda stream to use
  */
 template <typename T, typename IdxT = int, typename nnz_t>
-void coo_sort(IdxT m, IdxT n, nnz_t nnz, IdxT* rows, IdxT* cols, T* vals, cudaStream_t stream)
+void coo_sort(
+  bool dry_run, IdxT m, IdxT n, nnz_t nnz, IdxT* rows, IdxT* cols, T* vals, cudaStream_t stream)
 {
+  if (dry_run) {
+    // Best-effort upper bound for thrust::sort_by_key workspace.
+    // Double-buffer estimate for large inputs; minimum 4096 for small inputs
+    // where per-allocation alignment overhead dominates.
+    auto sort_data_bytes = static_cast<std::size_t>(nnz) * (sizeof(IdxT) * 2 + sizeof(T));
+    rmm::device_uvector<char> sort_ws_est(std::max(sort_data_bytes * 2, std::size_t{4096}), stream);
+    return;
+  }
+
   auto coo_indices = thrust::make_zip_iterator(cuda::std::make_tuple(rows, cols));
 
   // get all the colors in contiguous locations so we can map them to warps.
@@ -77,7 +90,7 @@ template <typename T, typename IdxT = int, typename nnz_t>
 void coo_sort(COO<T, IdxT, nnz_t>* const in, cudaStream_t stream)
 {
   coo_sort<T, IdxT, nnz_t>(
-    in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream);
+    false, in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream);
 }
 
 /**
diff --git a/cpp/include/raft/sparse/op/sort.cuh b/cpp/include/raft/sparse/op/sort.cuh
index e33da812f9..188ff398d0 100644
--- a/cpp/include/raft/sparse/op/sort.cuh
+++ b/cpp/include/raft/sparse/op/sort.cuh
@@ -8,6 +8,8 @@
 #pragma once
 
 #include <raft/core/detail/macros.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/sparse/op/detail/sort.h>
 
@@ -30,7 +32,33 @@ namespace op {
 template <typename T, typename IdxT = int, typename nnz_t>
 void coo_sort(IdxT m, IdxT n, nnz_t nnz, IdxT* rows, IdxT* cols, T* vals, cudaStream_t stream)
 {
-  detail::coo_sort(m, n, nnz, rows, cols, vals, stream);
+  detail::coo_sort(false, m, n, nnz, rows, cols, vals, stream);
+}
+
+/**
+ * @brief Sorts the arrays that comprise the coo matrix
+ * by row and then by column (dry-run aware).
+ *
+ * @param handle raft resources handle
+ * @param m number of rows in coo matrix
+ * @param n number of cols in coo matrix
+ * @param nnz number of non-zeros
+ * @param rows rows array from coo matrix
+ * @param cols cols array from coo matrix
+ * @param vals vals array from coo matrix
+ */
+template <typename T, typename IdxT = int, typename nnz_t>
+void coo_sort(
+  raft::resources const& handle, IdxT m, IdxT n, nnz_t nnz, IdxT* rows, IdxT* cols, T* vals)
+{
+  detail::coo_sort(resource::get_dry_run_flag(handle),
+                   m,
+                   n,
+                   nnz,
+                   rows,
+                   cols,
+                   vals,
+                   resource::get_cuda_stream(handle));
 }
 
 /**
@@ -42,8 +70,7 @@ void coo_sort(IdxT m, IdxT n, nnz_t nnz, IdxT* rows, IdxT* cols, T* vals, cudaSt
 template <typename T, typename IdxT = int, typename nnz_t>
 void coo_sort(COO<T, IdxT, nnz_t>* const in, cudaStream_t stream)
 {
-  coo_sort<T, IdxT, nnz_t>(
-    in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream);
+  detail::coo_sort(in, stream);
 }
 
 /**
diff --git a/cpp/include/raft/sparse/solver/detail/cholesky_qr.cuh b/cpp/include/raft/sparse/solver/detail/cholesky_qr.cuh
index 2176c38b2b..fcca96af4d 100644
--- a/cpp/include/raft/sparse/solver/detail/cholesky_qr.cuh
+++ b/cpp/include/raft/sparse/solver/detail/cholesky_qr.cuh
@@ -5,9 +5,13 @@
 
 #pragma once
 
+#include <raft/core/copy.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/host_mdspan.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/cusolver_dn_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/linalg/detail/cusolver_wrappers.hpp>
@@ -62,6 +66,12 @@ bool cholesky_qr_pass(raft::resources const& handle,
                      k,
                      stream);
 
+  if (raft::resource::get_dry_run_flag(handle)) {
+    // Return false to force fallback to QR in dry run mode to track its memory usage.
+    // There are no more allocations to track below this point - safe to return early.
+    return false;
+  }
+
   // L = cholesky(W, LOWER)  — W is overwritten with L in lower triangle
   RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf(
     cusolver_h, CUBLAS_FILL_MODE_LOWER, k, W, k, workspace, workspace_size, dev_info, stream));
diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
index 9aa4304c04..543bbbc16b 100644
--- a/cpp/include/raft/sparse/solver/detail/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
@@ -18,6 +18,7 @@
 #include <raft/core/mdspan_types.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/core/types.hpp>
 #include <raft/linalg/add.cuh>
@@ -142,7 +143,8 @@ void lanczos_solve_ritz(
   raft::device_vector_view<ValueTypeT> sm_eigenvalues,
   raft::device_matrix_view<ValueTypeT, uint32_t, raft::col_major> sm_eigenvectors)
 {
-  auto stream = resource::get_cuda_stream(handle);
+  auto stream           = resource::get_cuda_stream(handle);
+  bool const is_dry_run = resource::get_dry_run_flag(handle);
 
   ValueTypeT zero = 0;
   auto triangular_matrix =
@@ -153,19 +155,18 @@ void lanczos_solve_ritz(
     raft::make_device_vector_view<const ValueTypeT, uint32_t>(alpha.data_handle(), ncv);
   raft::matrix::set_diagonal(handle, alphaVec, triangular_matrix.view());
 
-  // raft::matrix::initializeDiagonalMatrix(
-  //   alpha.data_handle(), triangular_matrix.data_handle(), ncv, ncv, stream);
-
-  int blockSize = 256;
-  int numBlocks = (ncv + blockSize - 1) / blockSize;
-  kernel_triangular_populate<ValueTypeT>
-    <<<blockSize, numBlocks, 0, stream>>>(triangular_matrix.data_handle(), beta.data_handle(), ncv);
-
-  if (beta_k) {
-    int threadsPerBlock = 256;
-    int blocksPerGrid   = (k + threadsPerBlock - 1) / threadsPerBlock;
-    kernel_triangular_beta_k<ValueTypeT><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
-      triangular_matrix.data_handle(), beta_k.value().data_handle(), (int)k, ncv);
+  if (!is_dry_run) {
+    // Execution configuration order is <<<grid, block>>>: numBlocks blocks of blockSize threads.
+    int blockSize = 256;
+    int numBlocks = (ncv + blockSize - 1) / blockSize;
+    kernel_triangular_populate<ValueTypeT><<<numBlocks, blockSize, 0, stream>>>(
+      triangular_matrix.data_handle(), beta.data_handle(), ncv);
+    if (beta_k) {
+      int threadsPerBlock = 256;
+      int blocksPerGrid   = (k + threadsPerBlock - 1) / threadsPerBlock;
+      kernel_triangular_beta_k<ValueTypeT><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+        triangular_matrix.data_handle(), beta_k.value().data_handle(), (int)k, ncv);
+    }
   }
 
   auto triangular_matrix_view =
@@ -194,32 +195,36 @@ void lanczos_solve_ritz(
     eigenvectors_k_slice = raft::make_device_matrix_view<ValueTypeT, IndexTypeT, raft::col_major>(
       eigenvectors.data_handle() + (ncv - nEigVecs) * ncv, ncv, nEigVecs);
   } else if (which == LANCZOS_WHICH::SM || which == LANCZOS_WHICH::LM) {
-    thrust::sequence(thrust::device, indices.data_handle(), indices.data_handle() + ncv, 0);
-
-    // Sort indices by absolute eigenvalues (magnitude) using a custom comparator
-    thrust::sort(thrust::device,
-                 indices.data_handle(),
-                 indices.data_handle() + ncv,
-                 [eigenvalues = eigenvalues.data_handle()] __device__(int a, int b) {
-                   return fabsf(eigenvalues[a]) < fabsf(eigenvalues[b]);
-                 });
-
-    if (which == LANCZOS_WHICH::SM) {
-      // Take the first nEigVecs indices (smallest magnitude)
-      raft::copy(selected_indices.data_handle(), indices.data_handle(), nEigVecs, stream);
-    } else if (which == LANCZOS_WHICH::LM) {
-      // Take the last nEigVecs indices (largest magnitude)
-      raft::copy(
-        selected_indices.data_handle(), indices.data_handle() + (ncv - nEigVecs), nEigVecs, stream);
+    if (!is_dry_run) {  // TODO: we must be missing some allocations in thrust helpers here
+      thrust::sequence(thrust::device, indices.data_handle(), indices.data_handle() + ncv, 0);
+
+      // Sort indices by absolute eigenvalues (magnitude) using a custom comparator
+      thrust::sort(thrust::device,
+                   indices.data_handle(),
+                   indices.data_handle() + ncv,
+                   [eigenvalues = eigenvalues.data_handle()] __device__(int a, int b) {
+                     return fabsf(eigenvalues[a]) < fabsf(eigenvalues[b]);
+                   });
+
+      if (which == LANCZOS_WHICH::SM) {
+        // Take the first nEigVecs indices (smallest magnitude)
+        raft::copy(selected_indices.data_handle(), indices.data_handle(), nEigVecs, stream);
+      } else if (which == LANCZOS_WHICH::LM) {
+        // Take the last nEigVecs indices (largest magnitude)
+        raft::copy(selected_indices.data_handle(),
+                   indices.data_handle() + (ncv - nEigVecs),
+                   nEigVecs,
+                   stream);
+      }
+
+      // Re-sort these indices by algebraic value to maintain algebraic ordering
+      thrust::sort(thrust::device,
+                   selected_indices.data_handle(),
+                   selected_indices.data_handle() + nEigVecs,
+                   [eigenvalues = eigenvalues.data_handle()] __device__(int a, int b) {
+                     return eigenvalues[a] < eigenvalues[b];
+                   });
     }
-
-    // Re-sort these indices by algebraic value to maintain algebraic ordering
-    thrust::sort(thrust::device,
-                 selected_indices.data_handle(),
-                 selected_indices.data_handle() + nEigVecs,
-                 [eigenvalues = eigenvalues.data_handle()] __device__(int a, int b) {
-                   return eigenvalues[a] < eigenvalues[b];
-                 });
     raft::matrix::gather(
       handle,
       raft::make_device_matrix_view<const ValueTypeT, uint32_t, raft::row_major>(
@@ -270,15 +275,13 @@ void lanczos_aux(raft::resources const& handle,
   } else {
     spmv_alg = CUSPARSE_SPMV_ALG_DEFAULT;
   }
-  auto stream = resource::get_cuda_stream(handle);
+  auto stream           = resource::get_cuda_stream(handle);
+  bool const is_dry_run = resource::get_dry_run_flag(handle);
 
   IndexTypeT n  = A.structure_view().get_n_rows();
   auto v_vector = raft::make_device_vector_view<const ValueTypeT>(v.data_handle(), n);
   auto u_vector = raft::make_device_vector_view<const ValueTypeT>(u.data_handle(), n);
 
-  raft::copy(
-    v.data_handle(), V.data_handle() + start_idx * V.stride(0), n, stream);  // V(start_idx, 0)
-
   auto cusparse_h                 = resource::get_cusparse_handle(handle);
   cusparseSpMatDescr_t cusparse_A = raft::sparse::linalg::detail::create_descriptor(A);
 
@@ -300,17 +303,24 @@ void lanczos_aux(raft::resources const& handle,
                                                 stream);
   auto cusparse_spmv_buffer = raft::make_device_vector<ValueTypeT>(handle, bufferSize);
 
+  if (!is_dry_run) {
+    raft::copy(
+      v.data_handle(), V.data_handle() + start_idx * V.stride(0), n, stream);  // V(start_idx, 0)
+  }
+
   for (int i = start_idx; i < end_idx; i++) {
-    raft::sparse::detail::cusparsespmv(cusparse_h,
-                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                       &one,
-                                       cusparse_A,
-                                       cusparse_v,
-                                       &zero,
-                                       cusparse_u,
-                                       spmv_alg,
-                                       cusparse_spmv_buffer.data_handle(),
-                                       stream);
+    if (!is_dry_run) {
+      raft::sparse::detail::cusparsespmv(cusparse_h,
+                                         CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                         &one,
+                                         cusparse_A,
+                                         cusparse_v,
+                                         &zero,
+                                         cusparse_u,
+                                         spmv_alg,
+                                         cusparse_spmv_buffer.data_handle(),
+                                         stream);
+    }
 
     auto alpha_i =
       raft::make_device_scalar_view(alpha.data_handle() + i * alpha.stride(1));  // alpha(0, i)
@@ -324,10 +334,12 @@ void lanczos_aux(raft::resources const& handle,
     ValueTypeT b            = 0;
     ValueTypeT mone         = -1;
 
-    raft::copy<ValueTypeT>(
-      &b, beta.data_handle() + ((i - 1 + ncv) % ncv) * beta.stride(1), 1, stream);
-    raft::copy<ValueTypeT>(
-      &alpha_i_host, alpha.data_handle() + i * alpha.stride(1), 1, stream);  // alpha(0, i)
+    if (!is_dry_run) {
+      raft::copy<ValueTypeT>(
+        &b, beta.data_handle() + ((i - 1 + ncv) % ncv) * beta.stride(1), 1, stream);
+      raft::copy<ValueTypeT>(
+        &alpha_i_host, alpha.data_handle() + i * alpha.stride(1), 1, stream);  // alpha(0, i)
+    }
 
     raft::linalg::axpy(handle, n, &alpha_i_host, v.data_handle(), 1, vv.data_handle(), 1, stream);
     raft::linalg::axpy(handle,
@@ -371,7 +383,9 @@ void lanczos_aux(raft::resources const& handle,
     auto uu_i = raft::make_device_scalar_view(uu.data_handle() + uu.stride(1) * i);  // uu(0, i)
     raft::linalg::add(handle, make_const_mdspan(alpha_i), make_const_mdspan(uu_i), alpha_i);
 
-    kernel_clamp_down<<<1, 1, 0, stream>>>(alpha_i.data_handle(), static_cast<ValueTypeT>(1e-9));
+    if (!is_dry_run) {
+      kernel_clamp_down<<<1, 1, 0, stream>>>(alpha_i.data_handle(), static_cast<ValueTypeT>(1e-9));
+    }
 
     auto output = raft::make_device_vector_view<ValueTypeT, uint32_t>(
       beta.data_handle() + beta.stride(1) * i, 1);
@@ -379,22 +393,26 @@ void lanczos_aux(raft::resources const& handle,
     raft::linalg::norm<raft::linalg::L2Norm, raft::Apply::ALONG_ROWS>(
       handle, input, output, raft::sqrt_op());
 
-    int blockSize = 256;
-    int numBlocks = (n + blockSize - 1) / blockSize;
+    if (!is_dry_run) {
+      int blockSize = 256;
+      int numBlocks = (n + blockSize - 1) / blockSize;
 
-    kernel_clamp_down_vector<<<numBlocks, blockSize, 0, stream>>>(
-      u.data_handle(), static_cast<ValueTypeT>(1e-7), n);
+      kernel_clamp_down_vector<<<numBlocks, blockSize, 0, stream>>>(
+        u.data_handle(), static_cast<ValueTypeT>(1e-7), n);
 
-    kernel_clamp_down<<<1, 1, 0, stream>>>(beta.data_handle() + beta.stride(1) * i,
-                                           static_cast<ValueTypeT>(1e-6));
+      kernel_clamp_down<<<1, 1, 0, stream>>>(beta.data_handle() + beta.stride(1) * i,
+                                             static_cast<ValueTypeT>(1e-6));
+    }
 
     if (i >= end_idx - 1) { break; }
 
-    int threadsPerBlock = 256;
-    int blocksPerGrid   = (n + threadsPerBlock - 1) / threadsPerBlock;
+    if (!is_dry_run) {
+      int threadsPerBlock = 256;
+      int blocksPerGrid   = (n + threadsPerBlock - 1) / threadsPerBlock;
 
-    kernel_normalize<ValueTypeT><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
-      u.data_handle(), beta.data_handle(), i, n, v.data_handle(), V.data_handle(), n);
+      kernel_normalize<ValueTypeT><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+        u.data_handle(), beta.data_handle(), i, n, v.data_handle(), V.data_handle(), n);
+    }
   }
 }
 
@@ -422,9 +440,10 @@ auto lanczos_smallest(raft::resources const& handle,
   } else {
     spmv_alg = CUSPARSE_SPMV_ALG_DEFAULT;
   }
-  int n       = A.structure_view().get_n_rows();
-  int ncv     = restartIter;
-  auto stream = resource::get_cuda_stream(handle);
+  int n           = A.structure_view().get_n_rows();
+  int ncv         = restartIter;
+  auto stream     = resource::get_cuda_stream(handle);
+  bool is_dry_run = resource::get_dry_run_flag(handle);
 
   auto V = raft::make_device_matrix<ValueTypeT, uint32_t, raft::row_major>(handle, ncv, n);
   auto V_0_view =
@@ -433,7 +452,7 @@ auto lanczos_smallest(raft::resources const& handle,
 
   auto u        = raft::make_device_matrix<ValueTypeT, uint32_t, raft::row_major>(handle, 1, n);
   auto u_vector = raft::make_device_vector_view<ValueTypeT, uint32_t>(u.data_handle(), n);
-  raft::copy(u.data_handle(), v0, n, stream);
+  if (!is_dry_run) { raft::copy(u.data_handle(), v0, n, stream); }
 
   auto cublas_h = resource::get_cublas_handle(handle);
   auto v0nrm    = raft::make_device_vector<ValueTypeT, uint32_t>(handle, 1);
@@ -529,8 +548,14 @@ auto lanczos_smallest(raft::resources const& handle,
     raft::make_device_matrix_view<const ValueTypeT>(beta_k.data_handle(), 1, nEigVecs);
   raft::linalg::norm<raft::linalg::L2Norm, raft::Apply::ALONG_ROWS>(
     handle, input, output.view(), raft::sqrt_op());
-  raft::copy(&res, output.data_handle(), 1, stream);
-  resource::sync_stream(handle, stream);
+  if (!is_dry_run) {
+    raft::copy(&res, output.data_handle(), 1, stream);
+    resource::sync_stream(handle, stream);
+  } else {
+    // Force exactly one loop iteration so the MR records all allocations inside the loop body.
+    res     = tol + 1;
+    maxIter = ncv + (ncv - nEigVecs);
+  }
 
   auto uu  = raft::make_device_matrix<ValueTypeT>(handle, 1, nEigVecs);
   int iter = ncv;
@@ -539,12 +564,14 @@ auto lanczos_smallest(raft::resources const& handle,
       beta.data_handle(), 1, nEigVecs);
     raft::matrix::fill(handle, beta_view, zero);
 
-    raft::copy(alpha.data_handle(), eigenvalues_k.data_handle(), nEigVecs, stream);
+    if (!is_dry_run) {
+      raft::copy(alpha.data_handle(), eigenvalues_k.data_handle(), nEigVecs, stream);
+    }
 
     auto x_T =
       raft::make_device_matrix_view<ValueTypeT>(ritz_eigenvectors.data_handle(), nEigVecs, n);
 
-    raft::copy(V.data_handle(), x_T.data_handle(), nEigVecs * n, stream);
+    if (!is_dry_run) { raft::copy(V.data_handle(), x_T.data_handle(), nEigVecs * n, stream); }
 
     ValueTypeT one  = 1;
     ValueTypeT mone = -1;
@@ -612,16 +639,18 @@ auto lanczos_smallest(raft::resources const& handle,
                                                   stream);
     auto cusparse_spmv_buffer = raft::make_device_vector<ValueTypeT>(handle, bufferSize);
 
-    raft::sparse::detail::cusparsespmv(cusparse_h,
-                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                       &one,
-                                       cusparse_A,
-                                       cusparse_v,
-                                       &zero,
-                                       cusparse_u,
-                                       spmv_alg,
-                                       cusparse_spmv_buffer.data_handle(),
-                                       stream);
+    if (!is_dry_run) {
+      raft::sparse::detail::cusparsespmv(cusparse_h,
+                                         CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                         &one,
+                                         cusparse_A,
+                                         cusparse_v,
+                                         &zero,
+                                         cusparse_u,
+                                         spmv_alg,
+                                         cusparse_spmv_buffer.data_handle(),
+                                         stream);
+    }
 
     auto alpha_k = raft::make_device_scalar_view<ValueTypeT>(alpha.data_handle() + nEigVecs);
 
@@ -742,13 +771,17 @@ auto lanczos_smallest(raft::resources const& handle,
       raft::make_device_matrix_view<const ValueTypeT>(beta_k.data_handle(), 1, nEigVecs);
     raft::linalg::norm<raft::linalg::L2Norm, raft::Apply::ALONG_ROWS>(
       handle, input2, output2.view(), raft::sqrt_op());
-    raft::copy(&res, output2.data_handle(), 1, stream);
-    resource::sync_stream(handle, stream);
+    if (!is_dry_run) {
+      raft::copy(&res, output2.data_handle(), 1, stream);
+      resource::sync_stream(handle, stream);
+    }
     RAFT_LOG_TRACE("Iteration %f: residual (tolerance) %d", iter, res);
   }
 
-  raft::copy(eigVals_dev, eigenvalues_k.data_handle(), nEigVecs, stream);
-  raft::copy(eigVecs_dev, ritz_eigenvectors.data_handle(), n * nEigVecs, stream);
+  if (!is_dry_run) {
+    raft::copy(eigVals_dev, eigenvalues_k.data_handle(), nEigVecs, stream);
+    raft::copy(eigVecs_dev, ritz_eigenvectors.data_handle(), n * nEigVecs, stream);
+  }
 
   return 0;
 }
diff --git a/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh b/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh
index 85268141ce..04821047a5 100644
--- a/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh
+++ b/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh
@@ -5,6 +5,7 @@
 
 #pragma once
 
+#include <raft/core/copy.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/logger.hpp>
@@ -228,7 +229,10 @@ void sparse_randomized_svd(
   }
 
   // Step 9: Truncate S, optionally compute Vt
-  raft::copy(singular_values.data_handle(), S_full.data_handle(), k, stream);
+  raft::copy(handle,
+             singular_values,
+             raft::make_device_vector_view<const ValueTypeT, uint32_t>(S_full.data_handle(),
+                                                                       static_cast<uint32_t>(k)));
 
   // Vt[:k, :] = U_bt[:, :k]^T
   // U_bt is col-major (n, block_size); transpose its first k columns to (k, n) col-major.
diff --git a/cpp/include/raft/sparse/solver/detail/svds_sign_correction.cuh b/cpp/include/raft/sparse/solver/detail/svds_sign_correction.cuh
index e35889d91d..6e1180d950 100644
--- a/cpp/include/raft/sparse/solver/detail/svds_sign_correction.cuh
+++ b/cpp/include/raft/sparse/solver/detail/svds_sign_correction.cuh
@@ -7,6 +7,7 @@
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/util/cuda_utils.cuh>
 
@@ -148,8 +149,10 @@ void svd_sign_correction(
   ValueTypeT* U_ptr  = U ? U->data_handle() : nullptr;
   ValueTypeT* Vt_ptr = Vt ? Vt->data_handle() : nullptr;
 
-  svd_sign_correction_kernel<<<k, threads_per_block, smem_size, stream>>>(U_ptr, Vt_ptr, m, n, k);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  if (!raft::resource::get_dry_run_flag(handle)) {
+    svd_sign_correction_kernel<<<k, threads_per_block, smem_size, stream>>>(U_ptr, Vt_ptr, m, n, k);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
 }
 
 }  // namespace raft::sparse::solver::detail
diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
index 87e45cb3fa..9db959b60d 100644
--- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
@@ -11,6 +11,7 @@
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/cusparse_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
@@ -90,7 +91,8 @@ template <typename value_type>
 class vector_t {
  public:
   vector_t(resources const& raft_handle, size_type sz)
-    : buffer_(sz, resource::get_cuda_stream(raft_handle)),
+    : handle_(raft_handle),
+      buffer_(sz, resource::get_cuda_stream(raft_handle)),
       thrust_policy(resource::get_thrust_policy(raft_handle))
   {
   }
@@ -103,6 +105,7 @@ class vector_t {
 
   value_type nrm1() const
   {
+    if (resource::get_dry_run_flag(handle_)) { return value_type{0}; }
     return thrust::reduce(
       thrust_policy,
       buffer_.data(),
@@ -117,6 +120,7 @@ class vector_t {
 
   void fill(value_type value)
   {
+    if (resource::get_dry_run_flag(handle_)) { return; }
     thrust::fill_n(thrust_policy, buffer_.data(), buffer_.size(), value);
   }
 
@@ -124,6 +128,7 @@ class vector_t {
   using thrust_exec_policy_t =
     thrust::detail::execute_with_allocator<rmm::mr::thrust_allocator<char>,
                                            thrust::cuda_cub::execute_on_stream_nosync_base>;
+  raft::resources const& handle_;
   rmm::device_uvector<value_type> buffer_;
   const thrust_exec_policy_t thrust_policy;
 };
@@ -211,6 +216,7 @@ struct sparse_matrix_t {
 
     auto cusparse_h = resource::get_cusparse_handle(handle_);
     auto stream     = resource::get_cuda_stream(handle_);
+    bool is_dry_run = resource::get_dry_run_flag(handle_);
 
     cusparseOperation_t trans = transpose ? CUSPARSE_OPERATION_TRANSPOSE :  // transpose
                                   CUSPARSE_OPERATION_NON_TRANSPOSE;         // non-transpose
@@ -239,7 +245,7 @@ struct sparse_matrix_t {
     RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec(&vecX, size_x, x));
 
     rmm::device_uvector<value_type> y_tmp(size_y, stream);
-    raft::copy(y_tmp.data(), y, size_y, stream);
+    if (!is_dry_run) { raft::copy(y_tmp.data(), y, size_y, stream); }
 
     cusparseDnVecDescr_t vecY;
     RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec(&vecY, size_y, y_tmp.data()));
@@ -256,11 +262,21 @@ struct sparse_matrix_t {
 
     // finally perform SpMV:
     //
-    RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(
-      cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, external_buffer.raw(), stream));
-
-    // FIXME: This is a workaround for a cusparse issue being encountered in CUDA 12
-    raft::copy(y, y_tmp.data(), size_y, stream);
+    if (!is_dry_run) {
+      RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(cusparse_h,
+                                                           trans,
+                                                           &alpha,
+                                                           matA,
+                                                           vecX,
+                                                           &beta,
+                                                           vecY,
+                                                           spmv_alg,
+                                                           external_buffer.raw(),
+                                                           stream));
+
+      // FIXME: This is a workaround for a cusparse issue being encountered in CUDA 12
+      raft::copy(y, y_tmp.data(), size_y, stream);
+    }
     // free descriptors:
     //(TODO: maybe wrap them in a RAII struct?)
     //
@@ -268,6 +284,7 @@ struct sparse_matrix_t {
     RAFT_CUSPARSE_TRY(cusparseDestroyDnVec(vecX));
     RAFT_CUSPARSE_TRY(cusparseDestroySpMat(matA));
 #else
+    if (is_dry_run) { return; }
     RAFT_CUSPARSE_TRY(
       raft::sparse::detail::cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream));
     cusparseMatDescr_t descr = 0;
@@ -366,27 +383,30 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type, nnz_type> {
     constexpr int BLOCK_SIZE = 1024;
     auto n                   = sparse_matrix_t<index_type, value_type, nnz_type>::nrows_;
 
-    auto handle   = sparse_matrix_t<index_type, value_type, nnz_type>::get_handle();
-    auto cublas_h = resource::get_cublas_handle(handle);
-    auto stream   = resource::get_cuda_stream(handle);
+    auto handle     = sparse_matrix_t<index_type, value_type, nnz_type>::get_handle();
+    auto cublas_h   = resource::get_cublas_handle(handle);
+    auto stream     = resource::get_cuda_stream(handle);
+    bool is_dry_run = resource::get_dry_run_flag(handle);
 
     // scales y by beta:
     //
-    if (beta == 0) {
-      RAFT_CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream));
-    } else if (beta != 1) {
-      // TODO: Call from public API when ready
-      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal(cublas_h, n, &beta, y, 1, stream));
+    if (!is_dry_run) {
+      if (beta == 0) {
+        RAFT_CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream));
+      } else if (beta != 1) {
+        // TODO: Call from public API when ready
+        RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal(cublas_h, n, &beta, y, 1, stream));
+      }
+
+      // Apply diagonal matrix
+      //
+      dim3 gridDim{std::min<unsigned int>((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1};
+
+      dim3 blockDim{BLOCK_SIZE, 1, 1};
+      diagmv<<<gridDim, blockDim, 0, stream>>>(n, alpha, diagonal_.raw(), x, y);
+      RAFT_CHECK_CUDA(stream);
     }
 
-    // Apply diagonal matrix
-    //
-    dim3 gridDim{std::min<unsigned int>((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1};
-
-    dim3 blockDim{BLOCK_SIZE, 1, 1};
-    diagmv<<<gridDim, blockDim, 0, stream>>>(n, alpha, diagonal_.raw(), x, y);
-    RAFT_CHECK_CUDA(stream);
-
     // Apply adjacency matrix
     //
     sparse_matrix_t<index_type, value_type, nnz_type>::mv(
@@ -429,9 +449,10 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type, nnz_type
   {
     auto n = sparse_matrix_t<index_type, value_type, nnz_type>::nrows_;
 
-    auto handle   = sparse_matrix_t<index_type, value_type, nnz_type>::get_handle();
-    auto cublas_h = resource::get_cublas_handle(handle);
-    auto stream   = resource::get_cuda_stream(handle);
+    auto handle     = sparse_matrix_t<index_type, value_type, nnz_type>::get_handle();
+    auto cublas_h   = resource::get_cublas_handle(handle);
+    auto stream     = resource::get_cuda_stream(handle);
+    bool is_dry_run = resource::get_dry_run_flag(handle);
 
     // y = A*x
     //
@@ -443,29 +464,31 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type, nnz_type
     //
     // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res);
     // TODO: Call from public API when ready
-    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(
-      cublas_h,
-      n,
-      laplacian_matrix_t<index_type, value_type, nnz_type>::diagonal_.raw(),
-      1,
-      x,
-      1,
-      &dot_res,
-      stream));
-
-    // y = y -(gamma/edge_sum)*d
-    //
-    value_type gamma_ = -dot_res / edge_sum_;
-    // TODO: Call from public API when ready
-    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasaxpy(
-      cublas_h,
-      n,
-      &gamma_,
-      laplacian_matrix_t<index_type, value_type, nnz_type>::diagonal_.raw(),
-      1,
-      y,
-      1,
-      stream));
+    if (!is_dry_run) {
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(
+        cublas_h,
+        n,
+        laplacian_matrix_t<index_type, value_type, nnz_type>::diagonal_.raw(),
+        1,
+        x,
+        1,
+        &dot_res,
+        stream));
+
+      // y = y -(gamma/edge_sum)*d
+      //
+      value_type gamma_ = -dot_res / edge_sum_;
+      // TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasaxpy(
+        cublas_h,
+        n,
+        &gamma_,
+        laplacian_matrix_t<index_type, value_type, nnz_type>::diagonal_.raw(),
+        1,
+        y,
+        1,
+        stream));
+    }
   }
 
   value_type edge_sum_;
diff --git a/cpp/include/raft/spectral/detail/modularity_maximization.hpp b/cpp/include/raft/spectral/detail/modularity_maximization.hpp
index c1ff1f52a7..a1a2a26005 100644
--- a/cpp/include/raft/spectral/detail/modularity_maximization.hpp
+++ b/cpp/include/raft/spectral/detail/modularity_maximization.hpp
@@ -9,6 +9,7 @@
 #include <raft/core/logger_macros.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/linalg/normalize.cuh>
 #include <raft/spectral/detail/spectral_util.cuh>
@@ -47,6 +48,7 @@ void analyzeModularity(
   vertex_t const* __restrict__ clusters,
   weight_t& modularity)
 {
+  bool is_dry_run = resource::get_dry_run_flag(handle);
   RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
 
   vertex_t i;
@@ -61,11 +63,19 @@ void analyzeModularity(
   raft::spectral::matrix::vector_t<weight_t> Bx(handle, n);
 
   // Initialize cuBLAS
-  RAFT_CUBLAS_TRY(linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  if (!is_dry_run) {
+    RAFT_CUBLAS_TRY(
+      linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  }
 
   // Initialize Modularity
   raft::spectral::matrix::modularity_matrix_t<vertex_t, weight_t, nnz_t> B{handle, csr_m};
 
+  if (is_dry_run) {
+    // Early stopping because construct_indicator doesn't allocate anything.
+    return;
+  }
+
   // Initialize output
   modularity = 0;
 
diff --git a/cpp/include/raft/spectral/detail/partition.hpp b/cpp/include/raft/spectral/detail/partition.hpp
index 76ffa94e8e..d9031061c6 100644
--- a/cpp/include/raft/spectral/detail/partition.hpp
+++ b/cpp/include/raft/spectral/detail/partition.hpp
@@ -7,6 +7,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/sparse/linalg/laplacian.cuh>
 #include <raft/spectral/detail/spectral_util.cuh>
@@ -57,23 +58,25 @@ void analyzePartition(raft::resources const& handle,
   vertex_t i;
   vertex_t n = csr_m.nrows_;
 
-  auto stream   = resource::get_cuda_stream(handle);
-  auto cublas_h = resource::get_cublas_handle(handle);
-
   weight_t partEdgesCut, clustersize;
 
-  // Device memory
+  // Device memory - allocate before dry-run check to track allocations
   spectral::matrix::vector_t<weight_t> part_i(handle, n);
   spectral::matrix::vector_t<weight_t> Lx(handle, n);
 
+  // Initialize Laplacian - allocate before dry-run check to track allocations
+  /// sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
+  spectral::matrix::laplacian_matrix_t<vertex_t, weight_t, nnz_t> L{handle, csr_m};
+
+  if (resource::get_dry_run_flag(handle)) { return; }
+
+  auto stream   = resource::get_cuda_stream(handle);
+  auto cublas_h = resource::get_cublas_handle(handle);
+
   // Initialize cuBLAS
   RAFT_CUBLAS_TRY(
     raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
-  // Initialize Laplacian
-  /// sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
-  spectral::matrix::laplacian_matrix_t<vertex_t, weight_t, nnz_t> L{handle, csr_m};
-
   // Initialize output
   cost    = 0;
   edgeCut = 0;
diff --git a/cpp/include/raft/spectral/detail/spectral_util.cuh b/cpp/include/raft/spectral/detail/spectral_util.cuh
index d60a9d16b2..be3dc19232 100644
--- a/cpp/include/raft/spectral/detail/spectral_util.cuh
+++ b/cpp/include/raft/spectral/detail/spectral_util.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
@@ -35,6 +36,11 @@ void transform_eigen_matrix(raft::resources const& handle,
                             vertex_t nEigVecs,
                             weight_t* eigVecs)
 {
+  // Allocate before dry-run check to track allocation
+  raft::spectral::matrix::vector_t<weight_t> work(handle, nEigVecs * n);
+
+  if (resource::get_dry_run_flag(handle)) { return; }
+
   auto stream             = resource::get_cuda_stream(handle);
   auto cublas_h           = resource::get_cublas_handle(handle);
   auto thrust_exec_policy = resource::get_thrust_policy(handle);
@@ -77,7 +83,6 @@ void transform_eigen_matrix(raft::resources const& handle,
   // Transpose eigenvector matrix
   //   TODO: in-place transpose
   {
-    raft::spectral::matrix::vector_t<weight_t> work(handle, nEigVecs * n);
     // TODO: Call from public API when ready
     RAFT_CUBLAS_TRY(
       raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
@@ -135,6 +140,7 @@ bool construct_indicator(
   raft::spectral::matrix::vector_t<weight_t>& Bx,
   raft::spectral::matrix::laplacian_matrix_t<vertex_t, weight_t, nnz_t> const& B)
 {
+  if (resource::get_dry_run_flag(handle)) { return {}; }
   auto stream             = resource::get_cuda_stream(handle);
   auto cublas_h           = resource::get_cublas_handle(handle);
   auto thrust_exec_policy = resource::get_thrust_policy(handle);
diff --git a/cpp/include/raft/stats/accuracy.cuh b/cpp/include/raft/stats/accuracy.cuh
index 8d14ac75ee..89965ee9be 100644
--- a/cpp/include/raft/stats/accuracy.cuh
+++ b/cpp/include/raft/stats/accuracy.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/stats/detail/scores.cuh>
 
 namespace raft {
@@ -28,7 +29,7 @@ namespace stats {
 template <typename math_t>
 float accuracy(const math_t* predictions, const math_t* ref_predictions, int n, cudaStream_t stream)
 {
-  return detail::accuracy_score(predictions, ref_predictions, n, stream);
+  return detail::accuracy_score(false, predictions, ref_predictions, n, stream);
 }
 
 /**
@@ -54,7 +55,8 @@ float accuracy(raft::resources const& handle,
   RAFT_EXPECTS(predictions.is_exhaustive(), "predictions must be contiguous");
   RAFT_EXPECTS(ref_predictions.is_exhaustive(), "ref_predictions must be contiguous");
 
-  return detail::accuracy_score(predictions.data_handle(),
+  return detail::accuracy_score(resource::get_dry_run_flag(handle),
+                                predictions.data_handle(),
                                 ref_predictions.data_handle(),
                                 predictions.extent(0),
                                 resource::get_cuda_stream(handle));
diff --git a/cpp/include/raft/stats/adjusted_rand_index.cuh b/cpp/include/raft/stats/adjusted_rand_index.cuh
index e123b6a019..e691b6052f 100644
--- a/cpp/include/raft/stats/adjusted_rand_index.cuh
+++ b/cpp/include/raft/stats/adjusted_rand_index.cuh
@@ -16,6 +16,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/stats/detail/adjusted_rand_index.cuh>
 
 namespace raft {
@@ -38,7 +39,7 @@ double adjusted_rand_index(const T* firstClusterArray,
                            cudaStream_t stream)
 {
   return detail::compute_adjusted_rand_index<T, MathT>(
-    firstClusterArray, secondClusterArray, size, stream);
+    false, firstClusterArray, secondClusterArray, size, stream);
 }
 
 /**
@@ -66,7 +67,8 @@ double adjusted_rand_index(raft::resources const& handle,
   RAFT_EXPECTS(first_cluster_array.is_exhaustive(), "first_cluster_array must be contiguous");
   RAFT_EXPECTS(second_cluster_array.is_exhaustive(), "second_cluster_array must be contiguous");
 
-  return detail::compute_adjusted_rand_index<value_t, math_t>(first_cluster_array.data_handle(),
+  return detail::compute_adjusted_rand_index<value_t, math_t>(resource::get_dry_run_flag(handle),
+                                                              first_cluster_array.data_handle(),
                                                               second_cluster_array.data_handle(),
                                                               first_cluster_array.extent(0),
                                                               resource::get_cuda_stream(handle));
diff --git a/cpp/include/raft/stats/completeness_score.cuh b/cpp/include/raft/stats/completeness_score.cuh
index b60bc4c6ae..d056a43de0 100644
--- a/cpp/include/raft/stats/completeness_score.cuh
+++ b/cpp/include/raft/stats/completeness_score.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/stats/detail/homogeneity_score.cuh>
 
 namespace raft {
@@ -35,7 +36,7 @@ double completeness_score(const T* truthClusterArray,
                           cudaStream_t stream)
 {
   return detail::homogeneity_score(
-    predClusterArray, truthClusterArray, size, lower_label_range, upper_label_range, stream);
+    false, predClusterArray, truthClusterArray, size, lower_label_range, upper_label_range, stream);
 }
 
 /**
@@ -65,7 +66,8 @@ double completeness_score(raft::resources const& handle,
   RAFT_EXPECTS(truth_cluster_array.size() == pred_cluster_array.size(), "Size mismatch");
   RAFT_EXPECTS(truth_cluster_array.is_exhaustive(), "truth_cluster_array must be contiguous");
   RAFT_EXPECTS(pred_cluster_array.is_exhaustive(), "pred_cluster_array must be contiguous");
-  return detail::homogeneity_score(pred_cluster_array.data_handle(),
+  return detail::homogeneity_score(resource::get_dry_run_flag(handle),
+                                   pred_cluster_array.data_handle(),
                                    truth_cluster_array.data_handle(),
                                    truth_cluster_array.extent(0),
                                    lower_label_range,
diff --git a/cpp/include/raft/stats/contingency_matrix.cuh b/cpp/include/raft/stats/contingency_matrix.cuh
index f796ff04d4..4c28594f4c 100644
--- a/cpp/include/raft/stats/contingency_matrix.cuh
+++ b/cpp/include/raft/stats/contingency_matrix.cuh
@@ -13,6 +13,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/stats/detail/contingencyMatrix.cuh>
 
@@ -53,7 +54,7 @@ size_t getContingencyMatrixWorkspaceSize(int nSamples,
                                          T maxLabel = std::numeric_limits<T>::max())
 {
   return detail::getContingencyMatrixWorkspaceSize(
-    nSamples, groundTruth, stream, minLabel, maxLabel);
+    false, nSamples, groundTruth, stream, minLabel, maxLabel);
 }
 
 /**
@@ -116,6 +117,7 @@ void get_input_class_cardinality(raft::resources const& handle,
                                  raft::host_scalar_view<value_t> minLabel,
                                  raft::host_scalar_view<value_t> maxLabel)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(minLabel.data_handle() != nullptr, "Invalid minLabel pointer");
   RAFT_EXPECTS(maxLabel.data_handle() != nullptr, "Invalid maxLabel pointer");
   detail::getInputClassCardinality(groundTruth.data_handle(),
@@ -169,13 +171,16 @@ void contingency_matrix(raft::resources const& handle,
   if (min_label.has_value()) { min_label_value = min_label.value(); }
   if (max_label.has_value()) { max_label_value = max_label.value(); }
 
-  auto workspace_sz = detail::getContingencyMatrixWorkspaceSize(ground_truth.extent(0),
+  auto workspace_sz = detail::getContingencyMatrixWorkspaceSize(resource::get_dry_run_flag(handle),
+                                                                ground_truth.extent(0),
                                                                 ground_truth.data_handle(),
                                                                 resource::get_cuda_stream(handle),
                                                                 min_label_value,
                                                                 max_label_value);
   auto workspace    = raft::make_device_vector<char>(handle, workspace_sz);
 
+  if (resource::get_dry_run_flag(handle)) { return; }
+
   detail::contingencyMatrix<value_t, out_t>(ground_truth.data_handle(),
                                             predicted_label.data_handle(),
                                             ground_truth.extent(0),
diff --git a/cpp/include/raft/stats/detail/adjusted_rand_index.cuh b/cpp/include/raft/stats/detail/adjusted_rand_index.cuh
index bb1cc464e4..58685afea1 100644
--- a/cpp/include/raft/stats/detail/adjusted_rand_index.cuh
+++ b/cpp/include/raft/stats/detail/adjusted_rand_index.cuh
@@ -105,13 +105,15 @@ int countUnique(const T* arr, int size, T& minLabel, T& maxLabel, cudaStream_t s
  *        <a href="https://en.wikipedia.org/wiki/Rand_index">here</a>
  * @tparam T data-type for input label arrays
  * @tparam MathT integral data-type used for computing n-choose-r
+ * @param dry_run: whether to run in dry-run mode
  * @param firstClusterArray: the array of classes
  * @param secondClusterArray: the array of classes
  * @param size: the size of the data points of type int
  * @param stream: the cudaStream object
  */
 template <typename T, typename MathT = int>
-double compute_adjusted_rand_index(const T* firstClusterArray,
+double compute_adjusted_rand_index(bool dry_run,
+                                   const T* firstClusterArray,
                                    const T* secondClusterArray,
                                    int size,
                                    cudaStream_t stream)
@@ -120,56 +122,72 @@ double compute_adjusted_rand_index(const T* firstClusterArray,
     // 1 or 0 labels always have a perfect score. This also matches sklearn behavior.
     return 1.0;
   }
-  T minFirst, maxFirst, minSecond, maxSecond;
-  auto nUniqFirst      = countUnique(firstClusterArray, size, minFirst, maxFirst, stream);
-  auto nUniqSecond     = countUnique(secondClusterArray, size, minSecond, maxSecond, stream);
-  auto lowerLabelRange = std::min(minFirst, minSecond);
-  auto upperLabelRange = std::max(maxFirst, maxSecond);
-  auto nClasses        = upperLabelRange - lowerLabelRange + 1;
-  // degenerate case of single cluster or clusters each with just one element
-  if (nUniqFirst == nUniqSecond) {
-    if (nUniqFirst == 1 || nUniqFirst == size) return 1.0;
+  // Upper bound: worst case is each sample is a unique class
+  auto nClassesUpperBound = size;
+  T lowerLabelRange, upperLabelRange;
+  MathT nUniqClasses;
+  if (dry_run) {
+    // Upper bound for dry-run allocations. NOTE(review): nUniqClasses^2 may overflow MathT below.
+    nUniqClasses    = MathT(nClassesUpperBound);
+    lowerLabelRange = T(0);
+    upperLabelRange = T(nClassesUpperBound - 1);
+  } else {
+    T minFirst, maxFirst, minSecond, maxSecond;
+    auto nUniqFirst  = countUnique(firstClusterArray, size, minFirst, maxFirst, stream);
+    auto nUniqSecond = countUnique(secondClusterArray, size, minSecond, maxSecond, stream);
+    lowerLabelRange  = std::min(minFirst, minSecond);
+    upperLabelRange  = std::max(maxFirst, maxSecond);
+    auto nClasses    = upperLabelRange - lowerLabelRange + 1;
+    // degenerate case of single cluster or clusters each with just one element
+    if (nUniqFirst == nUniqSecond) {
+      if (nUniqFirst == 1 || nUniqFirst == size) return 1.0;
+    }
+    nUniqClasses = MathT(nClasses);
   }
-  auto nUniqClasses = MathT(nClasses);
   rmm::device_uvector<MathT> dContingencyMatrix(nUniqClasses * nUniqClasses, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(
-    dContingencyMatrix.data(), 0, nUniqClasses * nUniqClasses * sizeof(MathT), stream));
   auto workspaceSz = getContingencyMatrixWorkspaceSize<T, MathT>(
-    size, firstClusterArray, stream, lowerLabelRange, upperLabelRange);
+    dry_run, size, firstClusterArray, stream, lowerLabelRange, upperLabelRange);
   rmm::device_uvector<char> workspaceBuff(workspaceSz, stream);
-  contingencyMatrix<T, MathT>(firstClusterArray,
-                              secondClusterArray,
-                              size,
-                              dContingencyMatrix.data(),
-                              stream,
-                              workspaceBuff.data(),
-                              workspaceSz,
-                              lowerLabelRange,
-                              upperLabelRange);
+  if (!dry_run) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(
+      dContingencyMatrix.data(), 0, nUniqClasses * nUniqClasses * sizeof(MathT), stream));
+    contingencyMatrix<T, MathT>(firstClusterArray,
+                                secondClusterArray,
+                                size,
+                                dContingencyMatrix.data(),
+                                stream,
+                                workspaceBuff.data(),
+                                workspaceSz,
+                                lowerLabelRange,
+                                upperLabelRange);
+  }
   rmm::device_uvector<MathT> a(nUniqClasses, stream);
   rmm::device_uvector<MathT> b(nUniqClasses, stream);
   rmm::device_scalar<MathT> d_aCTwoSum(stream);
   rmm::device_scalar<MathT> d_bCTwoSum(stream);
   rmm::device_scalar<MathT> d_nChooseTwoSum(stream);
   MathT h_aCTwoSum, h_bCTwoSum, h_nChooseTwoSum;
-  RAFT_CUDA_TRY(cudaMemsetAsync(a.data(), 0, nUniqClasses * sizeof(MathT), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(b.data(), 0, nUniqClasses * sizeof(MathT), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_aCTwoSum.data(), 0, sizeof(MathT), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_bCTwoSum.data(), 0, sizeof(MathT), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_nChooseTwoSum.data(), 0, sizeof(MathT), stream));
-  // calculating the sum of NijC2
-  raft::linalg::mapThenSumReduce<MathT, nCTwo<MathT>>(d_nChooseTwoSum.data(),
-                                                      nUniqClasses * nUniqClasses,
-                                                      nCTwo<MathT>(),
-                                                      stream,
-                                                      dContingencyMatrix.data(),
-                                                      dContingencyMatrix.data());
+  if (!dry_run) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(a.data(), 0, nUniqClasses * sizeof(MathT), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(b.data(), 0, nUniqClasses * sizeof(MathT), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(d_aCTwoSum.data(), 0, sizeof(MathT), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(d_bCTwoSum.data(), 0, sizeof(MathT), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(d_nChooseTwoSum.data(), 0, sizeof(MathT), stream));
+    // calculating the sum of NijC2
+    raft::linalg::mapThenSumReduce<MathT, nCTwo<MathT>>(d_nChooseTwoSum.data(),
+                                                        nUniqClasses * nUniqClasses,
+                                                        nCTwo<MathT>(),
+                                                        stream,
+                                                        dContingencyMatrix.data(),
+                                                        dContingencyMatrix.data());
+  }
   // calculating the row-wise sums
-  raft::linalg::reduce<true, true, MathT, MathT>(
-    a.data(), dContingencyMatrix.data(), nUniqClasses, nUniqClasses, 0, stream);
+  raft::linalg::detail::reduce<true, true, MathT, MathT>(
+    dry_run, a.data(), dContingencyMatrix.data(), nUniqClasses, nUniqClasses, 0, stream);
   // calculating the column-wise sums
-  raft::linalg::reduce<true, false, MathT, MathT>(
-    b.data(), dContingencyMatrix.data(), nUniqClasses, nUniqClasses, 0, stream);
+  raft::linalg::detail::reduce<true, false, MathT, MathT>(
+    dry_run, b.data(), dContingencyMatrix.data(), nUniqClasses, nUniqClasses, 0, stream);
+  if (dry_run) { return 0.0; }
   // calculating the sum of number of unordered pairs for every element in a
   raft::linalg::mapThenSumReduce<MathT, nCTwo<MathT>>(
     d_aCTwoSum.data(), nUniqClasses, nCTwo<MathT>(), stream, a.data(), a.data());
diff --git a/cpp/include/raft/stats/detail/batched/silhouette_score.cuh b/cpp/include/raft/stats/detail/batched/silhouette_score.cuh
index 17829f1cd1..75a2e6b367 100644
--- a/cpp/include/raft/stats/detail/batched/silhouette_score.cuh
+++ b/cpp/include/raft/stats/detail/batched/silhouette_score.cuh
@@ -116,29 +116,18 @@ rmm::device_uvector<value_idx> get_cluster_counts(raft::resources const& handle,
 
   rmm::device_uvector<value_idx> cluster_counts(n_labels, stream);
 
-  rmm::device_uvector<char> workspace(1, stream);
+  // Query workspace size for countLabels (can run in dry-run)
+  size_t countLabels_ws_size = 0;
+  raft::stats::detail::countLabels<value_idx, label_idx>(
+    y, nullptr, n_rows, n_labels, nullptr, countLabels_ws_size, stream);
+  rmm::device_uvector<char> workspace(countLabels_ws_size, stream);
 
-  raft::stats::detail::countLabels(y, cluster_counts.data(), n_rows, n_labels, workspace, stream);
+  if (resource::get_dry_run_flag(handle)) { return cluster_counts; }
 
-  return cluster_counts;
-}
-
-template <typename value_t, typename value_idx>
-rmm::device_uvector<value_t> get_pairwise_distance(raft::resources const& handle,
-                                                   const value_t* left_begin,
-                                                   const value_t* right_begin,
-                                                   value_idx& n_left_rows,
-                                                   value_idx& n_right_rows,
-                                                   value_idx& n_cols,
-                                                   raft::distance::DistanceType metric,
-                                                   cudaStream_t stream)
-{
-  rmm::device_uvector<value_t> distances(n_left_rows * n_right_rows, stream);
-
-  raft::distance::pairwise_distance(
-    handle, left_begin, right_begin, distances.data(), n_left_rows, n_right_rows, n_cols, metric);
+  raft::stats::detail::countLabels(
+    y, cluster_counts.data(), n_rows, n_labels, workspace.data(), countLabels_ws_size, stream);
 
-  return distances;
+  return cluster_counts;
 }
 
 template <typename value_t, typename value_idx, typename label_idx>
@@ -178,6 +167,8 @@ value_t silhouette_score(
   ASSERT(n_labels >= 2 && n_labels <= (n_rows - 1),
          "silhouette Score not defined for the given number of labels!");
 
+  bool is_dry_run = resource::get_dry_run_flag(handle);
+
   rmm::device_uvector<value_idx> cluster_counts = get_cluster_counts(handle, y, n_rows, n_labels);
 
   auto stream = resource::get_cuda_stream(handle);
@@ -186,19 +177,24 @@ value_t silhouette_score(
   auto b_size = n_rows * n_labels;
 
   value_t *a_ptr, *b_ptr;
-  rmm::device_uvector<value_t> a(0, stream);
+  // since a and silhouette score per sample are same size, reusing
+  rmm::device_uvector<value_t> a((scores == nullptr || scores == NULL) ? n_rows : 0, stream);
   rmm::device_uvector<value_t> b(b_size, stream);
 
   b_ptr = b.data();
 
-  // since a and silhouette score per sample are same size, reusing
-  if (scores == nullptr || scores == NULL) {
-    a.resize(n_rows, stream);
+  if (a.size() > 0) {
     a_ptr = a.data();
   } else {
     a_ptr = scores;
   }
 
+  // Pre-allocate maximum distance buffer size (chunk * chunk) before dry-run guard
+  // to ensure allocation is tracked in dry-run mode and avoid reallocations in the loop
+  rmm::device_uvector<value_t> distances_buffer(chunk * chunk, stream);
+
+  if (is_dry_run) { return value_t{0}; }
+
   thrust::fill(policy, a_ptr, a_ptr + n_rows, 0);
 
   dim3 block_size(std::min(n_rows, 32), std::min(n_labels, 32));
@@ -223,8 +219,15 @@ value_t silhouette_score(
       auto n_left_rows  = (i + chunk) < n_rows ? chunk : (n_rows - i);
       auto n_right_rows = (j + chunk) < n_rows ? chunk : (n_rows - j);
 
-      rmm::device_uvector<value_t> distances = get_pairwise_distance(
-        handle, left_begin, right_begin, n_left_rows, n_right_rows, n_cols, metric, chunk_stream);
+      // Reuse chunk*chunk buffer. NOTE(review): shared by every chunk; confirm stream ordering.
+      raft::distance::pairwise_distance(handle,
+                                        left_begin,
+                                        right_begin,
+                                        distances_buffer.data(),
+                                        n_left_rows,
+                                        n_right_rows,
+                                        n_cols,
+                                        metric);
 
       compute_chunked_a_b(handle,
                           a_ptr,
@@ -234,7 +237,7 @@ value_t silhouette_score(
                           y,
                           n_labels,
                           cluster_counts.data(),
-                          distances.data(),
+                          distances_buffer.data(),
                           n_left_rows,
                           n_right_rows,
                           chunk_stream);
diff --git a/cpp/include/raft/stats/detail/contingencyMatrix.cuh b/cpp/include/raft/stats/detail/contingencyMatrix.cuh
index c6b47ee47e..935a5bebe6 100644
--- a/cpp/include/raft/stats/detail/contingencyMatrix.cuh
+++ b/cpp/include/raft/stats/detail/contingencyMatrix.cuh
@@ -190,6 +190,7 @@ void getInputClassCardinality(
  * @brief Calculate workspace size for running contingency matrix calculations
  * @tparam T label type
  * @tparam OutT output matrix type
+ * @param dry_run: whether to run in dry-run mode (returns upper-bound estimate)
  * @param nSamples: number of elements in input array
  * @param groundTruth: device 1-d array for ground truth (num of rows)
  * @param stream: cuda stream for execution
@@ -197,13 +198,26 @@ void getInputClassCardinality(
  * @param maxLabel: Optional, max value in input array
  */
 template <typename T, typename OutT = int>
-size_t getContingencyMatrixWorkspaceSize(int nSamples,
+size_t getContingencyMatrixWorkspaceSize(bool dry_run,
+                                         int nSamples,
                                          const T* groundTruth,
                                          cudaStream_t stream,
                                          T minLabel = std::numeric_limits<T>::max(),
                                          T maxLabel = std::numeric_limits<T>::max())
 {
   size_t workspaceSize = 0;
+  if (dry_run) {
+    // Upper-bound estimate for dry-run mode:
+    // Worst case: each sample is a unique class, so outDimN = nSamples
+    // For SORT_AND_GATOMICS implementation:
+    // - tmpStagingMemorySize = alignTo(nSamples * sizeof(T), 256) * 2
+    // - CUB workspace: conservative upper bound of 4 * nSamples * sizeof(T), computed in size_t
+    auto tmpStagingMemorySize = raft::alignTo<size_t>(nSamples * sizeof(T), 256);
+    tmpStagingMemorySize *= 2;
+    size_t cubWorkspaceUpperBound = 4 * sizeof(T) * static_cast<size_t>(nSamples);
+    workspaceSize                 = tmpStagingMemorySize + cubWorkspaceUpperBound;
+    return workspaceSize;
+  }
   // below is a redundant computation - can be avoided
   if (minLabel == std::numeric_limits<T>::max() || maxLabel == std::numeric_limits<T>::max()) {
     getInputClassCardinality<T>(groundTruth, nSamples, stream, minLabel, maxLabel);
diff --git a/cpp/include/raft/stats/detail/cov.cuh b/cpp/include/raft/stats/detail/cov.cuh
index 9416b07dfb..bb8f5b735c 100644
--- a/cpp/include/raft/stats/detail/cov.cuh
+++ b/cpp/include/raft/stats/detail/cov.cuh
@@ -7,6 +7,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/linalg/gemm.cuh>
 #include <raft/stats/mean_center.cuh>
 
@@ -45,6 +46,7 @@ void cov(raft::resources const& handle,
          bool stable,
          cudaStream_t stream)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   if (stable) {
     // since mean operation is assumed to be along a given column, broadcast
     // must be along rows!
diff --git a/cpp/include/raft/stats/detail/dispersion.cuh b/cpp/include/raft/stats/detail/dispersion.cuh
index 17ea419b48..7e86237965 100644
--- a/cpp/include/raft/stats/detail/dispersion.cuh
+++ b/cpp/include/raft/stats/detail/dispersion.cuh
@@ -88,7 +88,8 @@ RAFT_KERNEL dispersionKernel(DataT* result,
  * @return the cluster dispersion value
  */
 template <typename DataT, typename IdxT = int, int TPB = 256>
-DataT dispersion(const DataT* centroids,
+DataT dispersion(bool dry_run,
+                 const DataT* centroids,
                  const IdxT* clusterSizes,
                  DataT* globalCentroid,
                  IdxT nClusters,
@@ -96,17 +97,17 @@ DataT dispersion(const DataT* centroids,
                  IdxT dim,
                  cudaStream_t stream)
 {
+  rmm::device_uvector<DataT> mean(globalCentroid == nullptr ? dim : 0, stream);
+  rmm::device_uvector<DataT> result(1, stream);
+  DataT* mu = globalCentroid;
+  if (globalCentroid == nullptr) { mu = mean.data(); }
+
+  if (dry_run) { return DataT{0}; }
+
   static const int RowsPerThread = 4;
   static const int ColsPerBlk    = 32;
   static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
   dim3 grid(raft::ceildiv(nPoints, (IdxT)RowsPerBlk), raft::ceildiv(dim, (IdxT)ColsPerBlk));
-  rmm::device_uvector<DataT> mean(0, stream);
-  rmm::device_uvector<DataT> result(1, stream);
-  DataT* mu = globalCentroid;
-  if (globalCentroid == nullptr) {
-    mean.resize(dim, stream);
-    mu = mean.data();
-  }
   RAFT_CUDA_TRY(cudaMemsetAsync(mu, 0, sizeof(DataT) * dim, stream));
   RAFT_CUDA_TRY(cudaMemsetAsync(result.data(), 0, sizeof(DataT), stream));
   weightedMeanKernel<DataT, IdxT, TPB, ColsPerBlk>
@@ -122,7 +123,7 @@ DataT dispersion(const DataT* centroids,
   RAFT_CUDA_TRY(cudaGetLastError());
   DataT h_result;
   raft::update_host(&h_result, result.data(), 1, stream);
-  raft::interruptible::synchronize(stream);
+  if (!dry_run) { raft::interruptible::synchronize(stream); }
   return sqrt(h_result);
 }
 
diff --git a/cpp/include/raft/stats/detail/entropy.cuh b/cpp/include/raft/stats/detail/entropy.cuh
index 9e468dda8f..e695c5ee46 100644
--- a/cpp/include/raft/stats/detail/entropy.cuh
+++ b/cpp/include/raft/stats/detail/entropy.cuh
@@ -46,12 +46,17 @@ struct entropyOp {
  * @brief function to calculate the bincounts of number of samples in every label
  *
  * @tparam LabelT: type of the labels
+ * @note When workspace is nullptr only the required workspace size is computed; no kernel runs.
  * @param labels: the pointer to the array containing labels for every data sample
- * @param binCountArray: pointer to the 1D array that contains the count of samples per cluster
+ * @param binCountArray: pointer to the 1D array that contains the count of samples per cluster.
+ *                       Can be nullptr when workspace is nullptr (for size query).
  * @param nRows: number of data samples
  * @param lowerLabelRange
  * @param upperLabelRange
- * @param workspace: device buffer containing workspace memory
+ * @param workspace: device buffer containing workspace memory. Pass nullptr to query workspace
+ * size.
+ * @param workspace_size: [in/out] When workspace is nullptr, this is set to the required workspace
+ * size. When workspace is not nullptr, this must be the size of the workspace.
  * @param stream: the cuda stream where to launch this kernel
  */
 template <typename LabelT>
@@ -60,28 +65,16 @@ void countLabels(const LabelT* labels,
                  int nRows,
                  LabelT lowerLabelRange,
                  LabelT upperLabelRange,
-                 rmm::device_uvector<char>& workspace,
+                 void* workspace,
+                 size_t& workspace_size,
                  cudaStream_t stream)
 {
-  int num_levels            = upperLabelRange - lowerLabelRange + 2;
-  LabelT lower_level        = lowerLabelRange;
-  LabelT upper_level        = upperLabelRange + 1;
-  size_t temp_storage_bytes = 0;
+  int num_levels     = upperLabelRange - lowerLabelRange + 2;
+  LabelT lower_level = lowerLabelRange;
+  LabelT upper_level = upperLabelRange + 1;
 
-  RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(nullptr,
-                                                    temp_storage_bytes,
-                                                    labels,
-                                                    binCountArray,
-                                                    num_levels,
-                                                    lower_level,
-                                                    upper_level,
-                                                    nRows,
-                                                    stream));
-
-  workspace.resize(temp_storage_bytes, stream);
-
-  RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(workspace.data(),
-                                                    temp_storage_bytes,
+  RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(workspace,
+                                                    workspace_size,
                                                     labels,
                                                     binCountArray,
                                                     num_levels,
@@ -95,6 +88,7 @@ void countLabels(const LabelT* labels,
  * @brief Function to calculate entropy
  * <a href="https://en.wikipedia.org/wiki/Entropy_(information_theory)">more info on entropy</a>
  *
+ * @param dry_run: whether to run in dry-run mode
  * @param clusterArray: the array of classes of type T
  * @param size: the size of the data points of type int
  * @param lowerLabelRange: the lower bound of the range of labels
@@ -103,7 +97,8 @@ void countLabels(const LabelT* labels,
  * @return the entropy score
  */
 template <typename T>
-double entropy(const T* clusterArray,
+double entropy(bool dry_run,
+               const T* clusterArray,
                const int size,
                const T lowerLabelRange,
                const T upperLabelRange,
@@ -115,15 +110,36 @@ double entropy(const T* clusterArray,
 
   // declaring, allocating and initializing memory for bincount array and entropy values
   rmm::device_uvector<double> prob(numUniqueClasses, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(prob.data(), 0, numUniqueClasses * sizeof(double), stream));
+  if (!dry_run) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(prob.data(), 0, numUniqueClasses * sizeof(double), stream));
+  }
   rmm::device_scalar<double> d_entropy(stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_entropy.data(), 0, sizeof(double), stream));
-
+  if (!dry_run) { RAFT_CUDA_TRY(cudaMemsetAsync(d_entropy.data(), 0, sizeof(double), stream)); }
+
+  // Query workspace size for countLabels (can run in dry-run)
+  size_t countLabels_ws_size = 0;
+  countLabels(clusterArray,
+              nullptr,
+              size,
+              lowerLabelRange,
+              upperLabelRange,
+              nullptr,
+              countLabels_ws_size,
+              stream);
   // workspace allocation
-  rmm::device_uvector<char> workspace(1, stream);
+  rmm::device_uvector<char> workspace(countLabels_ws_size, stream);
+
+  if (dry_run) { return 0.0; }
 
   // calculating the bincounts and populating the prob array
-  countLabels(clusterArray, prob.data(), size, lowerLabelRange, upperLabelRange, workspace, stream);
+  countLabels(clusterArray,
+              prob.data(),
+              size,
+              lowerLabelRange,
+              upperLabelRange,
+              workspace.data(),
+              countLabels_ws_size,
+              stream);
 
   // scalar dividing by size
   raft::linalg::divideScalar<double>(
diff --git a/cpp/include/raft/stats/detail/homogeneity_score.cuh b/cpp/include/raft/stats/detail/homogeneity_score.cuh
index 79aa78417b..3392528089 100644
--- a/cpp/include/raft/stats/detail/homogeneity_score.cuh
+++ b/cpp/include/raft/stats/detail/homogeneity_score.cuh
@@ -22,6 +22,7 @@ namespace detail {
  * @brief Function to calculate the homogeneity score between two clusters
  * <a href="https://en.wikipedia.org/wiki/Homogeneity_(statistics)">more info on mutual
  * information</a>
+ * @param dry_run: whether to run in dry-run mode
  * @param truthClusterArray: the array of truth classes of type T
  * @param predClusterArray: the array of predicted classes of type T
  * @param size: the size of the data points of type int
@@ -30,7 +31,8 @@ namespace detail {
  * @param stream: the cudaStream object
  */
 template <typename T>
-double homogeneity_score(const T* truthClusterArray,
+double homogeneity_score(bool dry_run,
+                         const T* truthClusterArray,
                          const T* predClusterArray,
                          int size,
                          T lowerLabelRange,
@@ -41,10 +43,10 @@ double homogeneity_score(const T* truthClusterArray,
 
   double computedMI, computedEntropy;
 
-  computedMI = raft::stats::mutual_info_score(
-    truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
+  computedMI = mutual_info_score(
+    dry_run, truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
   computedEntropy =
-    raft::stats::entropy(truthClusterArray, size, lowerLabelRange, upperLabelRange, stream);
+    entropy(dry_run, truthClusterArray, size, lowerLabelRange, upperLabelRange, stream);
 
   double homogeneity;
 
diff --git a/cpp/include/raft/stats/detail/kl_divergence.cuh b/cpp/include/raft/stats/detail/kl_divergence.cuh
index e32a2b219a..3dbfa33065 100644
--- a/cpp/include/raft/stats/detail/kl_divergence.cuh
+++ b/cpp/include/raft/stats/detail/kl_divergence.cuh
@@ -48,15 +48,21 @@ struct KLDOp {
  * Divergence</a>
  *
  * @tparam DataT: Data type of the input array
+ * @param dry_run: whether to run in dry-run mode (skip CUDA work)
  * @param modelPDF: the model array of probability density functions of type DataT
  * @param candidatePDF: the candidate array of probability density functions of type DataT
  * @param size: the size of the data points of type int
  * @param stream: the cudaStream object
  */
 template <typename DataT>
-DataT kl_divergence(const DataT* modelPDF, const DataT* candidatePDF, int size, cudaStream_t stream)
+DataT kl_divergence(
+  bool dry_run, const DataT* modelPDF, const DataT* candidatePDF, int size, cudaStream_t stream)
 {
   rmm::device_scalar<DataT> d_KLDVal(stream);
+
+  if (dry_run) { return DataT{0}; }
+
+  // Note: No allocations below - only CUDA operations on pre-allocated memory
   RAFT_CUDA_TRY(cudaMemsetAsync(d_KLDVal.data(), 0, sizeof(DataT), stream));
 
   raft::linalg::mapThenSumReduce<DataT, KLDOp<DataT>, size_t, 256, const DataT*>(
diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh
index 05e17d6d8a..6b33be5e6e 100644
--- a/cpp/include/raft/stats/detail/mean.cuh
+++ b/cpp/include/raft/stats/detail/mean.cuh
@@ -15,36 +15,38 @@ namespace stats {
 namespace detail {
 
 template <bool rowMajor, typename Type, typename IdxType = int>
-void mean(Type* mu, const Type* data, IdxType D, IdxType N, cudaStream_t stream)
+void mean(bool dry_run, Type* mu, const Type* data, IdxType D, IdxType N, cudaStream_t stream)
 {
   Type ratio = Type(1) / Type(N);
-  raft::linalg::reduce<rowMajor, false>(mu,
-                                        data,
-                                        D,
-                                        N,
-                                        Type(0),
-                                        stream,
-                                        false,
-                                        raft::identity_op(),
-                                        raft::add_op(),
-                                        raft::mul_const_op<Type>(ratio));
+  raft::linalg::detail::reduce<rowMajor, false>(dry_run,
+                                                mu,
+                                                data,
+                                                D,
+                                                N,
+                                                Type(0),
+                                                stream,
+                                                false,
+                                                raft::identity_op(),
+                                                raft::add_op(),
+                                                raft::mul_const_op<Type>(ratio));
 }
 
 template <bool rowMajor, typename Type, typename IdxType = int>
 [[deprecated]] void mean(
-  Type* mu, const Type* data, IdxType D, IdxType N, bool sample, cudaStream_t stream)
+  bool dry_run, Type* mu, const Type* data, IdxType D, IdxType N, bool sample, cudaStream_t stream)
 {
   Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N));
-  raft::linalg::reduce<rowMajor, false>(mu,
-                                        data,
-                                        D,
-                                        N,
-                                        Type(0),
-                                        stream,
-                                        false,
-                                        raft::identity_op(),
-                                        raft::add_op(),
-                                        raft::mul_const_op<Type>(ratio));
+  raft::linalg::detail::reduce<rowMajor, false>(dry_run,
+                                                mu,
+                                                data,
+                                                D,
+                                                N,
+                                                Type(0),
+                                                stream,
+                                                false,
+                                                raft::identity_op(),
+                                                raft::add_op(),
+                                                raft::mul_const_op<Type>(ratio));
 }
 
 }  // namespace detail
diff --git a/cpp/include/raft/stats/detail/mean_center.cuh b/cpp/include/raft/stats/detail/mean_center.cuh
index a4206ac414..9b43b21afa 100644
--- a/cpp/include/raft/stats/detail/mean_center.cuh
+++ b/cpp/include/raft/stats/detail/mean_center.cuh
@@ -16,45 +16,58 @@ namespace detail {
 
 /**
  * @brief Center the input matrix wrt its mean
+ * @tparam rowMajor whether input is row or col major
+ * @tparam bcastAlongRows whether to broadcast vector along rows or columns
  * @tparam Type the data type
  * @tparam IdxType Integer type used to for addressing
  * @tparam TPB threads per block of the cuda kernel launched
+ * @param dry_run whether to run in dry-run mode (skip CUDA work)
  * @param out the output mean-centered matrix
  * @param data input matrix
  * @param mu the mean vector
  * @param D number of columns of data
  * @param N number of rows of data
- * @param rowMajor whether input is row or col major
- * @param bcastAlongRows whether to broadcast vector along rows or columns
  * @param stream cuda stream where to launch work
  */
 template <bool rowMajor, bool bcastAlongRows, typename Type, typename IdxType = int, int TPB = 256>
-void meanCenter(
-  Type* out, const Type* data, const Type* mu, IdxType D, IdxType N, cudaStream_t stream)
+void meanCenter(bool dry_run,
+                Type* out,
+                const Type* data,
+                const Type* mu,
+                IdxType D,
+                IdxType N,
+                cudaStream_t stream)
 {
-  raft::linalg::matrixVectorOp<rowMajor, bcastAlongRows>(
-    out, data, mu, D, N, raft::sub_op{}, stream);
+  raft::linalg::detail::matrixVectorOp<rowMajor, bcastAlongRows>(
+    dry_run, out, data, mu, D, N, raft::sub_op{}, stream);
 }
 
 /**
  * @brief Add the input matrix wrt its mean
+ * @tparam rowMajor whether input is row or col major
+ * @tparam bcastAlongRows whether to broadcast vector along rows or columns
  * @tparam Type the data type
  * @tparam IdxType Integer type used to for addressing
  * @tparam TPB threads per block of the cuda kernel launched
+ * @param dry_run whether to run in dry-run mode (skip CUDA work)
  * @param out the output mean-added matrix
  * @param data input matrix
  * @param mu the mean vector
  * @param D number of columns of data
  * @param N number of rows of data
- * @param rowMajor whether input is row or col major
- * @param bcastAlongRows whether to broadcast vector along rows or columns
  * @param stream cuda stream where to launch work
  */
 template <bool rowMajor, bool bcastAlongRows, typename Type, typename IdxType = int, int TPB = 256>
-void meanAdd(Type* out, const Type* data, const Type* mu, IdxType D, IdxType N, cudaStream_t stream)
+void meanAdd(bool dry_run,
+             Type* out,
+             const Type* data,
+             const Type* mu,
+             IdxType D,
+             IdxType N,
+             cudaStream_t stream)
 {
-  raft::linalg::matrixVectorOp<rowMajor, bcastAlongRows>(
-    out, data, mu, D, N, raft::add_op{}, stream);
+  raft::linalg::detail::matrixVectorOp<rowMajor, bcastAlongRows>(
+    dry_run, out, data, mu, D, N, raft::add_op{}, stream);
 }
 
 };  // end namespace detail
diff --git a/cpp/include/raft/stats/detail/meanvar.cuh b/cpp/include/raft/stats/detail/meanvar.cuh
index 2ca64536b2..90be7357c5 100644
--- a/cpp/include/raft/stats/detail/meanvar.cuh
+++ b/cpp/include/raft/stats/detail/meanvar.cuh
@@ -184,8 +184,15 @@ RAFT_KERNEL meanvar_kernel_fill(T* mean, T* var, const mean_var<T>* aggr, I D, b
 }
 
 template <typename T, typename I = int, int BlockSize = 256>
-void meanvar(
-  T* mean, T* var, const T* data, I D, I N, bool sample, bool rowMajor, cudaStream_t stream)
+void meanvar(bool dry_run,
+             T* mean,
+             T* var,
+             const T* data,
+             I D,
+             I N,
+             bool sample,
+             bool rowMajor,
+             cudaStream_t stream)
 {
   if (rowMajor) {
     static_assert(BlockSize >= WarpSize, "Block size must be not smaller than the warp size.");
@@ -202,20 +209,25 @@ void meanvar(
     // Global memory: one mean_var<T> for each column
     //                one lock per all blocks working on the same set of columns
     rmm::device_buffer buf(sizeof(mean_var<T>) * D + sizeof(int) * gs.x, stream);
-    RAFT_CUDA_TRY(cudaMemsetAsync(buf.data(), 0, buf.size(), stream));
-    mean_var<T>* mvs = static_cast<mean_var<T>*>(buf.data());
-    int* locks       = static_cast<int*>(static_cast<void*>(mvs + D));
-
-    const uint64_t len = uint64_t(D) * uint64_t(N);
-    ASSERT(len <= uint64_t(std::numeric_limits<I>::max()), "N * D does not fit the indexing type");
-    meanvar_kernel_rowmajor<T, I, BlockSize><<<gs, bs, 0, stream>>>(data, mvs, locks, len, D);
-    meanvar_kernel_fill<T, I>
-      <<<raft::ceildiv<I>(D, BlockSize), BlockSize, 0, stream>>>(mean, var, mvs, D, sample);
+    if (!dry_run) {
+      RAFT_CUDA_TRY(cudaMemsetAsync(buf.data(), 0, buf.size(), stream));
+      mean_var<T>* mvs = static_cast<mean_var<T>*>(buf.data());
+      int* locks       = static_cast<int*>(static_cast<void*>(mvs + D));
+
+      const uint64_t len = uint64_t(D) * uint64_t(N);
+      ASSERT(len <= uint64_t(std::numeric_limits<I>::max()),
+             "N * D does not fit the indexing type");
+      meanvar_kernel_rowmajor<T, I, BlockSize><<<gs, bs, 0, stream>>>(data, mvs, locks, len, D);
+      meanvar_kernel_fill<T, I>
+        <<<raft::ceildiv<I>(D, BlockSize), BlockSize, 0, stream>>>(mean, var, mvs, D, sample);
+    }
   } else {
-    meanvar_kernel_colmajor<T, I, BlockSize>
-      <<<D, BlockSize, 0, stream>>>(mean, var, data, D, N, sample);
+    if (!dry_run) {
+      meanvar_kernel_colmajor<T, I, BlockSize>
+        <<<D, BlockSize, 0, stream>>>(mean, var, data, D, N, sample);
+    }
   }
-  RAFT_CHECK_CUDA(stream);
+  if (!dry_run) { RAFT_CHECK_CUDA(stream); }
 }
 
 };  // namespace stats::detail
diff --git a/cpp/include/raft/stats/detail/mutual_info_score.cuh b/cpp/include/raft/stats/detail/mutual_info_score.cuh
index e4c72b36fd..fe20385352 100644
--- a/cpp/include/raft/stats/detail/mutual_info_score.cuh
+++ b/cpp/include/raft/stats/detail/mutual_info_score.cuh
@@ -83,6 +83,7 @@ RAFT_KERNEL mutual_info_kernel(const int* dContingencyMatrix,
 /**
  * @brief Function to calculate the mutual information between two clusters
  * <a href="https://en.wikipedia.org/wiki/Mutual_information">more info on mutual information</a>
+ * @param dry_run: whether to run in dry-run mode
  * @param firstClusterArray: the array of classes of type T
  * @param secondClusterArray: the array of classes of type T
  * @param size: the size of the data points of type int
@@ -91,7 +92,8 @@ RAFT_KERNEL mutual_info_kernel(const int* dContingencyMatrix,
  * @param stream: the cudaStream object
  */
 template <typename T>
-double mutual_info_score(const T* firstClusterArray,
+double mutual_info_score(bool dry_run,
+                         const T* firstClusterArray,
                          const T* secondClusterArray,
                          int size,
                          T lowerLabelRange,
@@ -102,24 +104,28 @@ double mutual_info_score(const T* firstClusterArray,
 
   // declaring, allocating and initializing memory for the contingency marix
   rmm::device_uvector<int> dContingencyMatrix(numUniqueClasses * numUniqueClasses, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(
-    dContingencyMatrix.data(), 0, numUniqueClasses * numUniqueClasses * sizeof(int), stream));
+  if (!dry_run) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(
+      dContingencyMatrix.data(), 0, numUniqueClasses * numUniqueClasses * sizeof(int), stream));
+  }
 
   // workspace allocation
-  size_t workspaceSz = raft::stats::getContingencyMatrixWorkspaceSize(
-    size, firstClusterArray, stream, lowerLabelRange, upperLabelRange);
+  size_t workspaceSz = raft::stats::detail::getContingencyMatrixWorkspaceSize(
+    dry_run, size, firstClusterArray, stream, lowerLabelRange, upperLabelRange);
   rmm::device_uvector<char> pWorkspace(workspaceSz, stream);
 
   // calculating the contingency matrix
-  raft::stats::contingencyMatrix(firstClusterArray,
-                                 secondClusterArray,
-                                 (int)size,
-                                 (int*)dContingencyMatrix.data(),
-                                 stream,
-                                 (void*)pWorkspace.data(),
-                                 workspaceSz,
-                                 lowerLabelRange,
-                                 upperLabelRange);
+  if (!dry_run) {
+    raft::stats::contingencyMatrix(firstClusterArray,
+                                   secondClusterArray,
+                                   (int)size,
+                                   (int*)dContingencyMatrix.data(),
+                                   stream,
+                                   (void*)pWorkspace.data(),
+                                   workspaceSz,
+                                   lowerLabelRange,
+                                   upperLabelRange);
+  }
 
   // creating device buffers for all the parameters involved in ARI calculation
   // device variables
@@ -131,17 +137,21 @@ double mutual_info_score(const T* firstClusterArray,
   double h_MI;
 
   // initializing device memory
-  RAFT_CUDA_TRY(cudaMemsetAsync(a.data(), 0, numUniqueClasses * sizeof(int), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(b.data(), 0, numUniqueClasses * sizeof(int), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_MI.data(), 0, sizeof(double), stream));
+  if (!dry_run) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(a.data(), 0, numUniqueClasses * sizeof(int), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(b.data(), 0, numUniqueClasses * sizeof(int), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(d_MI.data(), 0, sizeof(double), stream));
+  }
 
   // calculating the row-wise sums
-  raft::linalg::reduce<true, true, int, int, int>(
-    a.data(), dContingencyMatrix.data(), numUniqueClasses, numUniqueClasses, 0, stream);
+  raft::linalg::detail::reduce<true, true, int, int, int>(
+    dry_run, a.data(), dContingencyMatrix.data(), numUniqueClasses, numUniqueClasses, 0, stream);
 
   // calculating the column-wise sums
-  raft::linalg::reduce<true, false, int, int, int>(
-    b.data(), dContingencyMatrix.data(), numUniqueClasses, numUniqueClasses, 0, stream);
+  raft::linalg::detail::reduce<true, false, int, int, int>(
+    dry_run, b.data(), dContingencyMatrix.data(), numUniqueClasses, numUniqueClasses, 0, stream);
+
+  if (dry_run) { return 0.0; }
 
   // kernel configuration
   static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16;
diff --git a/cpp/include/raft/stats/detail/neighborhood_recall.cuh b/cpp/include/raft/stats/detail/neighborhood_recall.cuh
index 347f77c094..4fd0934a6b 100644
--- a/cpp/include/raft/stats/detail/neighborhood_recall.cuh
+++ b/cpp/include/raft/stats/detail/neighborhood_recall.cuh
@@ -13,6 +13,7 @@
 #include <raft/core/mdspan_types.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 #include <cub/block/block_reduce.cuh>
@@ -95,6 +96,7 @@ void neighborhood_recall(
   raft::device_scalar_view<ScalarType> recall_score,
   DistanceValueType const eps)
 {
+  if (resource::get_dry_run_flag(res)) { return; }
   // One warp per row, launch a warp-width block per-row kernel
   auto constexpr kThreadsPerBlock = 32;
   auto const num_blocks           = indices.extent(0);
diff --git a/cpp/include/raft/stats/detail/rand_index.cuh b/cpp/include/raft/stats/detail/rand_index.cuh
index d3c9353926..fd20c8c6ef 100644
--- a/cpp/include/raft/stats/detail/rand_index.cuh
+++ b/cpp/include/raft/stats/detail/rand_index.cuh
@@ -112,13 +112,15 @@ RAFT_KERNEL computeTheNumerator(
 /**
  * @brief Function to calculate RandIndex
  * <a href="https://en.wikipedia.org/wiki/Rand_index">more info on rand index</a>
+ * @param dry_run: whether to run in dry-run mode
  * @param firstClusterArray: the array of classes of type T
  * @param secondClusterArray: the array of classes of type T
  * @param size: the size of the data points of type uint64_t
  * @param stream: the cudaStream object
  */
 template <typename T>
-double compute_rand_index(const T* firstClusterArray,
+double compute_rand_index(bool dry_run,
+                          const T* firstClusterArray,
                           const T* secondClusterArray,
                           uint64_t size,
                           cudaStream_t stream)
@@ -130,6 +132,7 @@ double compute_rand_index(const T* firstClusterArray,
 
   // allocating and initializing memory for a and b in the GPU
   rmm::device_uvector<uint64_t> arr_buf(2, stream);
+  if (dry_run) { return 0.0; }
   RAFT_CUDA_TRY(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream));
 
   // kernel configuration
diff --git a/cpp/include/raft/stats/detail/scores.cuh b/cpp/include/raft/stats/detail/scores.cuh
index 062a47f2e4..087a15f065 100644
--- a/cpp/include/raft/stats/detail/scores.cuh
+++ b/cpp/include/raft/stats/detail/scores.cuh
@@ -37,6 +37,7 @@ namespace detail {
  * in a linear regression model. The larger the R-squared value, the
  * more variability is explained by the linear regression model.
  *
+ * @param dry_run: whether to run in dry-run mode (track allocations but skip CUDA work)
  * @param y: Array of ground-truth response variables
  * @param y_hat: Array of predicted response variables
  * @param n: Number of elements in y and y_hat
@@ -44,21 +45,21 @@ namespace detail {
  * @return: The R-squared value.
  */
 template <typename math_t>
-math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream)
+math_t r2_score(bool dry_run, math_t* y, math_t* y_hat, int n, cudaStream_t stream)
 {
   rmm::device_scalar<math_t> y_bar(stream);
+  rmm::device_uvector<math_t> sse_arr(n, stream);
+  rmm::device_uvector<math_t> ssto_arr(n, stream);
 
-  raft::stats::mean<false>(y_bar.data(), y, 1, n, stream);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  if (dry_run) { return math_t{0}; }
 
-  rmm::device_uvector<math_t> sse_arr(n, stream);
+  raft::stats::detail::mean<false>(dry_run, y_bar.data(), y, 1, n, stream);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   raft::linalg::eltwiseSub(sse_arr.data(), y, y_hat, n, stream);
   raft::linalg::powerScalar(sse_arr.data(), sse_arr.data(), math_t(2.0), n, stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
-  rmm::device_uvector<math_t> ssto_arr(n, stream);
-
   raft::linalg::subtractDevScalar(ssto_arr.data(), y, y_bar.data(), n, stream);
   raft::linalg::powerScalar(ssto_arr.data(), ssto_arr.data(), math_t(2.0), n, stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
@@ -75,6 +76,7 @@ math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream)
 /**
  * @brief Compute accuracy of predictions. Useful for classification.
  * @tparam math_t: data type for predictions (e.g., int for classification)
+ * @param[in] dry_run: whether to run in dry-run mode (track allocations but skip CUDA work)
  * @param[in] predictions: array of predictions (GPU pointer).
  * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer).
  * @param[in] n: number of elements in each of predictions, ref_predictions.
@@ -82,15 +84,18 @@ math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream)
  * @return: Accuracy score in [0, 1]; higher is better.
  */
 template <typename math_t>
-float accuracy_score(const math_t* predictions,
+float accuracy_score(bool dry_run,
+                     const math_t* predictions,
                      const math_t* ref_predictions,
                      int n,
                      cudaStream_t stream)
 {
-  unsigned long long correctly_predicted = 0ULL;
   rmm::device_uvector<math_t> diffs_array(n, stream);
 
+  if (dry_run) { return 0.0f; }
+
   // TODO could write a kernel instead
+  unsigned long long correctly_predicted = 0ULL;
   raft::linalg::eltwiseSub(diffs_array.data(), predictions, ref_predictions, n, stream);
   RAFT_CUDA_TRY(cudaGetLastError());
   correctly_predicted =
@@ -132,6 +137,7 @@ RAFT_KERNEL reg_metrics_kernel(
 /**
  * @brief Compute regression metrics mean absolute error, mean squared error, median absolute error
  * @tparam T: data type for predictions (e.g., float or double for regression).
+ * @param[in] dry_run: whether to run in dry-run mode (track allocations but skip CUDA work)
  * @param[in] predictions: array of predictions (GPU pointer).
  * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer).
  * @param[in] n: number of elements in each of predictions, ref_predictions. Should be > 0.
@@ -144,7 +150,8 @@ RAFT_KERNEL reg_metrics_kernel(
  * ref_predictions[i]| for i in [0, n).
  */
 template <typename T>
-void regression_metrics(const T* predictions,
+void regression_metrics(bool dry_run,
+                        const T* predictions,
                         const T* ref_predictions,
                         int n,
                         cudaStream_t stream,
@@ -152,15 +159,30 @@ void regression_metrics(const T* predictions,
                         double& mean_squared_error,
                         double& median_abs_error)
 {
+  int array_size = n * sizeof(double);
+  rmm::device_uvector<double> abs_diffs_array(array_size, stream);
+  rmm::device_uvector<double> sorted_abs_diffs(array_size, stream);
+  rmm::device_uvector<double> tmp_sums(2 * sizeof(double), stream);
+
+  // CUB workspace size query (safe even in dry-run — no kernel launch)
+  size_t temp_storage_bytes = 0;
+  RAFT_CUDA_TRY(cub::DeviceRadixSort::SortKeys((void*)nullptr,
+                                               temp_storage_bytes,
+                                               abs_diffs_array.data(),
+                                               sorted_abs_diffs.data(),
+                                               n,
+                                               0,
+                                               8 * sizeof(double),
+                                               stream));
+  rmm::device_uvector<char> temp_storage_v(temp_storage_bytes, stream);
+
   std::vector<double> mean_errors(2);
   std::vector<double> h_sorted_abs_diffs(n);
   int thread_cnt = 256;
   int block_cnt  = raft::ceildiv(n, thread_cnt);
 
-  int array_size = n * sizeof(double);
-  rmm::device_uvector<double> abs_diffs_array(array_size, stream);
-  rmm::device_uvector<double> sorted_abs_diffs(array_size, stream);
-  rmm::device_uvector<double> tmp_sums(2 * sizeof(double), stream);
+  if (dry_run) { return; }
+
   RAFT_CUDA_TRY(cudaMemsetAsync(tmp_sums.data(), 0, 2 * sizeof(double), stream));
 
   reg_metrics_kernel<T><<<block_cnt, thread_cnt, 0, stream>>>(
@@ -173,18 +195,7 @@ void regression_metrics(const T* predictions,
   mean_squared_error = mean_errors[1] / n;
 
   // Compute median error. Sort diffs_array and pick median value
-  char* temp_storage = nullptr;
-  size_t temp_storage_bytes;
-  RAFT_CUDA_TRY(cub::DeviceRadixSort::SortKeys((void*)temp_storage,
-                                               temp_storage_bytes,
-                                               abs_diffs_array.data(),
-                                               sorted_abs_diffs.data(),
-                                               n,
-                                               0,
-                                               8 * sizeof(double),
-                                               stream));
-  rmm::device_uvector<char> temp_storage_v(temp_storage_bytes, stream);
-  temp_storage = temp_storage_v.data();
+  char* temp_storage = temp_storage_v.data();
   RAFT_CUDA_TRY(cub::DeviceRadixSort::SortKeys((void*)temp_storage,
                                                temp_storage_bytes,
                                                abs_diffs_array.data(),
diff --git a/cpp/include/raft/stats/detail/silhouette_score.cuh b/cpp/include/raft/stats/detail/silhouette_score.cuh
index ded61f93c9..ff2ed405d0 100644
--- a/cpp/include/raft/stats/detail/silhouette_score.cuh
+++ b/cpp/include/raft/stats/detail/silhouette_score.cuh
@@ -8,6 +8,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/add.cuh>
@@ -91,10 +92,13 @@ RAFT_KERNEL populateAKernel(DataT* sampleToClusterSumOfDistances,
  * @tparam LabelT: type of the labels
  * @param labels: the pointer to the array containing labels for every data sample (1 x nRows)
  * @param binCountArray: pointer to the 1D array that contains the count of samples per cluster (1 x
- * nLabels)
+ * nLabels). Can be nullptr when workspace is nullptr (for size query).
  * @param nRows: number of data samples
  * @param nUniqueLabels: number of Labels
- * @param workspace: device buffer containing workspace memory
+ * @param workspace: device buffer containing workspace memory. Pass nullptr to query workspace
+ * size.
+ * @param workspace_size: [in/out] When workspace is nullptr, this is set to the required workspace
+ * size. When workspace is not nullptr, this must be the size of the workspace.
  * @param stream: the cuda stream where to launch this kernel
  */
 template <typename DataT, typename LabelT>
@@ -102,30 +106,16 @@ void countLabels(const LabelT* labels,
                  DataT* binCountArray,
                  int nRows,
                  int nUniqueLabels,
-                 rmm::device_uvector<char>& workspace,
+                 void* workspace,
+                 size_t& workspace_size,
                  cudaStream_t stream)
 {
-  int num_levels            = nUniqueLabels + 1;
-  LabelT lower_level        = 0;
-  LabelT upper_level        = nUniqueLabels;
-  size_t temp_storage_bytes = 0;
-
-  rmm::device_uvector<int> countArray(nUniqueLabels, stream);
-
-  RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(nullptr,
-                                                    temp_storage_bytes,
-                                                    labels,
-                                                    binCountArray,
-                                                    num_levels,
-                                                    lower_level,
-                                                    upper_level,
-                                                    nRows,
-                                                    stream));
-
-  workspace.resize(temp_storage_bytes, stream);
+  int num_levels     = nUniqueLabels + 1;
+  LabelT lower_level = 0;
+  LabelT upper_level = nUniqueLabels;
 
-  RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(workspace.data(),
-                                                    temp_storage_bytes,
+  RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(workspace,
+                                                    workspace_size,
                                                     labels,
                                                     binCountArray,
                                                     num_levels,
@@ -196,34 +186,56 @@ DataT silhouette_score(
   cudaStream_t stream,
   raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded)
 {
+  bool is_dry_run = resource::get_dry_run_flag(handle);
   ASSERT(nLabels >= 2 && nLabels <= (nRows - 1),
          "silhouette Score not defined for the given number of labels!");
 
   // compute the distance matrix
   rmm::device_uvector<DataT> distanceMatrix(nRows * nRows, stream);
-  rmm::device_uvector<char> workspace(1, stream);
 
-  raft::distance::pairwise_distance(
-    handle, X_in, X_in, distanceMatrix.data(), nRows, nRows, nCols, metric);
+  // Query workspace size for countLabels (can run in dry-run)
+  size_t countLabels_ws_size = 0;
+  countLabels<DataT, LabelT>(labels, nullptr, nRows, nLabels, nullptr, countLabels_ws_size, stream);
+  rmm::device_uvector<char> workspace(countLabels_ws_size, stream);
 
   // deciding on the array of silhouette scores for each dataPoint
-  rmm::device_uvector<DataT> silhouette_scoreSamples(0, stream);
+  rmm::device_uvector<DataT> silhouette_scoreSamples(
+    silhouette_scorePerSample == nullptr ? nRows : 0, stream);
   DataT* perSampleSilScore = nullptr;
   if (silhouette_scorePerSample == nullptr) {
-    silhouette_scoreSamples.resize(nRows, stream);
     perSampleSilScore = silhouette_scoreSamples.data();
   } else {
     perSampleSilScore = silhouette_scorePerSample;
   }
-  RAFT_CUDA_TRY(cudaMemsetAsync(perSampleSilScore, 0, nRows * sizeof(DataT), stream));
 
   // getting the sample count per cluster
   rmm::device_uvector<DataT> binCountArray(nLabels, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(binCountArray.data(), 0, nLabels * sizeof(DataT), stream));
-  countLabels(labels, binCountArray.data(), nRows, nLabels, workspace, stream);
 
   // calculating the sample-cluster-distance-sum-array
   rmm::device_uvector<DataT> sampleToClusterSumOfDistances(nRows * nLabels, stream);
+
+  // creating the a array and b array
+  rmm::device_uvector<DataT> d_aArray(nRows, stream);
+  rmm::device_uvector<DataT> d_bArray(nRows, stream);
+
+  // elementwise dividing by bincounts
+  rmm::device_uvector<DataT> averageDistanceBetweenSampleAndCluster(nRows * nLabels, stream);
+
+  // calculating the sum of all the silhouette scores
+  rmm::device_scalar<DataT> d_avgSilhouetteScore(stream);
+
+  if (is_dry_run) { return DataT{0}; }
+
+  raft::distance::pairwise_distance(
+    handle, X_in, X_in, distanceMatrix.data(), nRows, nRows, nCols, metric);
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(perSampleSilScore, 0, nRows * sizeof(DataT), stream));
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(binCountArray.data(), 0, nLabels * sizeof(DataT), stream));
+  size_t workspace_size = workspace.size();
+  countLabels<DataT, LabelT>(
+    labels, binCountArray.data(), nRows, nLabels, workspace.data(), workspace_size, stream);
+
   RAFT_CUDA_TRY(cudaMemsetAsync(
     sampleToClusterSumOfDistances.data(), 0, nRows * nLabels * sizeof(DataT), stream));
   raft::linalg::reduce_cols_by_key(distanceMatrix.data(),
@@ -234,18 +246,13 @@ DataT silhouette_score(
                                    nLabels,
                                    stream);
 
-  // creating the a array and b array
-  rmm::device_uvector<DataT> d_aArray(nRows, stream);
-  rmm::device_uvector<DataT> d_bArray(nRows, stream);
   RAFT_CUDA_TRY(cudaMemsetAsync(d_aArray.data(), 0, nRows * sizeof(DataT), stream));
   RAFT_CUDA_TRY(cudaMemsetAsync(d_bArray.data(), 0, nRows * sizeof(DataT), stream));
 
   // kernel that populates the d_aArray
-  // kernel configuration
   dim3 numThreadsPerBlock(32, 1, 1);
   dim3 numBlocks(raft::ceildiv<int>(nRows, numThreadsPerBlock.x), 1, 1);
 
-  // calling the kernel
   populateAKernel<<<numBlocks, numThreadsPerBlock, 0, stream>>>(
     sampleToClusterSumOfDistances.data(),
     binCountArray.data(),
@@ -255,19 +262,18 @@ DataT silhouette_score(
     nLabels,
     std::numeric_limits<DataT>::max());
 
-  // elementwise dividing by bincounts
-  rmm::device_uvector<DataT> averageDistanceBetweenSampleAndCluster(nRows * nLabels, stream);
   RAFT_CUDA_TRY(cudaMemsetAsync(
     averageDistanceBetweenSampleAndCluster.data(), 0, nRows * nLabels * sizeof(DataT), stream));
 
-  raft::linalg::matrixVectorOp<true, true>(averageDistanceBetweenSampleAndCluster.data(),
-                                           sampleToClusterSumOfDistances.data(),
-                                           binCountArray.data(),
-                                           binCountArray.data(),
-                                           nLabels,
-                                           nRows,
-                                           DivOp<DataT>(),
-                                           stream);
+  raft::linalg::matrix_vector_op<raft::Apply::ALONG_ROWS>(
+    handle,
+    raft::make_device_matrix_view<const DataT, int, raft::row_major>(
+      sampleToClusterSumOfDistances.data(), nRows, nLabels),
+    raft::make_device_vector_view<const DataT, int>(binCountArray.data(), nLabels),
+    raft::make_device_vector_view<const DataT, int>(binCountArray.data(), nLabels),
+    raft::make_device_matrix_view<DataT, int, raft::row_major>(
+      averageDistanceBetweenSampleAndCluster.data(), nRows, nLabels),
+    DivOp<DataT>());
 
   // calculating row-wise minimum
   raft::linalg::reduce<DataT, DataT, int, raft::identity_op, raft::min_op>(
@@ -287,8 +293,6 @@ DataT silhouette_score(
   raft::linalg::binaryOp<DataT, SilOp<DataT>>(
     perSampleSilScore, d_aArray.data(), d_bArray.data(), nRows, SilOp<DataT>(), stream);
 
-  // calculating the sum of all the silhouette score
-  rmm::device_scalar<DataT> d_avgSilhouetteScore(stream);
   RAFT_CUDA_TRY(cudaMemsetAsync(d_avgSilhouetteScore.data(), 0, sizeof(DataT), stream));
 
   raft::linalg::mapThenSumReduce<double, raft::identity_op>(d_avgSilhouetteScore.data(),
diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh
index e265d05082..295a20f02d 100644
--- a/cpp/include/raft/stats/detail/stddev.cuh
+++ b/cpp/include/raft/stats/detail/stddev.cuh
@@ -23,6 +23,7 @@ namespace detail {
  * @tparam rowMajor whether the input data is row or col major
  * @tparam Type the data type
  * @tparam IdxType Integer type used to for addressing
+ * @param dry_run whether to run in dry-run mode (skip CUDA work)
  * @param std the output stddev vector
  * @param data the input matrix
  * @param mu the mean vector
@@ -31,11 +32,11 @@ namespace detail {
  * @param sample whether to evaluate sample stddev or not. In other words,
  * whether
  *  to normalize the output using N-1 or N, for true or false, respectively
- * @param rowMajor whether the input data is row or col major
  * @param stream cuda stream where to launch work
  */
 template <bool rowMajor, typename Type, typename IdxType = int>
-void stddev(Type* std,
+void stddev(bool dry_run,
+            Type* std,
             const Type* data,
             const Type* mu,
             IdxType D,
@@ -43,8 +44,11 @@ void stddev(Type* std,
             bool sample,
             cudaStream_t stream)
 {
-  raft::linalg::reduce<rowMajor, false>(
-    std, data, D, N, Type(0), stream, false, [mu] __device__(Type a, IdxType i) { return a * a; });
+  raft::linalg::detail::reduce<rowMajor, false>(
+    dry_run, std, data, D, N, Type(0), stream, false, [mu] __device__(Type a, IdxType i) {
+      return a * a;
+    });
+  if (dry_run) { return; }
   Type ratio      = Type(1) / ((sample) ? Type(N - 1) : Type(N));
   Type ratio_mean = sample ? ratio * Type(N) : Type(1);
   raft::linalg::binaryOp(std,
@@ -67,18 +71,20 @@ void stddev(Type* std,
  * @tparam rowMajor whether the input data is row or col major
  * @tparam Type the data type
  * @tparam IdxType Integer type used to for addressing
- * @param var the output stddev vector
+ * @param dry_run whether to run in dry-run mode (skip CUDA work)
+ * @param var the output variance vector
  * @param data the input matrix
  * @param mu the mean vector
  * @param D number of columns of data
  * @param N number of rows of data
- * @param sample whether to evaluate sample stddev or not. In other words,
+ * @param sample whether to evaluate sample variance or not. In other words,
  * whether
  *  to normalize the output using N-1 or N, for true or false, respectively
  * @param stream cuda stream where to launch work
  */
 template <bool rowMajor, typename Type, typename IdxType = int>
-void vars(Type* var,
+void vars(bool dry_run,
+          Type* var,
           const Type* data,
           const Type* mu,
           IdxType D,
@@ -86,8 +92,11 @@ void vars(Type* var,
           bool sample,
           cudaStream_t stream)
 {
-  raft::linalg::reduce<rowMajor, false>(
-    var, data, D, N, Type(0), stream, false, [mu] __device__(Type a, IdxType i) { return a * a; });
+  raft::linalg::detail::reduce<rowMajor, false>(
+    dry_run, var, data, D, N, Type(0), stream, false, [mu] __device__(Type a, IdxType i) {
+      return a * a;
+    });
+  if (dry_run) { return; }
   Type ratio      = Type(1) / ((sample) ? Type(N - 1) : Type(N));
   Type ratio_mean = sample ? ratio * Type(N) : Type(1);
   raft::linalg::binaryOp(var,
diff --git a/cpp/include/raft/stats/detail/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh
deleted file mode 100644
index a079fcc12d..0000000000
--- a/cpp/include/raft/stats/detail/sum.cuh
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#pragma once
-
-#include <raft/core/detail/macros.hpp>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/reduce.cuh>
-#include <raft/util/cuda_utils.cuh>
-
-namespace raft {
-namespace stats {
-namespace detail {
-
-template <bool rowMajor, typename Type, typename IdxType = int>
-void sum(Type* output, const Type* input, IdxType D, IdxType N, cudaStream_t stream)
-{
-  raft::linalg::reduce<rowMajor, false>(output, input, D, N, Type(0), stream);
-}
-
-}  // namespace detail
-}  // namespace stats
-}  // namespace raft
diff --git a/cpp/include/raft/stats/detail/trustworthiness_score.cuh b/cpp/include/raft/stats/detail/trustworthiness_score.cuh
index d92ecf355c..ddcda13eb6 100644
--- a/cpp/include/raft/stats/detail/trustworthiness_score.cuh
+++ b/cpp/include/raft/stats/detail/trustworthiness_score.cuh
@@ -4,7 +4,9 @@
  */
 
 #include <raft/core/detail/macros.hpp>
+#include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/distance/distance.cuh>
 #include <raft/matrix/col_wise_sort.cuh>
 #include <raft/spatial/knn/knn.cuh>
@@ -153,30 +155,15 @@ double trustworthiness_score(const raft::resources& h,
     raft::distance::pairwise_distance(
       h, &X[(n - toDo) * m], X, X_dist.data(), curBatchSize, n, m, distance_type);
 
-    size_t colSortWorkspaceSize = 0;
-    bool bAllocWorkspace        = false;
-
-    raft::matrix::sort_cols_per_row(X_dist.data(),
-                                    X_ind.data(),
-                                    curBatchSize,
-                                    n,
-                                    bAllocWorkspace,
-                                    nullptr,
-                                    colSortWorkspaceSize,
-                                    stream);
-
-    if (bAllocWorkspace) {
-      rmm::device_uvector<char> sortColsWorkspace(colSortWorkspaceSize, stream);
-
-      raft::matrix::sort_cols_per_row(X_dist.data(),
-                                      X_ind.data(),
-                                      curBatchSize,
-                                      n,
-                                      bAllocWorkspace,
-                                      sortColsWorkspace.data(),
-                                      colSortWorkspaceSize,
-                                      stream);
-    }
+    // Use dry-run compliant handle-based overload that manages workspace internally
+    auto dist_view = raft::make_device_matrix_view<const math_t, int, raft::row_major>(
+      X_dist.data(), curBatchSize, n);
+    auto ind_view =
+      raft::make_device_matrix_view<int, int, raft::row_major>(X_ind.data(), curBatchSize, n);
+    raft::matrix::sort_cols_per_row(h, dist_view, ind_view, std::nullopt);
+
+    // The workspace won't grow anymore
+    if (resource::get_dry_run_flag(h)) { return 0.0; }
 
     int work     = curBatchSize * n;
     int n_blocks = raft::ceildiv(work, N_THREADS);
diff --git a/cpp/include/raft/stats/detail/v_measure.cuh b/cpp/include/raft/stats/detail/v_measure.cuh
index 0042d726c6..5dacb10e3f 100644
--- a/cpp/include/raft/stats/detail/v_measure.cuh
+++ b/cpp/include/raft/stats/detail/v_measure.cuh
@@ -16,6 +16,7 @@ namespace detail {
 /**
  * @brief Function to calculate the v-measure between two clusters
  *
+ * @param dry_run: whether to run in dry-run mode
  * @param truthClusterArray: the array of truth classes of type T
  * @param predClusterArray: the array of predicted classes of type T
  * @param size: the size of the data points of type int
@@ -25,7 +26,8 @@ namespace detail {
  * @param beta: v_measure parameter
  */
 template <typename T>
-double v_measure(const T* truthClusterArray,
+double v_measure(bool dry_run,
+                 const T* truthClusterArray,
                  const T* predClusterArray,
                  int size,
                  T lowerLabelRange,
@@ -35,10 +37,10 @@ double v_measure(const T* truthClusterArray,
 {
   double computedHomogeity, computedCompleteness, computedVMeasure;
 
-  computedHomogeity = raft::stats::homogeneity_score(
-    truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-  computedCompleteness = raft::stats::homogeneity_score(
-    predClusterArray, truthClusterArray, size, lowerLabelRange, upperLabelRange, stream);
+  computedHomogeity = homogeneity_score(
+    dry_run, truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
+  computedCompleteness = homogeneity_score(
+    dry_run, predClusterArray, truthClusterArray, size, lowerLabelRange, upperLabelRange, stream);
 
   if (computedCompleteness + computedHomogeity == 0.0)
     computedVMeasure = 0.0;
diff --git a/cpp/include/raft/stats/detail/weighted_mean.cuh b/cpp/include/raft/stats/detail/weighted_mean.cuh
index 3bc4993a04..2182bd9ef1 100644
--- a/cpp/include/raft/stats/detail/weighted_mean.cuh
+++ b/cpp/include/raft/stats/detail/weighted_mean.cuh
@@ -23,6 +23,7 @@ namespace detail {
  * @tparam along_rows whether to reduce along rows or columns
  * @tparam Type the data type
  * @tparam IdxType Integer type used to for addressing
+ * @param dry_run whether to run in dry-run mode
  * @param mu the output mean vector
  * @param data the input matrix
  * @param weights weight of size D if along_row is true, else of size N
@@ -31,16 +32,23 @@ namespace detail {
  * @param stream cuda stream to launch work on
  */
 template <bool row_major, bool along_rows, typename Type, typename IdxType = int>
-void weightedMean(
-  Type* mu, const Type* data, const Type* weights, IdxType D, IdxType N, cudaStream_t stream)
+void weightedMean(bool dry_run,
+                  Type* mu,
+                  const Type* data,
+                  const Type* weights,
+                  IdxType D,
+                  IdxType N,
+                  cudaStream_t stream)
 {
   // sum the weights & copy back to CPU
   auto weight_size = along_rows ? D : N;
   Type WS          = 0;
-  raft::stats::sum<false>(mu, weights, (IdxType)1, weight_size, stream);
-  raft::update_host(&WS, mu, 1, stream);
+  raft::linalg::detail::reduce<false, false>(
+    dry_run, mu, weights, (IdxType)1, weight_size, (Type)0, stream);
+  if (!dry_run) { raft::update_host(&WS, mu, 1, stream); }
 
-  raft::linalg::reduce<row_major, along_rows>(
+  raft::linalg::detail::reduce<row_major, along_rows>(
+    dry_run,
     mu,
     data,
     D,
diff --git a/cpp/include/raft/stats/dispersion.cuh b/cpp/include/raft/stats/dispersion.cuh
index caeaa8bf55..6f0c07f8e7 100644
--- a/cpp/include/raft/stats/dispersion.cuh
+++ b/cpp/include/raft/stats/dispersion.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/stats/detail/dispersion.cuh>
 
 #include <optional>
@@ -46,7 +47,7 @@ DataT dispersion(const DataT* centroids,
                  cudaStream_t stream)
 {
   return detail::dispersion<DataT, IdxT, TPB>(
-    centroids, clusterSizes, globalCentroid, nClusters, nPoints, dim, stream);
+    false, centroids, clusterSizes, globalCentroid, nClusters, nPoints, dim, stream);
 }
 
 /**
@@ -89,7 +90,8 @@ value_t cluster_dispersion(
     RAFT_EXPECTS(global_centroid.value().is_exhaustive(), "global_centroid must be contiguous");
     global_centroid_ptr = global_centroid.value().data_handle();
   }
-  return detail::dispersion<value_t, idx_t>(centroids.data_handle(),
+  return detail::dispersion<value_t, idx_t>(resource::get_dry_run_flag(handle),
+                                            centroids.data_handle(),
                                             cluster_sizes.data_handle(),
                                             global_centroid_ptr,
                                             centroids.extent(0),
diff --git a/cpp/include/raft/stats/entropy.cuh b/cpp/include/raft/stats/entropy.cuh
index 5bc8cf515a..b517cf0ae1 100644
--- a/cpp/include/raft/stats/entropy.cuh
+++ b/cpp/include/raft/stats/entropy.cuh
@@ -10,6 +10,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/stats/detail/entropy.cuh>
 
 namespace raft {
@@ -34,7 +35,7 @@ double entropy(const T* clusterArray,
                const T upperLabelRange,
                cudaStream_t stream)
 {
-  return detail::entropy(clusterArray, size, lowerLabelRange, upperLabelRange, stream);
+  return detail::entropy(false, clusterArray, size, lowerLabelRange, upperLabelRange, stream);
 }
 
 /**
@@ -61,7 +62,8 @@ double entropy(raft::resources const& handle,
                const value_t upper_label_range)
 {
   RAFT_EXPECTS(cluster_array.is_exhaustive(), "cluster_array must be contiguous");
-  return detail::entropy(cluster_array.data_handle(),
+  return detail::entropy(resource::get_dry_run_flag(handle),
+                         cluster_array.data_handle(),
                          cluster_array.extent(0),
                          lower_label_range,
                          upper_label_range,
diff --git a/cpp/include/raft/stats/histogram.cuh b/cpp/include/raft/stats/histogram.cuh
index d3b573e6a2..9d07ef3a07 100644
--- a/cpp/include/raft/stats/histogram.cuh
+++ b/cpp/include/raft/stats/histogram.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/stats/detail/histogram.cuh>
 #include <raft/stats/stats_types.hpp>
 
@@ -88,6 +89,10 @@ void histogram(raft::resources const& handle,
                raft::device_matrix_view<int, idx_t, raft::col_major> bins,
                binner_op binner = IdentityBinner<value_t, idx_t>())
 {
+  // Seems like neither implementation of histogram does any CUDA allocations.
+  // There is one allocation of std::vector inside Seive object in computeHashTableSize,
+  // but it doesn't go through std::pmr, so isn't counted.
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(std::is_integral_v<idx_t> && data.extent(0) <= std::numeric_limits<int>::max(),
                "Index type not supported");
   RAFT_EXPECTS(bins.extent(1) == data.extent(1), "Size mismatch");
diff --git a/cpp/include/raft/stats/homogeneity_score.cuh b/cpp/include/raft/stats/homogeneity_score.cuh
index 8edf1624f2..e76642fb9a 100644
--- a/cpp/include/raft/stats/homogeneity_score.cuh
+++ b/cpp/include/raft/stats/homogeneity_score.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/stats/detail/homogeneity_score.cuh>
 
 namespace raft {
@@ -36,7 +37,7 @@ double homogeneity_score(const T* truthClusterArray,
                          cudaStream_t stream)
 {
   return detail::homogeneity_score(
-    truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
+    false, truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
 }
 
 /**
@@ -68,7 +69,8 @@ double homogeneity_score(raft::resources const& handle,
   RAFT_EXPECTS(truth_cluster_array.size() == pred_cluster_array.size(), "Size mismatch");
   RAFT_EXPECTS(truth_cluster_array.is_exhaustive(), "truth_cluster_array must be contiguous");
   RAFT_EXPECTS(pred_cluster_array.is_exhaustive(), "pred_cluster_array must be contiguous");
-  return detail::homogeneity_score(truth_cluster_array.data_handle(),
+  return detail::homogeneity_score(resource::get_dry_run_flag(handle),
+                                   truth_cluster_array.data_handle(),
                                    pred_cluster_array.data_handle(),
                                    truth_cluster_array.extent(0),
                                    lower_label_range,
diff --git a/cpp/include/raft/stats/information_criterion.cuh b/cpp/include/raft/stats/information_criterion.cuh
index 8969c0f24a..de08544a1f 100644
--- a/cpp/include/raft/stats/information_criterion.cuh
+++ b/cpp/include/raft/stats/information_criterion.cuh
@@ -21,6 +21,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/stats/detail/batched/information_criterion.cuh>
 #include <raft/stats/stats_types.hpp>
@@ -89,6 +90,7 @@ void information_criterion_batched(raft::resources const& handle,
                                    idx_t n_params,
                                    idx_t n_samples)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   RAFT_EXPECTS(d_ic.size() == d_loglikelihood.size(), "Size mismatch");
   RAFT_EXPECTS(d_ic.is_exhaustive(), "d_ic must be contiguous");
   RAFT_EXPECTS(d_loglikelihood.is_exhaustive(), "d_loglikelihood must be contiguous");
diff --git a/cpp/include/raft/stats/kl_divergence.cuh b/cpp/include/raft/stats/kl_divergence.cuh
index 86ddbd388f..5ace40174a 100644
--- a/cpp/include/raft/stats/kl_divergence.cuh
+++ b/cpp/include/raft/stats/kl_divergence.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/stats/detail/kl_divergence.cuh>
 
 namespace raft {
@@ -30,7 +31,7 @@ namespace stats {
 template <typename DataT>
 DataT kl_divergence(const DataT* modelPDF, const DataT* candidatePDF, int size, cudaStream_t stream)
 {
-  return detail::kl_divergence(modelPDF, candidatePDF, size, stream);
+  return detail::kl_divergence(false, modelPDF, candidatePDF, size, stream);
 }
 
 /**
@@ -58,7 +59,8 @@ value_t kl_divergence(raft::resources const& handle,
   RAFT_EXPECTS(modelPDF.size() == candidatePDF.size(), "Size mismatch");
   RAFT_EXPECTS(modelPDF.is_exhaustive(), "modelPDF must be contiguous");
   RAFT_EXPECTS(candidatePDF.is_exhaustive(), "candidatePDF must be contiguous");
-  return detail::kl_divergence(modelPDF.data_handle(),
+  return detail::kl_divergence(resource::get_dry_run_flag(handle),
+                               modelPDF.data_handle(),
                                candidatePDF.data_handle(),
                                modelPDF.extent(0),
                                resource::get_cuda_stream(handle));
diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh
index 87310a0bb2..e9104fcc0b 100644
--- a/cpp/include/raft/stats/mean.cuh
+++ b/cpp/include/raft/stats/mean.cuh
@@ -34,7 +34,7 @@ namespace stats {
 template <bool rowMajor, typename Type, typename IdxType = int>
 void mean(Type* mu, const Type* data, IdxType D, IdxType N, cudaStream_t stream)
 {
-  detail::mean<rowMajor>(mu, data, D, N, stream);
+  detail::mean<rowMajor>(false, mu, data, D, N, stream);
 }
 
 /**
@@ -59,7 +59,7 @@ template <bool rowMajor, typename Type, typename IdxType = int>
 [[deprecated("'sample' parameter deprecated")]] void mean(
   Type* mu, const Type* data, IdxType D, IdxType N, bool sample, cudaStream_t stream)
 {
-  detail::mean<rowMajor>(mu, data, D, N, sample, stream);
+  detail::mean<rowMajor>(false, mu, data, D, N, sample, stream);
 }
 
 /**
@@ -90,7 +90,8 @@ void mean(raft::resources const& handle,
   RAFT_EXPECTS(data.extent(1) == mu.extent(0), "Size mismatch between data and mu");
   RAFT_EXPECTS(mu.is_exhaustive(), "mu must be contiguous");
   RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous");
-  detail::mean<std::is_same_v<layout_t, raft::row_major>>(mu.data_handle(),
+  detail::mean<std::is_same_v<layout_t, raft::row_major>>(raft::resource::get_dry_run_flag(handle),
+                                                          mu.data_handle(),
                                                           data.data_handle(),
                                                           data.extent(1),
                                                           data.extent(0),
@@ -125,7 +126,8 @@ template <typename value_t, typename idx_t, typename layout_t>
   RAFT_EXPECTS(data.extent(1) == mu.extent(0), "Size mismatch between data and mu");
   RAFT_EXPECTS(mu.is_exhaustive(), "mu must be contiguous");
   RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous");
-  detail::mean<std::is_same_v<layout_t, raft::row_major>>(mu.data_handle(),
+  detail::mean<std::is_same_v<layout_t, raft::row_major>>(raft::resource::get_dry_run_flag(handle),
+                                                          mu.data_handle(),
                                                           data.data_handle(),
                                                           data.extent(1),
                                                           data.extent(0),
diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.cuh
index 9ccfd82705..1dd150310a 100644
--- a/cpp/include/raft/stats/mean_center.cuh
+++ b/cpp/include/raft/stats/mean_center.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/stats/detail/mean_center.cuh>
 
 namespace raft {
@@ -34,7 +35,8 @@ template <bool rowMajor, bool bcastAlongRows, typename Type, typename IdxType =
 void meanCenter(
   Type* out, const Type* data, const Type* mu, IdxType D, IdxType N, cudaStream_t stream)
 {
-  detail::meanCenter<rowMajor, bcastAlongRows, Type, IdxType, TPB>(out, data, mu, D, N, stream);
+  detail::meanCenter<rowMajor, bcastAlongRows, Type, IdxType, TPB>(
+    false, out, data, mu, D, N, stream);
 }
 
 /**
@@ -54,7 +56,7 @@ void meanCenter(
 template <bool rowMajor, bool bcastAlongRows, typename Type, typename IdxType = int, int TPB = 256>
 void meanAdd(Type* out, const Type* data, const Type* mu, IdxType D, IdxType N, cudaStream_t stream)
 {
-  detail::meanAdd<rowMajor, bcastAlongRows, Type, IdxType, TPB>(out, data, mu, D, N, stream);
+  detail::meanAdd<rowMajor, bcastAlongRows, Type, IdxType, TPB>(false, out, data, mu, D, N, stream);
 }
 
 /**
@@ -91,7 +93,8 @@ void mean_center(raft::resources const& handle,
   detail::meanCenter<std::is_same_v<layout_t, raft::row_major>,
                      apply == Apply::ALONG_ROWS,
                      value_t,
-                     idx_t>(out.data_handle(),
+                     idx_t>(resource::get_dry_run_flag(handle),
+                            out.data_handle(),
                             data.data_handle(),
                             mu.data_handle(),
                             data.extent(1),
@@ -128,6 +131,7 @@ void mean_add(raft::resources const& handle,
   RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous");
   detail::
     meanAdd<std::is_same_v<layout_t, raft::row_major>, apply == Apply::ALONG_ROWS, value_t, idx_t>(
+      resource::get_dry_run_flag(handle),
       out.data_handle(),
       data.data_handle(),
       mu.data_handle(),
diff --git a/cpp/include/raft/stats/meanvar.cuh b/cpp/include/raft/stats/meanvar.cuh
index e21f586f9e..a81819ec67 100644
--- a/cpp/include/raft/stats/meanvar.cuh
+++ b/cpp/include/raft/stats/meanvar.cuh
@@ -10,6 +10,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/stats/detail/meanvar.cuh>
 
 namespace raft {
@@ -45,7 +46,7 @@ void meanvar(Type* mean,
              bool rowMajor,
              cudaStream_t stream)
 {
-  detail::meanvar(mean, var, data, D, N, sample, rowMajor, stream);
+  detail::meanvar(false, mean, var, data, D, N, sample, rowMajor, stream);
 }
 
 /**
@@ -86,7 +87,8 @@ void meanvar(raft::resources const& handle,
   RAFT_EXPECTS(mean.is_exhaustive(), "mean must be contiguous");
   RAFT_EXPECTS(var.is_exhaustive(), "var must be contiguous");
   RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous");
-  detail::meanvar(mean.data_handle(),
+  detail::meanvar(resource::get_dry_run_flag(handle),
+                  mean.data_handle(),
                   var.data_handle(),
                   data.data_handle(),
                   data.extent(1),
diff --git a/cpp/include/raft/stats/minmax.cuh b/cpp/include/raft/stats/minmax.cuh
index 91f6a56537..cc44cb3e79 100644
--- a/cpp/include/raft/stats/minmax.cuh
+++ b/cpp/include/raft/stats/minmax.cuh
@@ -10,6 +10,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/stats/detail/minmax.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -96,6 +97,7 @@ void minmax(raft::resources const& handle,
             raft::device_vector_view<value_t, idx_t> globalmax,
             std::optional<raft::device_vector_view<value_t, idx_t>> sampledcols)
 {
+  if (resource::get_dry_run_flag(handle)) { return; }
   const unsigned* rowids_ptr = nullptr;
   const unsigned* colids_ptr = nullptr;
   value_t* sampledcols_ptr   = nullptr;
diff --git a/cpp/include/raft/stats/mutual_info_score.cuh b/cpp/include/raft/stats/mutual_info_score.cuh
index 7e9c0e4cd7..54364cb92c 100644
--- a/cpp/include/raft/stats/mutual_info_score.cuh
+++ b/cpp/include/raft/stats/mutual_info_score.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/stats/detail/mutual_info_score.cuh>
 
 namespace raft {
@@ -35,7 +36,7 @@ double mutual_info_score(const T* firstClusterArray,
                          cudaStream_t stream)
 {
   return detail::mutual_info_score(
-    firstClusterArray, secondClusterArray, size, lowerLabelRange, upperLabelRange, stream);
+    false, firstClusterArray, secondClusterArray, size, lowerLabelRange, upperLabelRange, stream);
 }
 
 /**
@@ -66,7 +67,8 @@ double mutual_info_score(raft::resources const& handle,
                "Size mismatch between first_cluster_array and second_cluster_array");
   RAFT_EXPECTS(first_cluster_array.is_exhaustive(), "first_cluster_array must be contiguous");
   RAFT_EXPECTS(second_cluster_array.is_exhaustive(), "second_cluster_array must be contiguous");
-  return detail::mutual_info_score(first_cluster_array.data_handle(),
+  return detail::mutual_info_score(resource::get_dry_run_flag(handle),
+                                   first_cluster_array.data_handle(),
                                    second_cluster_array.data_handle(),
                                    first_cluster_array.extent(0),
                                    lower_label_range,
diff --git a/cpp/include/raft/stats/neighborhood_recall.cuh b/cpp/include/raft/stats/neighborhood_recall.cuh
index b3c10d3e85..ab80ad58c5 100644
--- a/cpp/include/raft/stats/neighborhood_recall.cuh
+++ b/cpp/include/raft/stats/neighborhood_recall.cuh
@@ -15,6 +15,7 @@
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/mdspan_types.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 
 #include <optional>
@@ -133,7 +134,7 @@ void neighborhood_recall(
  * // run brute-force KNN for reference
  *
  * auto scalar = 0.0f;
- * auto recall_score = raft::make_host_scalar(scalar);
+ * auto recall_score = raft::make_host_scalar(res, scalar);
  *
  * raft::stats::neighborhood_recall(res,
                                     raft::make_const_mdspan(indices.view()),
@@ -173,6 +174,7 @@ void neighborhood_recall(
   auto recall_score_d = raft::make_device_scalar(res, *recall_score.data_handle());
   neighborhood_recall(
     res, indices, ref_indices, recall_score_d.view(), distances, ref_distances, eps);
+  if (resource::get_dry_run_flag(res)) { return; }
   raft::update_host(recall_score.data_handle(),
                     recall_score_d.data_handle(),
                     1,
diff --git a/cpp/include/raft/stats/r2_score.cuh b/cpp/include/raft/stats/r2_score.cuh
index a4b7bfdf03..35ae279feb 100644
--- a/cpp/include/raft/stats/r2_score.cuh
+++ b/cpp/include/raft/stats/r2_score.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/stats/detail/scores.cuh>
 
 namespace raft {
@@ -34,7 +35,7 @@ namespace stats {
 template <typename math_t>
 math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream)
 {
-  return detail::r2_score(y, y_hat, n, stream);
+  return detail::r2_score(false, y, y_hat, n, stream);
 }
 
 /**
@@ -69,7 +70,8 @@ value_t r2_score(raft::resources const& handle,
   RAFT_EXPECTS(y_hat.is_exhaustive(), "y_hat must be contiguous");
 
   // TODO: Change the underlying implementation to remove the need to const_cast
-  return detail::r2_score(const_cast<value_t*>(y.data_handle()),
+  return detail::r2_score(resource::get_dry_run_flag(handle),
+                          const_cast<value_t*>(y.data_handle()),
                           const_cast<value_t*>(y_hat.data_handle()),
                           y.extent(0),
                           resource::get_cuda_stream(handle));
diff --git a/cpp/include/raft/stats/rand_index.cuh b/cpp/include/raft/stats/rand_index.cuh
index c0334e27af..bb608a6364 100644
--- a/cpp/include/raft/stats/rand_index.cuh
+++ b/cpp/include/raft/stats/rand_index.cuh
@@ -10,6 +10,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/stats/detail/rand_index.cuh>
 
@@ -27,7 +28,7 @@ namespace stats {
 template <typename T>
 double rand_index(T* firstClusterArray, T* secondClusterArray, uint64_t size, cudaStream_t stream)
 {
-  return detail::compute_rand_index(firstClusterArray, secondClusterArray, size, stream);
+  return detail::compute_rand_index(false, firstClusterArray, secondClusterArray, size, stream);
 }
 
 /**
@@ -54,7 +55,8 @@ double rand_index(raft::resources const& handle,
                "Size mismatch between first_cluster_array and second_cluster_array");
   RAFT_EXPECTS(first_cluster_array.is_exhaustive(), "first_cluster_array must be contiguous");
   RAFT_EXPECTS(second_cluster_array.is_exhaustive(), "second_cluster_array must be contiguous");
-  return detail::compute_rand_index(first_cluster_array.data_handle(),
+  return detail::compute_rand_index(resource::get_dry_run_flag(handle),
+                                    first_cluster_array.data_handle(),
                                     second_cluster_array.data_handle(),
                                     second_cluster_array.extent(0),
                                     resource::get_cuda_stream(handle));
diff --git a/cpp/include/raft/stats/regression_metrics.cuh b/cpp/include/raft/stats/regression_metrics.cuh
index 5fb7abd503..1fb95086a8 100644
--- a/cpp/include/raft/stats/regression_metrics.cuh
+++ b/cpp/include/raft/stats/regression_metrics.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/stats/detail/scores.cuh>
 
@@ -40,8 +41,14 @@ void regression_metrics(const T* predictions,
                         double& mean_squared_error,
                         double& median_abs_error)
 {
-  detail::regression_metrics(
-    predictions, ref_predictions, n, stream, mean_abs_error, mean_squared_error, median_abs_error);
+  detail::regression_metrics(false,
+                             predictions,
+                             ref_predictions,
+                             n,
+                             stream,
+                             mean_abs_error,
+                             mean_squared_error,
+                             median_abs_error);
 }
 
 /**
@@ -80,7 +87,8 @@ void regression_metrics(raft::resources const& handle,
                "mean_squared_error view must not be empty");
   RAFT_EXPECTS(median_abs_error.data_handle() != nullptr,
                "median_abs_error view must not be empty");
-  detail::regression_metrics(predictions.data_handle(),
+  detail::regression_metrics(resource::get_dry_run_flag(handle),
+                             predictions.data_handle(),
                              ref_predictions.data_handle(),
                              predictions.extent(0),
                              resource::get_cuda_stream(handle),
diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/stddev.cuh
index 6caae0f62e..a41571a685 100644
--- a/cpp/include/raft/stats/stddev.cuh
+++ b/cpp/include/raft/stats/stddev.cuh
@@ -10,6 +10,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/stats/detail/stddev.cuh>
 
@@ -43,7 +44,7 @@ void stddev(Type* std,
             bool sample,
             cudaStream_t stream)
 {
-  detail::stddev<rowMajor>(std, data, mu, D, N, sample, stream);
+  detail::stddev<rowMajor>(false, std, data, mu, D, N, sample, stream);
 }
 
 /**
@@ -73,7 +74,7 @@ void vars(Type* var,
           bool sample,
           cudaStream_t stream)
 {
-  detail::vars<rowMajor>(var, data, mu, D, N, sample, stream);
+  detail::vars<rowMajor>(false, var, data, mu, D, N, sample, stream);
 }
 
 /**
@@ -111,7 +112,8 @@ void stddev(raft::resources const& handle,
                 "raft::row_major or raft::col_major (or one of their aliases)");
   RAFT_EXPECTS(mu.size() == std.size(), "Size mismatch between mu and std");
   RAFT_EXPECTS(mu.extent(0) == data.extent(1), "Size mismatch between data and mu");
-  detail::stddev<is_row_major>(std.data_handle(),
+  detail::stddev<is_row_major>(resource::get_dry_run_flag(handle),
+                               std.data_handle(),
                                data.data_handle(),
                                mu.data_handle(),
                                data.extent(1),
@@ -157,7 +159,8 @@ void vars(raft::resources const& handle,
                 "raft::row_major or raft::col_major (or one of their aliases)");
   RAFT_EXPECTS(mu.size() == var.size(), "Size mismatch between mu and std");
   RAFT_EXPECTS(mu.extent(0) == data.extent(1), "Size mismatch between data and mu");
-  detail::vars<is_row_major>(var.data_handle(),
+  detail::vars<is_row_major>(resource::get_dry_run_flag(handle),
+                             var.data_handle(),
                              data.data_handle(),
                              mu.data_handle(),
                              data.extent(1),
diff --git a/cpp/include/raft/stats/sum.cuh b/cpp/include/raft/stats/sum.cuh
index 8f006f9234..331bde835a 100644
--- a/cpp/include/raft/stats/sum.cuh
+++ b/cpp/include/raft/stats/sum.cuh
@@ -11,7 +11,10 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
-#include <raft/stats/detail/sum.cuh>
+#include <raft/core/resource/dry_run_flag.hpp>
+#include <raft/core/types.hpp>
+#include <raft/linalg/detail/reduce.cuh>
+#include <raft/linalg/reduce.cuh>
 #include <raft/util/cudart_utils.hpp>
 
 namespace raft {
@@ -34,7 +37,8 @@ namespace stats {
 template <bool rowMajor, typename Type, typename IdxType = int>
 void sum(Type* output, const Type* input, IdxType D, IdxType N, cudaStream_t stream)
 {
-  detail::sum<rowMajor>(output, input, D, N, stream);
+  // Inlined equivalent of detail::sum: reduce along columns (alongRows=false).
+  raft::linalg::detail::reduce<rowMajor, false>(false, output, input, D, N, Type(0), stream);
 }
 
 /**
@@ -65,11 +69,9 @@ void sum(raft::resources const& handle,
                 "sum: Layout must be either "
                 "raft::row_major or raft::col_major (or one of their aliases)");
   RAFT_EXPECTS(input.extent(1) == output.extent(0), "Size mismatch between input and output");
-  detail::sum<is_row_major>(output.data_handle(),
-                            input.data_handle(),
-                            input.extent(1),
-                            input.extent(0),
-                            resource::get_cuda_stream(handle));
+  // Use public reduce API that handles dry-run internally
+  // Sum along columns (Apply::ALONG_COLUMNS), reduce_op is add_op (default)
+  raft::linalg::reduce<raft::Apply::ALONG_COLUMNS>(handle, input, output, value_t(0));
 }
 
 /** @} */  // end group stats_sum
diff --git a/cpp/include/raft/stats/v_measure.cuh b/cpp/include/raft/stats/v_measure.cuh
index 6c7274d950..3451883798 100644
--- a/cpp/include/raft/stats/v_measure.cuh
+++ b/cpp/include/raft/stats/v_measure.cuh
@@ -10,6 +10,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/stats/detail/v_measure.cuh>
 
@@ -36,8 +37,14 @@ double v_measure(const T* truthClusterArray,
                  cudaStream_t stream,
                  double beta = 1.0)
 {
-  return detail::v_measure(
-    truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream, beta);
+  return detail::v_measure(false,
+                           truthClusterArray,
+                           predClusterArray,
+                           size,
+                           lowerLabelRange,
+                           upperLabelRange,
+                           stream,
+                           beta);
 }
 
 /**
@@ -71,7 +78,8 @@ double v_measure(raft::resources const& handle,
   RAFT_EXPECTS(truth_cluster_array.is_exhaustive(), "truth_cluster_array must be contiguous");
   RAFT_EXPECTS(pred_cluster_array.is_exhaustive(), "pred_cluster_array must be contiguous");
 
-  return detail::v_measure(truth_cluster_array.data_handle(),
+  return detail::v_measure(resource::get_dry_run_flag(handle),
+                           truth_cluster_array.data_handle(),
                            pred_cluster_array.data_handle(),
                            truth_cluster_array.extent(0),
                            lower_label_range,
diff --git a/cpp/include/raft/stats/weighted_mean.cuh b/cpp/include/raft/stats/weighted_mean.cuh
index 62cb0d9ff9..c1e6f023f3 100644
--- a/cpp/include/raft/stats/weighted_mean.cuh
+++ b/cpp/include/raft/stats/weighted_mean.cuh
@@ -11,6 +11,7 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/types.hpp>
 #include <raft/stats/detail/weighted_mean.cuh>
 
@@ -36,7 +37,7 @@ template <bool row_major, bool along_rows, typename Type, typename IdxType = int
 void weightedMean(
   Type* mu, const Type* data, const Type* weights, IdxType D, IdxType N, cudaStream_t stream)
 {
-  detail::weightedMean<row_major, along_rows>(mu, data, weights, D, N, stream);
+  detail::weightedMean<row_major, along_rows>(false, mu, data, weights, D, N, stream);
 }
 
 /**
@@ -117,7 +118,8 @@ void weighted_mean(raft::resources const& handle,
   RAFT_EXPECTS(mu.extent(0) == mean_vec_size,
                "Size mismatch between mu and expected mean_vec_size");
 
-  detail::weightedMean<is_row_major, is_along_rows>(mu.data_handle(),
+  detail::weightedMean<is_row_major, is_along_rows>(resource::get_dry_run_flag(handle),
+                                                    mu.data_handle(),
                                                     data.data_handle(),
                                                     weights.data_handle(),
                                                     data.extent(1),
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 00bfe1f32a..8c95b084a9 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -323,6 +323,8 @@ if(BUILD_TESTS)
     util/popc.cu
     util/pow2_utils.cu
     util/reduction.cu
+    util/dry_run_resources.cpp
+    util/dry_run_guards.cu
   )
 endif()
 
diff --git a/cpp/tests/core/bitmap.cu b/cpp/tests/core/bitmap.cu
index 8ba85a4bc1..e3576cbcdd 100644
--- a/cpp/tests/core/bitmap.cu
+++ b/cpp/tests/core/bitmap.cu
@@ -1,12 +1,14 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
 #include "../test_utils.cuh"
 
 #include <raft/core/bitmap.cuh>
+#include <raft/core/copy.cuh>
 #include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdspan.hpp>
 #include <raft/linalg/init.cuh>
 #include <raft/linalg/map.cuh>
 #include <raft/random/rng.cuh>
@@ -102,69 +104,85 @@ class BitmapTest : public testing::TestWithParam<test_spec_bitmap<index_t>> {
 
     create_cpu_bitmap(bitmap_ref, mask_cpu, spec.rows, spec.cols);
 
-    auto bitset_d = raft::core::bitset<bitmap_t, index_t>(
-      res, raft::make_const_mdspan(mask_device.view()), index_t(spec.rows * spec.cols));
-
-    auto bitmap_view_d =
-      raft::core::bitmap_view<bitmap_t, index_t>(bitset_d.data(), spec.rows, spec.cols);
-
-    ASSERT_EQ(bitmap_view_d.get_n_rows(), spec.rows);
-    ASSERT_EQ(bitmap_view_d.get_n_cols(), spec.cols);
-
     auto query_device  = raft::make_device_vector<index_t, index_t>(res, spec.query_len);
     auto result_device = raft::make_device_vector<uint8_t, index_t>(res, spec.query_len);
     auto query_cpu     = std::vector<index_t>(spec.query_len);
     auto result_cpu    = std::vector<uint8_t>(spec.query_len);
     auto result_ref    = std::vector<uint8_t>(spec.query_len);
 
-    raft::random::uniformInt(
-      res, rng, query_device.view(), index_t(0), index_t(spec.rows * spec.cols));
-    raft::update_host(query_cpu.data(), query_device.data_handle(), query_device.extent(0), stream);
-
-    auto queries_device_view =
-      raft::make_device_vector_view<const index_t>(query_device.data_handle(), spec.query_len);
-
-    raft::linalg::map(
+    raft::execute_with_dry_run_check(
       res,
-      result_device.view(),
-      [bitmap_view_d] __device__(index_t query) {
-        auto row = query / bitmap_view_d.get_n_cols();
-        auto col = query % bitmap_view_d.get_n_cols();
-        return (uint8_t)(bitmap_view_d.test(row, col));
+      [&](raft::resources const& h) {
+        raft::core::bitset<bitmap_t, index_t> bitset_d(
+          h, raft::make_const_mdspan(mask_device.view()), index_t(spec.rows * spec.cols));
+
+        auto bitmap_view_d =
+          raft::core::bitmap_view<bitmap_t, index_t>(bitset_d.data(), spec.rows, spec.cols);
+
+        ASSERT_EQ(bitmap_view_d.get_n_rows(), spec.rows);
+        ASSERT_EQ(bitmap_view_d.get_n_cols(), spec.cols);
+
+        raft::random::uniformInt(
+          h, rng, query_device.view(), index_t(0), index_t(spec.rows * spec.cols));
+        raft::copy(h,
+                   raft::make_host_vector_view(query_cpu.data(), query_device.extent(0)),
+                   raft::make_const_mdspan(query_device.view()));
+
+        auto queries_device_view =
+          raft::make_device_vector_view<const index_t>(query_device.data_handle(), spec.query_len);
+
+        raft::linalg::map(
+          h,
+          result_device.view(),
+          [bitmap_view_d] __device__(index_t query) {
+            auto row = query / bitmap_view_d.get_n_cols();
+            auto col = query % bitmap_view_d.get_n_cols();
+            return (uint8_t)(bitmap_view_d.test(row, col));
+          },
+          queries_device_view);
+
+        raft::copy(h,
+                   raft::make_host_vector_view(result_cpu.data(), result_device.extent(0)),
+                   raft::make_const_mdspan(result_device.view()));
+        resource::sync_stream(h, stream);
+
+        test_cpu_bitmap(bitmap_ref, query_cpu, result_ref, spec.rows, spec.cols);
+
+        if (resource::get_dry_run_flag(h)) { return; }
+        ASSERT_TRUE(hostVecMatch(result_cpu, result_ref, Compare<uint8_t>()));
+
+        raft::random::uniformInt(
+          h, rng, mask_device.view(), index_t(0), index_t(spec.rows * spec.cols));
+        raft::copy(h,
+                   raft::make_host_vector_view(mask_cpu.data(), mask_device.extent(0)),
+                   raft::make_const_mdspan(mask_device.view()));
+        resource::sync_stream(h, stream);
+
+        thrust::for_each_n(raft::resource::get_thrust_policy(h),
+                           mask_device.data_handle(),
+                           mask_device.extent(0),
+                           [bitmap_view_d] __device__(const index_t sample_index) {
+                             auto row = sample_index / bitmap_view_d.get_n_cols();
+                             auto col = sample_index % bitmap_view_d.get_n_cols();
+                             bitmap_view_d.set(row, col, false);
+                           });
+
+        raft::copy(h,
+                   raft::make_host_vector_view(bitmap_result.data(), bitmap_result.size()),
+                   raft::make_device_vector_view<const bitmap_t>(bitmap_view_d.data(),
+                                                                 bitmap_result.size()));
+
+        for (size_t i = 0; i < mask_cpu.size(); i++) {
+          auto row = mask_cpu[i] / spec.cols;
+          auto col = mask_cpu[i] % spec.cols;
+          auto idx = row * spec.cols + col;
+          bitmap_ref[idx / bitmap_element_size] &= ~(bitmap_t{1} << (idx % bitmap_element_size));
+        }
+        resource::sync_stream(h, stream);
+        ASSERT_TRUE(hostVecMatch(bitmap_ref, bitmap_result, raft::Compare<bitmap_t>()));
       },
-      queries_device_view);
-
-    raft::update_host(result_cpu.data(), result_device.data_handle(), query_device.size(), stream);
-    resource::sync_stream(res, stream);
-
-    test_cpu_bitmap(bitmap_ref, query_cpu, result_ref, spec.rows, spec.cols);
-
-    ASSERT_TRUE(hostVecMatch(result_cpu, result_ref, Compare<uint8_t>()));
-
-    raft::random::uniformInt(
-      res, rng, mask_device.view(), index_t(0), index_t(spec.rows * spec.cols));
-    raft::update_host(mask_cpu.data(), mask_device.data_handle(), mask_device.extent(0), stream);
-    resource::sync_stream(res, stream);
-
-    thrust::for_each_n(raft::resource::get_thrust_policy(res),
-                       mask_device.data_handle(),
-                       mask_device.extent(0),
-                       [bitmap_view_d] __device__(const index_t sample_index) {
-                         auto row = sample_index / bitmap_view_d.get_n_cols();
-                         auto col = sample_index % bitmap_view_d.get_n_cols();
-                         bitmap_view_d.set(row, col, false);
-                       });
-
-    raft::update_host(bitmap_result.data(), bitmap_view_d.data(), bitmap_result.size(), stream);
-
-    for (size_t i = 0; i < mask_cpu.size(); i++) {
-      auto row = mask_cpu[i] / spec.cols;
-      auto col = mask_cpu[i] % spec.cols;
-      auto idx = row * spec.cols + col;
-      bitmap_ref[idx / bitmap_element_size] &= ~(bitmap_t{1} << (idx % bitmap_element_size));
-    }
-    resource::sync_stream(res, stream);
-    ASSERT_TRUE(hostVecMatch(bitmap_ref, bitmap_result, raft::Compare<bitmap_t>()));
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      raft::ceildiv(index_t(spec.rows * spec.cols), bitmap_element_size) * sizeof(bitmap_t));
   }
 };
 
diff --git a/cpp/tests/core/bitset.cu b/cpp/tests/core/bitset.cu
index 9f5e095ffa..d2dac44935 100644
--- a/cpp/tests/core/bitset.cu
+++ b/cpp/tests/core/bitset.cu
@@ -1,12 +1,15 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
 #include "../test_utils.cuh"
 
 #include <raft/core/bitset.cuh>
+#include <raft/core/copy.cuh>
 #include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/host_mdspan.hpp>
 #include <raft/linalg/init.cuh>
 #include <raft/random/rng.cuh>
 
@@ -183,16 +186,6 @@ class BitsetTest : public testing::TestWithParam<test_spec_bitset> {
     update_host(mask_cpu.data(), mask_device.data_handle(), mask_device.extent(0), stream);
     resource::sync_stream(res, stream);
 
-    // calculate the results
-    auto my_bitset = raft::core::bitset<bitset_t, index_t>(
-      res, raft::make_const_mdspan(mask_device.view()), index_t(spec.bitset_len));
-    update_host(bitset_result.data(), my_bitset.data(), bitset_result.size(), stream);
-
-    // calculate the reference
-    create_cpu_bitset(bitset_ref, mask_cpu);
-    resource::sync_stream(res, stream);
-    ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
-
     auto query_device     = raft::make_device_vector<index_t, index_t>(res, spec.query_len);
     auto result_device    = raft::make_device_vector<uint8_t, index_t>(res, spec.query_len);
     auto query_cpu        = std::vector<index_t>(spec.query_len);
@@ -200,140 +193,208 @@ class BitsetTest : public testing::TestWithParam<test_spec_bitset> {
     auto result_ref_nbits = std::vector<uint8_t>(spec.query_len);
     auto result_ref       = std::vector<uint8_t>(spec.query_len);
 
-    // Create queries and verify the test results
-    raft::random::uniformInt(res, rng, query_device.view(), index_t(0), index_t(spec.bitset_len));
-    update_host(query_cpu.data(), query_device.data_handle(), query_device.extent(0), stream);
-    my_bitset.test(res, raft::make_const_mdspan(query_device.view()), result_device.view());
-    update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
-    test_cpu_bitset(bitset_ref, query_cpu, result_ref);
-    resource::sync_stream(res, stream);
-    ASSERT_TRUE(hostVecMatch(result_cpu, result_ref, Compare<uint8_t>()));
+    auto eval_n_elements =
+      bitset_view<bitset_t, index_t>::eval_n_elements(spec.bitset_len * spec.repeat_times);
+    auto repeat_device = raft::make_device_vector<bitset_t, index_t>(res, eval_n_elements);
+
+    // Verify dry-run compliance of bitset construction and the operations exercised below.
+    // The entire bitset lifetime must be within the dry-run scope to prevent
+    // its device_buffer from outliving the temporary resource wrapper.
+    raft::execute_with_dry_run_check(
+      res,
+      [&](raft::resources const& h) {
+        raft::core::bitset<bitset_t, index_t> my_bitset(
+          h, raft::make_const_mdspan(mask_device.view()), index_t(spec.bitset_len));
+
+        raft::copy(h,
+                   raft::make_host_vector_view(bitset_result.data(), bitset_result.size()),
+                   my_bitset.to_mdspan());
+
+        // calculate the reference
+        create_cpu_bitset(bitset_ref, mask_cpu);
+        resource::sync_stream(h, stream);
+        if (!resource::get_dry_run_flag(h)) {
+          ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
+        }
 
-    // Add more sample to the bitset and re-test
-    raft::random::uniformInt(res, rng, mask_device.view(), index_t(0), index_t(spec.bitset_len));
-    update_host(mask_cpu.data(), mask_device.data_handle(), mask_device.extent(0), stream);
-    resource::sync_stream(res, stream);
-    my_bitset.set(res, mask_device.view());
-    update_host(bitset_result.data(), my_bitset.data(), bitset_result.size(), stream);
+        // Create queries and verify the test results
+        raft::random::uniformInt(h, rng, query_device.view(), index_t(0), index_t(spec.bitset_len));
+        raft::copy(h,
+                   raft::make_host_vector_view(query_cpu.data(), query_device.extent(0)),
+                   raft::make_const_mdspan(query_device.view()));
+        my_bitset.test(h, raft::make_const_mdspan(query_device.view()), result_device.view());
+        raft::copy(h,
+                   raft::make_host_vector_view(result_cpu.data(), result_device.extent(0)),
+                   raft::make_const_mdspan(result_device.view()));
+        test_cpu_bitset(bitset_ref, query_cpu, result_ref);
+        resource::sync_stream(h, stream);
+        if (!resource::get_dry_run_flag(h)) {
+          ASSERT_TRUE(hostVecMatch(result_cpu, result_ref, Compare<uint8_t>()));
+        }
 
-    add_cpu_bitset(bitset_ref, mask_cpu);
-    resource::sync_stream(res, stream);
-    ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
-
-    // Reinterpret the bitset as uint8_t, uint32 then uint64_t
-    {
-      // Test CPU logic
-      test_cpu_bitset(bitset_ref, query_cpu, result_ref);
-      uint8_t* bitset_cpu_uint8 = (uint8_t*)std::malloc(sizeof(bitset_t) * bitset_ref.size());
-      std::memcpy(bitset_cpu_uint8, bitset_ref.data(), sizeof(bitset_t) * bitset_ref.size());
-      test_cpu_bitset_nbits(bitset_cpu_uint8, query_cpu, result_ref_nbits, sizeof(bitset_t) * 8);
-      ASSERT_TRUE(hostVecMatch(result_ref, result_ref_nbits, raft::Compare<uint8_t>()));
-      std::free(bitset_cpu_uint8);
-
-      // Test GPU uint8_t, uint32_t, uint64_t
-      auto my_bitset_view_uint8_t = raft::core::bitset_view<uint8_t, uint32_t>(
-        reinterpret_cast<uint8_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
-      raft::linalg::map(
-        res,
-        result_device.view(),
-        [my_bitset_view_uint8_t] __device__(index_t query) {
-          return my_bitset_view_uint8_t.test(query);
-        },
-        raft::make_const_mdspan(query_device.view()));
-      update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
-      resource::sync_stream(res, stream);
-      ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
-
-      auto my_bitset_view_uint32_t = raft::core::bitset_view<uint32_t, uint32_t>(
-        reinterpret_cast<uint32_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
-      raft::linalg::map(
-        res,
-        result_device.view(),
-        [my_bitset_view_uint32_t] __device__(index_t query) {
-          return my_bitset_view_uint32_t.test(query);
-        },
-        raft::make_const_mdspan(query_device.view()));
-      update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
-      resource::sync_stream(res, stream);
-      ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
-
-      auto my_bitset_view_uint64_t = raft::core::bitset_view<uint64_t, uint32_t>(
-        reinterpret_cast<uint64_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
-      raft::linalg::map(
-        res,
-        result_device.view(),
-        [my_bitset_view_uint64_t] __device__(index_t query) {
-          return my_bitset_view_uint64_t.test(query);
-        },
-        raft::make_const_mdspan(query_device.view()));
-      update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
-      resource::sync_stream(res, stream);
-      ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
-    }
+        // Add more samples to the bitset and re-test
+        raft::random::uniformInt(h, rng, mask_device.view(), index_t(0), index_t(spec.bitset_len));
+        raft::copy(h,
+                   raft::make_host_vector_view(mask_cpu.data(), mask_device.extent(0)),
+                   raft::make_const_mdspan(mask_device.view()));
+        resource::sync_stream(h, stream);
+        my_bitset.set(h, mask_device.view());
+        raft::copy(h,
+                   raft::make_host_vector_view(bitset_result.data(), bitset_result.size()),
+                   my_bitset.to_mdspan());
+
+        add_cpu_bitset(bitset_ref, mask_cpu);
+        resource::sync_stream(h, stream);
+        if (!resource::get_dry_run_flag(h)) {
+          ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
+        }
 
-    // test sparsity, repeat and eval_n_elements
-    {
-      auto my_bitset_view  = my_bitset.view();
-      auto sparsity_result = my_bitset_view.sparsity(res);
-      auto sparsity_ref    = sparsity_cpu_bitset(bitset_ref, size_t(spec.bitset_len));
-      ASSERT_EQ(sparsity_result, sparsity_ref);
-
-      auto eval_n_elements =
-        bitset_view<bitset_t, index_t>::eval_n_elements(spec.bitset_len * spec.repeat_times);
-      ASSERT_EQ(bitset_repeat_ref.size(), eval_n_elements);
-
-      auto repeat_device = raft::make_device_vector<bitset_t, index_t>(res, eval_n_elements);
-      RAFT_CUDA_TRY(cudaMemsetAsync(
-        repeat_device.data_handle(), 0, eval_n_elements * sizeof(bitset_t), stream));
-      repeat_cpu_bitset(
-        bitset_ref, size_t(spec.bitset_len), size_t(spec.repeat_times), bitset_repeat_ref);
-
-      my_bitset_view.repeat(res, index_t(spec.repeat_times), repeat_device.data_handle());
-
-      ASSERT_EQ(bitset_repeat_ref.size(), repeat_device.size());
-      update_host(
-        bitset_repeat_result.data(), repeat_device.data_handle(), repeat_device.size(), stream);
-      ASSERT_EQ(bitset_repeat_ref.size(), bitset_repeat_result.size());
-
-      index_t errors                        = 0;
-      static constexpr index_t len_per_item = sizeof(bitset_t) * 8;
-      bitset_t tail_len = (index_t(spec.bitset_len * spec.repeat_times) % len_per_item);
-      bitset_t tail_mask =
-        tail_len ? (bitset_t)((bitset_t{1} << tail_len) - bitset_t{1}) : ~bitset_t{0};
-      for (index_t i = 0; i < bitset_repeat_ref.size(); i++) {
-        if (i == bitset_repeat_ref.size() - 1) {
-          errors += (bitset_repeat_ref[i] & tail_mask) != (bitset_repeat_result[i] & tail_mask);
-        } else {
-          errors += (bitset_repeat_ref[i] != bitset_repeat_result[i]);
+        // Reinterpret the bitset as uint8_t, uint32_t, then uint64_t
+        {
+          // Test CPU logic
+          test_cpu_bitset(bitset_ref, query_cpu, result_ref);
+          auto bitset_cpu_uint8 =
+            raft::make_host_vector<uint8_t>(h, sizeof(bitset_t) * bitset_ref.size());
+          raft::copy(
+            h,
+            bitset_cpu_uint8.view(),
+            raft::make_host_vector_view(reinterpret_cast<const uint8_t*>(bitset_ref.data()),
+                                        bitset_cpu_uint8.extent(0)));
+          test_cpu_bitset_nbits(
+            bitset_cpu_uint8.data_handle(), query_cpu, result_ref_nbits, sizeof(bitset_t) * 8);
+          if (!resource::get_dry_run_flag(h)) {
+            ASSERT_TRUE(hostVecMatch(result_ref, result_ref_nbits, raft::Compare<uint8_t>()));
+          }
+
+          // Test GPU uint8_t, uint32_t, uint64_t
+          auto my_bitset_view_uint8_t = raft::core::bitset_view<uint8_t, uint32_t>(
+            reinterpret_cast<uint8_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
+          raft::linalg::map(
+            h,
+            result_device.view(),
+            [my_bitset_view_uint8_t] __device__(index_t query) {
+              return my_bitset_view_uint8_t.test(query);
+            },
+            raft::make_const_mdspan(query_device.view()));
+          raft::copy(h,
+                     raft::make_host_vector_view(result_cpu.data(), result_device.extent(0)),
+                     raft::make_const_mdspan(result_device.view()));
+          resource::sync_stream(h, stream);
+          if (!resource::get_dry_run_flag(h)) {
+            ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
+          }
+
+          auto my_bitset_view_uint32_t = raft::core::bitset_view<uint32_t, uint32_t>(
+            reinterpret_cast<uint32_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
+          raft::linalg::map(
+            h,
+            result_device.view(),
+            [my_bitset_view_uint32_t] __device__(index_t query) {
+              return my_bitset_view_uint32_t.test(query);
+            },
+            raft::make_const_mdspan(query_device.view()));
+          raft::copy(h,
+                     raft::make_host_vector_view(result_cpu.data(), result_device.extent(0)),
+                     raft::make_const_mdspan(result_device.view()));
+          resource::sync_stream(h, stream);
+          if (!resource::get_dry_run_flag(h)) {
+            ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
+          }
+
+          auto my_bitset_view_uint64_t = raft::core::bitset_view<uint64_t, uint32_t>(
+            reinterpret_cast<uint64_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
+          raft::linalg::map(
+            h,
+            result_device.view(),
+            [my_bitset_view_uint64_t] __device__(index_t query) {
+              return my_bitset_view_uint64_t.test(query);
+            },
+            raft::make_const_mdspan(query_device.view()));
+          raft::copy(h,
+                     raft::make_host_vector_view(result_cpu.data(), result_device.extent(0)),
+                     raft::make_const_mdspan(result_device.view()));
+          resource::sync_stream(h, stream);
+          if (!resource::get_dry_run_flag(h)) {
+            ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
+          }
         }
-      }
-      ASSERT_EQ(errors, 0);
 
-      // recheck the sparsity after repeat
-      sparsity_result =
-        sparsity_cpu_bitset(bitset_repeat_result, size_t(spec.bitset_len * spec.repeat_times));
-      ASSERT_EQ(sparsity_result, sparsity_ref);
-    }
+        // test sparsity, repeat and eval_n_elements
+        {
+          auto my_bitset_view  = my_bitset.view();
+          auto sparsity_result = my_bitset_view.sparsity(h);
+          auto sparsity_ref    = sparsity_cpu_bitset(bitset_ref, size_t(spec.bitset_len));
+          if (!resource::get_dry_run_flag(h)) { ASSERT_EQ(sparsity_result, sparsity_ref); }
+
+          ASSERT_EQ(bitset_repeat_ref.size(), eval_n_elements);
+
+          RAFT_CUDA_TRY(cudaMemsetAsync(
+            repeat_device.data_handle(), 0, eval_n_elements * sizeof(bitset_t), stream));
+          repeat_cpu_bitset(
+            bitset_ref, size_t(spec.bitset_len), size_t(spec.repeat_times), bitset_repeat_ref);
+
+          my_bitset_view.repeat(h, index_t(spec.repeat_times), repeat_device.data_handle());
+
+          ASSERT_EQ(bitset_repeat_ref.size(), repeat_device.size());
+          raft::copy(
+            h,
+            raft::make_host_vector_view(bitset_repeat_result.data(), repeat_device.extent(0)),
+            raft::make_const_mdspan(repeat_device.view()));
+          ASSERT_EQ(bitset_repeat_ref.size(), bitset_repeat_result.size());
+
+          if (!resource::get_dry_run_flag(h)) {
+            index_t errors                        = 0;
+            static constexpr index_t len_per_item = sizeof(bitset_t) * 8;
+            bitset_t tail_len = (index_t(spec.bitset_len * spec.repeat_times) % len_per_item);
+            bitset_t tail_mask =
+              tail_len ? (bitset_t)((bitset_t{1} << tail_len) - bitset_t{1}) : ~bitset_t{0};
+            for (index_t i = 0; i < bitset_repeat_ref.size(); i++) {
+              if (i == bitset_repeat_ref.size() - 1) {
+                errors +=
+                  (bitset_repeat_ref[i] & tail_mask) != (bitset_repeat_result[i] & tail_mask);
+              } else {
+                errors += (bitset_repeat_ref[i] != bitset_repeat_result[i]);
+              }
+            }
+            ASSERT_EQ(errors, 0);
+
+            // recheck the sparsity after repeat
+            sparsity_result = sparsity_cpu_bitset(bitset_repeat_result,
+                                                  size_t(spec.bitset_len * spec.repeat_times));
+            ASSERT_EQ(sparsity_result, sparsity_ref);
+          }
+        }
 
-    // Flip the bitset and re-test
-    auto bitset_count = my_bitset.count(res);
-    my_bitset.flip(res);
-    ASSERT_EQ(my_bitset.count(res), spec.bitset_len - bitset_count);
-    update_host(bitset_result.data(), my_bitset.data(), bitset_result.size(), stream);
-    flip_cpu_bitset(bitset_ref);
-    resource::sync_stream(res, stream);
-    ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
-
-    // Test count() operations
-    my_bitset.reset(res, false);
-    ASSERT_EQ(my_bitset.any(res), false);
-    ASSERT_EQ(my_bitset.none(res), true);
-    raft::linalg::range(query_device.data_handle(), query_device.size(), stream);
-    my_bitset.set(res, raft::make_const_mdspan(query_device.view()), true);
-    bitset_count = my_bitset.count(res);
-    ASSERT_EQ(bitset_count, query_device.size());
-    ASSERT_EQ(my_bitset.any(res), true);
-    ASSERT_EQ(my_bitset.none(res), false);
+        // Flip the bitset and re-test
+        auto bitset_count = my_bitset.count(h);
+        my_bitset.flip(h);
+        if (!resource::get_dry_run_flag(h)) {
+          ASSERT_EQ(my_bitset.count(h), spec.bitset_len - bitset_count);
+        }
+        raft::copy(h,
+                   raft::make_host_vector_view(bitset_result.data(), bitset_result.size()),
+                   my_bitset.to_mdspan());
+        flip_cpu_bitset(bitset_ref);
+        resource::sync_stream(h, stream);
+        if (!resource::get_dry_run_flag(h)) {
+          ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
+        }
+
+        // Test count() operations
+        my_bitset.reset(h, false);
+        ASSERT_EQ(my_bitset.any(h), false);
+        ASSERT_EQ(my_bitset.none(h), true);
+        raft::linalg::range(query_device.data_handle(), query_device.size(), stream);
+        my_bitset.set(h, raft::make_const_mdspan(query_device.view()), true);
+        bitset_count = my_bitset.count(h);
+        if (!resource::get_dry_run_flag(h)) {
+          ASSERT_EQ(bitset_count, query_device.size());
+          ASSERT_EQ(my_bitset.any(h), true);
+          ASSERT_EQ(my_bitset.none(h), false);
+        }
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      raft::ceildiv(spec.bitset_len, uint64_t(bitset_element_size)) * sizeof(bitset_t));
   }
 };
 
diff --git a/cpp/tests/core/mdarray.cu b/cpp/tests/core/mdarray.cu
index 5c56177571..9c78b503b1 100644
--- a/cpp/tests/core/mdarray.cu
+++ b/cpp/tests/core/mdarray.cu
@@ -347,15 +347,20 @@ void test_factory_methods()
   }
   {
     raft::resources handle;
-    // device mdarray
-    auto d_matrix = make_device_matrix<float>(handle, n, n);
-    ASSERT_EQ(d_matrix.extent(0), n);
-    ASSERT_EQ(d_matrix.extent(1), n);
-    static_assert(d_matrix.rank() == 2);
-
-    auto d_vec = make_device_vector<float>(handle, n);
-    static_assert(d_vec.rank() == 1);
-    ASSERT_EQ(d_vec.extent(0), n);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        auto d_matrix = make_device_matrix<float>(h, n, n);
+        ASSERT_EQ(d_matrix.extent(0), n);
+        ASSERT_EQ(d_matrix.extent(1), n);
+        static_assert(d_matrix.rank() == 2);
+
+        auto d_vec = make_device_vector<float>(h, n);
+        static_assert(d_vec.rank() == 1);
+        ASSERT_EQ(d_vec.extent(0), n);
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      n * (n + 1) * sizeof(float));
   }
 
   {
diff --git a/cpp/tests/core/mdbuffer.cu b/cpp/tests/core/mdbuffer.cu
index b00a4c437b..d4eac951ec 100644
--- a/cpp/tests/core/mdbuffer.cu
+++ b/cpp/tests/core/mdbuffer.cu
@@ -1,9 +1,9 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_resources.hpp>
@@ -89,61 +89,50 @@ TEST(MDBuffer, FromDevice)
   auto constexpr depth = std::uint32_t{5};
   auto constexpr rows  = std::uint32_t{3};
   auto constexpr cols  = std::uint32_t{2};
-  auto data = make_device_mdarray<int, std::uint32_t, layout_c_contiguous, depth, rows, cols>(
-    res, extents<std::uint32_t, depth, rows, cols>{});
-
-  auto buffer = mdbuffer(data);
-  EXPECT_FALSE(buffer.is_owning());
-  EXPECT_EQ(buffer.mem_type(), memory_type::device);
-  EXPECT_EQ(buffer.view<memory_type::device>().data_handle(), data.data_handle());
-  EXPECT_EQ(std::as_const(buffer).view<memory_type::device>().data_handle(), data.data_handle());
-  EXPECT_EQ(buffer.view<memory_type::device>().data_handle(),
-            std::as_const(buffer).view<memory_type::device>().data_handle());
-  EXPECT_EQ(buffer.view().index(), variant_index_from_memory_type(memory_type::device));
-
-  buffer = mdbuffer(data.view());
-  EXPECT_FALSE(buffer.is_owning());
-  EXPECT_EQ(buffer.mem_type(), memory_type::device);
-  EXPECT_EQ(buffer.view<memory_type::device>().data_handle(), data.data_handle());
-  EXPECT_EQ(std::as_const(buffer).view<memory_type::device>().data_handle(), data.data_handle());
-  EXPECT_EQ(buffer.view<memory_type::device>().data_handle(),
-            std::as_const(buffer).view<memory_type::device>().data_handle());
-
-  auto original_data_handle = data.data_handle();
-  buffer                    = mdbuffer(std::move(data));
-  EXPECT_TRUE(buffer.is_owning());
-  EXPECT_EQ(buffer.mem_type(), memory_type::device);
-  EXPECT_EQ(buffer.view<memory_type::device>().data_handle(), original_data_handle);
-
-  auto buffer2 = mdbuffer(res, buffer);
-  EXPECT_FALSE(buffer2.is_owning());
-  EXPECT_EQ(buffer2.mem_type(), memory_type::device);
-  EXPECT_EQ(buffer2.view<memory_type::device>().data_handle(),
-            buffer.view<memory_type::device>().data_handle());
-
-  buffer2 = mdbuffer(res, buffer, memory_type::host);
-  EXPECT_TRUE(buffer2.is_owning());
-  EXPECT_EQ(buffer2.mem_type(), memory_type::host);
-  EXPECT_NE(buffer2.view<memory_type::host>().data_handle(),
-            buffer.view<memory_type::device>().data_handle());
 
-  buffer2 = mdbuffer(res, buffer, memory_type::device);
-  EXPECT_FALSE(buffer2.is_owning());
-  EXPECT_EQ(buffer2.mem_type(), memory_type::device);
-  EXPECT_EQ(buffer2.view<memory_type::device>().data_handle(),
-            buffer.view<memory_type::device>().data_handle());
-
-  buffer2 = mdbuffer(res, buffer, memory_type::managed);
-  EXPECT_TRUE(buffer2.is_owning());
-  EXPECT_EQ(buffer2.mem_type(), memory_type::managed);
-  EXPECT_NE(buffer2.view<memory_type::managed>().data_handle(),
-            buffer.view<memory_type::device>().data_handle());
-
-  buffer2 = mdbuffer(res, buffer, memory_type::pinned);
-  EXPECT_TRUE(buffer2.is_owning());
-  EXPECT_EQ(buffer2.mem_type(), memory_type::pinned);
-  EXPECT_NE(buffer2.view<memory_type::pinned>().data_handle(),
-            buffer.view<memory_type::device>().data_handle());
+  execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& h) {
+      auto data = make_device_mdarray<int, std::uint32_t, layout_c_contiguous, depth, rows, cols>(
+        h, extents<std::uint32_t, depth, rows, cols>{});
+
+      auto buffer = mdbuffer(data);
+      EXPECT_FALSE(buffer.is_owning());
+      EXPECT_EQ(buffer.mem_type(), memory_type::device);
+      EXPECT_EQ(buffer.view<memory_type::device>().data_handle(), data.data_handle());
+
+      buffer = mdbuffer(data.view());
+      EXPECT_FALSE(buffer.is_owning());
+      EXPECT_EQ(buffer.mem_type(), memory_type::device);
+
+      auto original_data_handle = data.data_handle();
+      buffer                    = mdbuffer(std::move(data));
+      EXPECT_TRUE(buffer.is_owning());
+      EXPECT_EQ(buffer.mem_type(), memory_type::device);
+      EXPECT_EQ(buffer.view<memory_type::device>().data_handle(), original_data_handle);
+
+      auto buffer2 = mdbuffer(h, buffer);
+      EXPECT_FALSE(buffer2.is_owning());
+      EXPECT_EQ(buffer2.mem_type(), memory_type::device);
+
+      buffer2 = mdbuffer(h, buffer, memory_type::host);
+      EXPECT_TRUE(buffer2.is_owning());
+      EXPECT_EQ(buffer2.mem_type(), memory_type::host);
+
+      buffer2 = mdbuffer(h, buffer, memory_type::device);
+      EXPECT_FALSE(buffer2.is_owning());
+      EXPECT_EQ(buffer2.mem_type(), memory_type::device);
+
+      buffer2 = mdbuffer(h, buffer, memory_type::managed);
+      EXPECT_TRUE(buffer2.is_owning());
+      EXPECT_EQ(buffer2.mem_type(), memory_type::managed);
+
+      buffer2 = mdbuffer(h, buffer, memory_type::pinned);
+      EXPECT_TRUE(buffer2.is_owning());
+      EXPECT_EQ(buffer2.mem_type(), memory_type::pinned);
+    },
+    alloc_behavior::ARGUMENT_DRIVEN,
+    depth * rows * cols * sizeof(int));
 }
 
 TEST(MDBuffer, FromManaged)
diff --git a/cpp/tests/core/mdspan_copy.cu b/cpp/tests/core/mdspan_copy.cu
index e89680211f..bfaf022ac2 100644
--- a/cpp/tests/core/mdspan_copy.cu
+++ b/cpp/tests/core/mdspan_copy.cu
@@ -1,8 +1,9 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#include "../test_utils.cuh"
 #include "../test_utils.h"
 
 #include <raft/core/copy.cuh>
@@ -45,7 +46,10 @@ TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda)
   static_assert(
     detail::mdspan_copyable_with_kernel_v<decltype(out_long.view()), decltype(in_left.view())>,
     "Current implementation should use kernel for this copy");
-  copy(res, out_long.view(), in_left.view());
+  execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& r) { copy(r, out_long.view(), in_left.view()); },
+    alloc_behavior::NO_ALLOCATIONS);
   res.sync_stream();
   for (auto i = std::uint32_t{}; i < depth; ++i) {
     for (auto j = std::uint32_t{}; j < rows; ++j) {
@@ -64,7 +68,10 @@ TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda)
   static_assert(
     detail::mdspan_copyable_with_kernel_v<decltype(out_right.view()), decltype(in_left.view())>,
     "Current implementation should use kernel for this copy");
-  copy(res, out_right.view(), in_left.view());
+  execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& r) { copy(r, out_right.view(), in_left.view()); },
+    alloc_behavior::NO_ALLOCATIONS);
   res.sync_stream();
   for (auto i = std::uint32_t{}; i < depth; ++i) {
     for (auto j = std::uint32_t{}; j < rows; ++j) {
@@ -77,7 +84,10 @@ TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda)
   static_assert(
     detail::mdspan_copyable_with_kernel_v<decltype(out_left.view()), decltype(in_right.view())>,
     "Current implementation should use kernel for this copy");
-  copy(res, out_left.view(), in_right.view());
+  execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& r) { copy(r, out_left.view(), in_right.view()); },
+    alloc_behavior::NO_ALLOCATIONS);
   res.sync_stream();
   for (auto i = std::uint32_t{}; i < depth; ++i) {
     for (auto j = std::uint32_t{}; j < rows; ++j) {
@@ -130,7 +140,10 @@ TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda)
   static_assert(
     detail::mdspan_copyable_with_kernel_v<decltype(out_right.view()), decltype(in_left.view())>,
     "Current implementation should use kernel for this copy");
-  copy(res, out_right.view(), in_left.view());
+  execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& r) { copy(r, out_right.view(), in_left.view()); },
+    alloc_behavior::NO_ALLOCATIONS);
   res.sync_stream();
   for (auto i = std::uint32_t{}; i < rows; ++i) {
     for (auto j = std::uint32_t{}; j < cols; ++j) {
@@ -141,7 +154,10 @@ TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda)
   static_assert(
     detail::mdspan_copyable_with_kernel_v<decltype(out_left.view()), decltype(in_right.view())>,
     "Current implementation should use kernel for this copy");
-  copy(res, out_left.view(), in_right.view());
+  execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& r) { copy(r, out_left.view(), in_right.view()); },
+    alloc_behavior::NO_ALLOCATIONS);
   res.sync_stream();
   for (auto i = std::uint32_t{}; i < rows; ++i) {
     for (auto j = std::uint32_t{}; j < cols; ++j) {
@@ -180,7 +196,10 @@ TEST(MDSpanCopy, Mdspan2DDeviceDeviceCudaHalfWithTranspose)
   static_assert(
     detail::mdspan_copyable_with_kernel_v<decltype(out_right.view()), decltype(in_left.view())>,
     "Current implementation should use kernel for this copy");
-  copy(res, out_right.view(), in_left.view());
+  execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& r) { copy(r, out_right.view(), in_left.view()); },
+    alloc_behavior::NO_ALLOCATIONS);
   res.sync_stream();
   for (auto i = std::uint32_t{}; i < rows; ++i) {
     for (auto j = std::uint32_t{}; j < cols; ++j) {
diff --git a/cpp/tests/core/mdspan_utils.cu b/cpp/tests/core/mdspan_utils.cu
index b7c89fcdc7..776f9c3e34 100644
--- a/cpp/tests/core/mdspan_utils.cu
+++ b/cpp/tests/core/mdspan_utils.cu
@@ -1,8 +1,10 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#include "../test_utils.cuh"
+
 #include <raft/core/device_container_policy.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/host_container_policy.hpp>
@@ -110,43 +112,48 @@ TEST(MDArray, HostFlatten) { test_host_flatten(); }
 void test_device_flatten()
 {
   raft::resources handle;
-  // flatten 3d device mdspan
-  {
-    raft::resources handle;
-    using three_d_extents = extents<int, dynamic_extent, dynamic_extent, dynamic_extent>;
-    using three_d_mdarray = device_mdarray<int, three_d_extents>;
-
-    three_d_extents extents{3, 3, 3};
-    typename three_d_mdarray::mapping_type layout{extents};
-    typename three_d_mdarray::container_policy_type policy{};
-    three_d_mdarray mda{handle, layout, policy};
-
-    auto flat_view = flatten(mda);
-
-    static_assert(std::is_same_v<typename three_d_mdarray::layout_type,
-                                 typename decltype(flat_view)::layout_type>,
-                  "layouts not the same");
-
-    ASSERT_EQ(flat_view.extents().rank(), 1);
-    ASSERT_EQ(flat_view.size(), mda.size());
-  }
-
-  // flatten device vector
-  {
-    auto dv        = make_device_vector<int>(handle, 27);
-    auto flat_view = flatten(dv.view());
-
-    ASSERT_EQ(dv.extents().rank(), flat_view.extents().rank());
-    ASSERT_EQ(dv.extent(0), flat_view.extent(0));
-  }
-
-  // flatten device scalar
-  {
-    auto ds        = make_device_scalar<int>(handle, 27);
-    auto flat_view = flatten(ds.view());
-
-    ASSERT_EQ(flat_view.extent(0), 1);
-  }
+  execute_with_dry_run_check(
+    handle,
+    [&](raft::resources const& h) {
+      // flatten 3d device mdspan
+      {
+        using three_d_extents = extents<int, dynamic_extent, dynamic_extent, dynamic_extent>;
+        using three_d_mdarray = device_mdarray<int, three_d_extents>;
+
+        three_d_extents extents{3, 3, 3};
+        typename three_d_mdarray::mapping_type layout{extents};
+        typename three_d_mdarray::container_policy_type policy{};
+        three_d_mdarray mda{h, layout, policy};
+
+        auto flat_view = flatten(mda);
+
+        static_assert(std::is_same_v<typename three_d_mdarray::layout_type,
+                                     typename decltype(flat_view)::layout_type>,
+                      "layouts not the same");
+
+        ASSERT_EQ(flat_view.extents().rank(), 1);
+        ASSERT_EQ(flat_view.size(), mda.size());
+      }
+
+      // flatten device vector
+      {
+        auto dv        = make_device_vector<int>(h, 27);
+        auto flat_view = flatten(dv.view());
+
+        ASSERT_EQ(dv.extents().rank(), flat_view.extents().rank());
+        ASSERT_EQ(dv.extent(0), flat_view.extent(0));
+      }
+
+      // flatten device scalar
+      {
+        auto ds        = make_device_scalar<int>(h, 27);
+        auto flat_view = flatten(ds.view());
+
+        ASSERT_EQ(flat_view.extent(0), 1);
+      }
+    },
+    alloc_behavior::ARGUMENT_DRIVEN,
+    27 * sizeof(int));
 }
 
 TEST(MDArray, DeviceFlatten) { test_device_flatten(); }
@@ -172,21 +179,26 @@ void test_reshape()
 
   // reshape 4d device array to 2d
   {
-    raft::resources handle;
-    using four_d_extents =
-      extents<int, dynamic_extent, dynamic_extent, dynamic_extent, dynamic_extent>;
-    using four_d_mdarray = device_mdarray<int, four_d_extents>;
-
-    four_d_extents extents{2, 2, 2, 2};
-    typename four_d_mdarray::mapping_type layout{extents};
-    typename four_d_mdarray::container_policy_type policy{};
-    four_d_mdarray mda{handle, layout, policy};
-
-    auto matrix = reshape(mda, raft::extents<int, dynamic_extent, dynamic_extent>{4, 4});
-
-    ASSERT_EQ(matrix.extents().rank(), 2);
-    ASSERT_EQ(matrix.extent(0), 4);
-    ASSERT_EQ(matrix.extent(1), 4);
+    execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        using four_d_extents =
+          extents<int, dynamic_extent, dynamic_extent, dynamic_extent, dynamic_extent>;
+        using four_d_mdarray = device_mdarray<int, four_d_extents>;
+
+        four_d_extents extents{2, 2, 2, 2};
+        typename four_d_mdarray::mapping_type layout{extents};
+        typename four_d_mdarray::container_policy_type policy{};
+        four_d_mdarray mda{h, layout, policy};
+
+        auto matrix = reshape(mda, raft::extents<int, dynamic_extent, dynamic_extent>{4, 4});
+
+        ASSERT_EQ(matrix.extents().rank(), 2);
+        ASSERT_EQ(matrix.extent(0), 4);
+        ASSERT_EQ(matrix.extent(1), 4);
+      },
+      alloc_behavior::ARGUMENT_DRIVEN,
+      16 * sizeof(int));
   }
 
   // reshape 2d host matrix with static extents to vector
diff --git a/cpp/tests/core/sparse_matrix.cu b/cpp/tests/core/sparse_matrix.cu
index 8ac7ddb41a..d378824c1d 100644
--- a/cpp/tests/core/sparse_matrix.cu
+++ b/cpp/tests/core/sparse_matrix.cu
@@ -1,7 +1,9 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "../test_utils.cuh"
+
 #include <raft/core/device_coo_matrix.hpp>
 #include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/resources.hpp>
@@ -69,82 +71,99 @@ void test_device_coo_sparsity_preserving_ref(S& mat, void* d)
 void test_device_coo_matrix()
 {
   raft::resources handle;
-  auto sparsity_owning = raft::make_device_coo_matrix<float, int, int, int>(handle, 5, 5);
+  execute_with_dry_run_check(
+    handle,
+    [&](raft::resources const& h) {
+      auto sparsity_owning = raft::make_device_coo_matrix<float, int, int, int>(h, 5, 5);
 
-  auto structure_view = sparsity_owning.structure_view();
+      auto structure_view = sparsity_owning.structure_view();
 
-  ASSERT_EQ(structure_view.get_n_cols(), 5);
-  ASSERT_EQ(structure_view.get_n_rows(), 5);
-  ASSERT_EQ(structure_view.get_nnz(), 0);
+      ASSERT_EQ(structure_view.get_n_cols(), 5);
+      ASSERT_EQ(structure_view.get_n_rows(), 5);
+      ASSERT_EQ(structure_view.get_nnz(), 0);
 
-  auto coord_struct = raft::make_device_coordinate_structure(handle, 5, 5, 5);
-  auto sparsity_preserving =
-    raft::make_device_coo_matrix<float, int, int>(handle, coord_struct.view());
+      auto coord_struct = raft::make_device_coordinate_structure(h, 5, 5, 5);
+      auto sparsity_preserving =
+        raft::make_device_coo_matrix<float, int, int>(h, coord_struct.view());
 
-  sparsity_owning.initialize_sparsity(5);
+      sparsity_owning.initialize_sparsity(5);
 
-  auto structure_view2 = sparsity_owning.structure_view();
+      auto structure_view2 = sparsity_owning.structure_view();
 
-  ASSERT_EQ(structure_view2.get_n_cols(), 5);
-  ASSERT_EQ(structure_view2.get_n_rows(), 5);
-  ASSERT_EQ(structure_view2.get_nnz(), 5);
+      ASSERT_EQ(structure_view2.get_n_cols(), 5);
+      ASSERT_EQ(structure_view2.get_n_rows(), 5);
+      ASSERT_EQ(structure_view2.get_nnz(), 5);
 
-  void* d_owning     = static_cast<void*>(sparsity_owning.get_elements().data());
-  void* d_preserving = static_cast<void*>(sparsity_preserving.get_elements().data());
+      void* d_owning     = static_cast<void*>(sparsity_owning.get_elements().data());
+      void* d_preserving = static_cast<void*>(sparsity_preserving.get_elements().data());
 
-  test_device_coo_owning_ref(sparsity_owning, d_owning);
-  test_device_coo_owning_ref(sparsity_preserving, d_preserving);
+      test_device_coo_owning_ref(sparsity_owning, d_owning);
+      test_device_coo_owning_ref(sparsity_preserving, d_preserving);
 
-  test_device_coo_sparsity_owning_ref(sparsity_owning, d_owning);
-  test_device_coo_sparsity_preserving_ref(sparsity_preserving, d_preserving);
+      test_device_coo_sparsity_owning_ref(sparsity_owning, d_owning);
+      test_device_coo_sparsity_preserving_ref(sparsity_preserving, d_preserving);
+    },
+    alloc_behavior::ARGUMENT_DRIVEN,
+    4 * 5 * sizeof(int) + 2 * 5 * sizeof(float));
 }
 
 void test_device_csr_matrix()
 {
   raft::resources handle;
-  auto sparsity_owning = raft::make_device_csr_matrix<float, int, int, int>(handle, 5, 5);
+  execute_with_dry_run_check(
+    handle,
+    [&](raft::resources const& h) {
+      auto sparsity_owning = raft::make_device_csr_matrix<float, int, int, int>(h, 5, 5);
 
-  auto comp_struct = raft::make_device_compressed_structure(handle, 5, 5, 5);
-  auto sparsity_preserving =
-    raft::make_device_csr_matrix<float, int, int>(handle, comp_struct.view());
+      auto comp_struct = raft::make_device_compressed_structure(h, 5, 5, 5);
+      auto sparsity_preserving =
+        raft::make_device_csr_matrix<float, int, int>(h, comp_struct.view());
 
-  auto structure_view = sparsity_owning.structure_view();
+      auto structure_view = sparsity_owning.structure_view();
 
-  ASSERT_EQ(structure_view.get_n_cols(), 5);
-  ASSERT_EQ(structure_view.get_n_rows(), 5);
-  ASSERT_EQ(structure_view.get_nnz(), 0);
+      ASSERT_EQ(structure_view.get_n_cols(), 5);
+      ASSERT_EQ(structure_view.get_n_rows(), 5);
+      ASSERT_EQ(structure_view.get_nnz(), 0);
 
-  sparsity_owning.initialize_sparsity(5);
+      sparsity_owning.initialize_sparsity(5);
 
-  auto structure_view2 = sparsity_owning.structure_view();
+      auto structure_view2 = sparsity_owning.structure_view();
 
-  ASSERT_EQ(structure_view2.get_n_cols(), 5);
-  ASSERT_EQ(structure_view2.get_n_rows(), 5);
-  ASSERT_EQ(structure_view2.get_nnz(), 5);
+      ASSERT_EQ(structure_view2.get_n_cols(), 5);
+      ASSERT_EQ(structure_view2.get_n_rows(), 5);
+      ASSERT_EQ(structure_view2.get_nnz(), 5);
 
-  void* d_owning     = static_cast<void*>(sparsity_owning.get_elements().data());
-  void* d_preserving = static_cast<void*>(sparsity_preserving.get_elements().data());
+      void* d_owning     = static_cast<void*>(sparsity_owning.get_elements().data());
+      void* d_preserving = static_cast<void*>(sparsity_preserving.get_elements().data());
 
-  test_device_csr_owning_ref(sparsity_owning, d_owning);
-  test_device_csr_owning_ref(sparsity_preserving, d_preserving);
+      test_device_csr_owning_ref(sparsity_owning, d_owning);
+      test_device_csr_owning_ref(sparsity_preserving, d_preserving);
 
-  test_device_csr_sparsity_owning_ref(sparsity_owning, d_owning);
-  test_device_csr_sparsity_preserving_ref(sparsity_preserving, d_preserving);
+      test_device_csr_sparsity_owning_ref(sparsity_owning, d_owning);
+      test_device_csr_sparsity_preserving_ref(sparsity_preserving, d_preserving);
+    },
+    alloc_behavior::ARGUMENT_DRIVEN,
+    2 * (5 + 1) * sizeof(int) + 2 * 5 * sizeof(int) + 2 * 5 * sizeof(float));
 }
 
 TEST(DeviceCoordinateStructure, Initialization)
 {
   raft::resources handle;
-
-  auto uninitialized = raft::make_device_coordinate_structure(handle, 5, 5, 0);
-  // Note: the behaviour of calling `view` on an uninitialized structure is
-  // undefined, this is testing an implementation detail.
-  EXPECT_EQ(uninitialized.view().get_rows().size(), 0);
-  EXPECT_EQ(uninitialized.view().get_rows().data(), nullptr);
-
-  auto initialized = raft::make_device_coordinate_structure(handle, 5, 5, 5);
-  EXPECT_EQ(initialized.view().get_rows().size(), 5);
-  EXPECT_NE(initialized.view().get_rows().data(), nullptr);
+  execute_with_dry_run_check(
+    handle,
+    [&](raft::resources const& h) {
+      auto uninitialized = raft::make_device_coordinate_structure(h, 5, 5, 0);
+      // Note: the behaviour of calling `view` on an uninitialized structure is
+      // undefined, this is testing an implementation detail.
+      EXPECT_EQ(uninitialized.view().get_rows().size(), 0);
+      EXPECT_EQ(uninitialized.view().get_rows().data(), nullptr);
+
+      auto initialized = raft::make_device_coordinate_structure(h, 5, 5, 5);
+      EXPECT_EQ(initialized.view().get_rows().size(), 5);
+      EXPECT_NE(initialized.view().get_rows().data(), nullptr);
+    },
+    alloc_behavior::ARGUMENT_DRIVEN,
+    2 * 5 * sizeof(int));
 }
 
 TEST(DeviceSparseCOOMatrix, Basic) { test_device_coo_matrix(); }
diff --git a/cpp/tests/core/temporary_device_buffer.cu b/cpp/tests/core/temporary_device_buffer.cu
index 89c299fe32..3cd705d8be 100644
--- a/cpp/tests/core/temporary_device_buffer.cu
+++ b/cpp/tests/core/temporary_device_buffer.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -54,18 +54,24 @@ TEST(TemporaryDeviceBuffer, HostPointerWithWriteBack)
   rmm::device_uvector<int> result(5, resource::get_cuda_stream(handle));
 
   {
-    auto d_buf  = raft::make_writeback_temporary_device_buffer(handle, array.data_handle(), exts);
-    auto d_view = d_buf.view();
-
-    thrust::fill(rmm::exec_policy(resource::get_cuda_stream(handle)),
-                 d_view.data_handle(),
-                 d_view.data_handle() + d_view.extent(0),
-                 10);
-    raft::copy(
-      result.data(), d_view.data_handle(), d_view.extent(0), resource::get_cuda_stream(handle));
-
-    static_assert(!std::is_const_v<typename decltype(d_buf.view())::element_type>,
-                  "element_type should not be const");
+    execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        auto d_buf  = raft::make_writeback_temporary_device_buffer(h, array.data_handle(), exts);
+        auto d_view = d_buf.view();
+
+        thrust::fill(rmm::exec_policy(resource::get_cuda_stream(h)),
+                     d_view.data_handle(),
+                     d_view.data_handle() + d_view.extent(0),
+                     10);
+        raft::copy(
+          result.data(), d_view.data_handle(), d_view.extent(0), resource::get_cuda_stream(h));
+
+        static_assert(!std::is_const_v<typename decltype(d_buf.view())::element_type>,
+                      "element_type should not be const");
+      },
+      alloc_behavior::ARGUMENT_DRIVEN,
+      5 * sizeof(int));
   }
 
   ASSERT_TRUE(raft::devArrMatchHost(array.data_handle(),
diff --git a/cpp/tests/linalg/add.cu b/cpp/tests/linalg/add.cu
index 04279a6cbe..490a85126f 100644
--- a/cpp/tests/linalg/add.cu
+++ b/cpp/tests/linalg/add.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -43,8 +43,10 @@ class AddTest : public ::testing::TestWithParam<AddInputs<InT, OutT>> {
     auto in1_view = raft::make_device_vector_view<const InT>(in1.data(), in1.size());
     auto in2_view = raft::make_device_vector_view<const InT>(in2.data(), in2.size());
 
-    add(handle, in1_view, in2_view, out_view);
-    resource::sync_stream(handle, stream);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) { add(h, in1_view, in2_view, out_view); },
+      raft::alloc_behavior::NO_ALLOCATIONS);
   }
 
   void compare()
diff --git a/cpp/tests/linalg/axpy.cu b/cpp/tests/linalg/axpy.cu
index 5f0ac772b4..b699c68493 100644
--- a/cpp/tests/linalg/axpy.cu
+++ b/cpp/tests/linalg/axpy.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #include "../test_utils.cuh"
@@ -88,53 +88,58 @@ class AxpyTest : public ::testing::TestWithParam<AxpyInputs<T>> {
     rmm::device_scalar<T> device_alpha(params.alpha, stream);
     auto device_alpha_view = make_device_scalar_view<const T>(device_alpha.data());
 
-    if ((params.incx > 1) && (params.incy > 1)) {
-      auto x_view = make_device_vector_view<const T, IndexType, layout_stride>(
-        x.data(), make_vector_strided_layout<IndexType>(params.len, params.incx));
-      axpy(handle,
-           host_alpha_view,
-           x_view,
-           make_device_vector_view<T, IndexType, layout_stride>(
-             y_host_alpha.data(), make_vector_strided_layout(params.len, params.incy)));
-      axpy(handle,
-           device_alpha_view,
-           x_view,
-           make_device_vector_view<T, IndexType, layout_stride>(
-             y_device_alpha.data(), make_vector_strided_layout(params.len, params.incy)));
-    } else if (params.incx > 1) {
-      auto x_view = make_device_vector_view<const T, IndexType, layout_stride>(
-        x.data(), make_vector_strided_layout<IndexType>(params.len, params.incx));
-      axpy(handle,
-           host_alpha_view,
-           x_view,
-           make_device_vector_view<T>(y_host_alpha.data(), params.len));
-      axpy(handle,
-           device_alpha_view,
-           x_view,
-           make_device_vector_view<T>(y_device_alpha.data(), params.len));
-    } else if (params.incy > 1) {
-      auto x_view = make_device_vector_view<const T>(x.data(), params.len);
-      axpy(handle,
-           host_alpha_view,
-           x_view,
-           make_device_vector_view<T, IndexType, layout_stride>(
-             y_host_alpha.data(), make_vector_strided_layout(params.len, params.incy)));
-      axpy(handle,
-           device_alpha_view,
-           x_view,
-           make_device_vector_view<T, IndexType, layout_stride>(
-             y_device_alpha.data(), make_vector_strided_layout(params.len, params.incy)));
-    } else {
-      auto x_view = make_device_vector_view<const T>(x.data(), params.len);
-      axpy(handle,
-           host_alpha_view,
-           x_view,
-           make_device_vector_view<T>(y_host_alpha.data(), params.len));
-      axpy(handle,
-           device_alpha_view,
-           x_view,
-           make_device_vector_view<T>(y_device_alpha.data(), params.len));
-    }
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if ((params.incx > 1) && (params.incy > 1)) {
+          auto x_view = make_device_vector_view<const T, IndexType, layout_stride>(
+            x.data(), make_vector_strided_layout<IndexType>(params.len, params.incx));
+          axpy(h,
+               host_alpha_view,
+               x_view,
+               make_device_vector_view<T, IndexType, layout_stride>(
+                 y_host_alpha.data(), make_vector_strided_layout(params.len, params.incy)));
+          axpy(h,
+               device_alpha_view,
+               x_view,
+               make_device_vector_view<T, IndexType, layout_stride>(
+                 y_device_alpha.data(), make_vector_strided_layout(params.len, params.incy)));
+        } else if (params.incx > 1) {
+          auto x_view = make_device_vector_view<const T, IndexType, layout_stride>(
+            x.data(), make_vector_strided_layout<IndexType>(params.len, params.incx));
+          axpy(h,
+               host_alpha_view,
+               x_view,
+               make_device_vector_view<T>(y_host_alpha.data(), params.len));
+          axpy(h,
+               device_alpha_view,
+               x_view,
+               make_device_vector_view<T>(y_device_alpha.data(), params.len));
+        } else if (params.incy > 1) {
+          auto x_view = make_device_vector_view<const T>(x.data(), params.len);
+          axpy(h,
+               host_alpha_view,
+               x_view,
+               make_device_vector_view<T, IndexType, layout_stride>(
+                 y_host_alpha.data(), make_vector_strided_layout(params.len, params.incy)));
+          axpy(h,
+               device_alpha_view,
+               x_view,
+               make_device_vector_view<T, IndexType, layout_stride>(
+                 y_device_alpha.data(), make_vector_strided_layout(params.len, params.incy)));
+        } else {
+          auto x_view = make_device_vector_view<const T>(x.data(), params.len);
+          axpy(h,
+               host_alpha_view,
+               x_view,
+               make_device_vector_view<T>(y_host_alpha.data(), params.len));
+          axpy(h,
+               device_alpha_view,
+               x_view,
+               make_device_vector_view<T>(y_device_alpha.data(), params.len));
+        }
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
 
     resource::sync_stream(handle);
   }
diff --git a/cpp/tests/linalg/binary_op.cu b/cpp/tests/linalg/binary_op.cu
index bcd3af8548..7596e324a9 100644
--- a/cpp/tests/linalg/binary_op.cu
+++ b/cpp/tests/linalg/binary_op.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -54,7 +54,10 @@ class BinaryOpTest : public ::testing::TestWithParam<BinaryOpInputs<InType, IdxT
     uniform(handle, r, in1.data(), len, InType(-1.0), InType(1.0));
     uniform(handle, r, in2.data(), len, InType(-1.0), InType(1.0));
     naiveAdd(out_ref.data(), in1.data(), in2.data(), len);
-    binaryOpLaunch(handle, out.data(), in1.data(), in2.data(), len);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) { binaryOpLaunch(h, out.data(), in1.data(), in2.data(), len); },
+      raft::alloc_behavior::NO_ALLOCATIONS);
     resource::sync_stream(handle, stream);
   }
 
diff --git a/cpp/tests/linalg/coalesced_reduction.cu b/cpp/tests/linalg/coalesced_reduction.cu
index 09e6283c08..0ad8fe4fa5 100644
--- a/cpp/tests/linalg/coalesced_reduction.cu
+++ b/cpp/tests/linalg/coalesced_reduction.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -87,8 +87,13 @@ class coalescedReductionTest : public ::testing::TestWithParam<coalescedReductio
                             raft::add_op{},
                             raft::identity_op{});
 
-    coalescedReductionLaunch(handle, dots_act.data(), data.data(), cols, rows);
-    coalescedReductionLaunch(handle, dots_act.data(), data.data(), cols, rows, true);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        coalescedReductionLaunch(h, dots_act.data(), data.data(), cols, rows);
+        coalescedReductionLaunch(h, dots_act.data(), data.data(), cols, rows, true);
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN);
 
     resource::sync_stream(handle, stream);
   }
@@ -120,7 +125,8 @@ const std::vector<coalescedReductionInputs<float>> inputsf = {{0.000002f, 50, 2,
                                                               {0.000002f, 10000, 55, 1234ULL},
                                                               {0.000002f, 10000, 100, 1234ULL},
                                                               {0.000002f, 10000, 270, 1234ULL},
-                                                              {0.0001f, 10, 25000, 1234ULL}};
+                                                              {0.0001f, 10, 25000, 1234ULL},
+                                                              {0.0001f, 2, 200000, 1234ULL}};
 
 const std::vector<coalescedReductionInputs<double>> inputsd = {{0.000000001, 50, 2, 1234ULL},
                                                                {0.000000001, 50, 3, 1234ULL},
@@ -136,7 +142,8 @@ const std::vector<coalescedReductionInputs<double>> inputsd = {{0.000000001, 50,
                                                                {0.000000001, 10000, 55, 1234ULL},
                                                                {0.000000001, 10000, 100, 1234ULL},
                                                                {0.000000001, 10000, 270, 1234ULL},
-                                                               {0.0000001, 10, 25000, 1234ULL}};
+                                                               {0.0000001, 10, 25000, 1234ULL},
+                                                               {0.0000001, 2, 200000, 1234ULL}};
 
 typedef coalescedReductionTest<float> coalescedReductionTestF;
 TEST_P(coalescedReductionTestF, Result)
diff --git a/cpp/tests/linalg/divide.cu b/cpp/tests/linalg/divide.cu
index c081698b20..fb001a4e76 100644
--- a/cpp/tests/linalg/divide.cu
+++ b/cpp/tests/linalg/divide.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -54,7 +54,10 @@ class DivideTest : public ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T
     auto out_view    = raft::make_device_vector_view(out.data(), len);
     auto in_view     = raft::make_device_vector_view<const T>(in.data(), len);
     auto scalar_view = raft::make_host_scalar_view<const T>(&params.scalar);
-    divide_scalar(handle, in_view, out_view, scalar_view);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) { divide_scalar(h, in_view, out_view, scalar_view); },
+      raft::alloc_behavior::NO_ALLOCATIONS);
     resource::sync_stream(handle, stream);
   }
 
diff --git a/cpp/tests/linalg/dot.cu b/cpp/tests/linalg/dot.cu
index 53e6d4cb3c..41bdba9dde 100644
--- a/cpp/tests/linalg/dot.cu
+++ b/cpp/tests/linalg/dot.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #include "../test_utils.cuh"
@@ -72,31 +72,36 @@ class DotTest : public ::testing::TestWithParam<DotInputs<T>> {
     auto device_out_view = make_device_scalar_view<T, IndexType>(out.data());
     auto host_out_view   = make_host_scalar_view<T, IndexType>(&host_output);
 
-    if ((params.incx > 1) && (params.incy > 1)) {
-      auto x_view = make_device_vector_view<const T, IndexType, layout_stride>(
-        x.data(), make_vector_strided_layout(params.len, params.incx));
-      auto y_view = make_device_vector_view<const T, IndexType, layout_stride>(
-        y.data(), make_vector_strided_layout(params.len, params.incy));
-      dot(handle, x_view, y_view, device_out_view);
-      dot(handle, x_view, y_view, host_out_view);
-    } else if (params.incx > 1) {
-      auto x_view = make_device_vector_view<const T, IndexType, layout_stride>(
-        x.data(), make_vector_strided_layout(params.len, params.incx));
-      auto y_view = make_device_vector_view<const T>(y.data(), params.len);
-      dot(handle, x_view, y_view, device_out_view);
-      dot(handle, x_view, y_view, host_out_view);
-    } else if (params.incy > 1) {
-      auto x_view = make_device_vector_view<const T>(x.data(), params.len);
-      auto y_view = make_device_vector_view<const T, IndexType, layout_stride>(
-        y.data(), make_vector_strided_layout(params.len, params.incy));
-      dot(handle, x_view, y_view, device_out_view);
-      dot(handle, x_view, y_view, host_out_view);
-    } else {
-      auto x_view = make_device_vector_view<const T>(x.data(), params.len);
-      auto y_view = make_device_vector_view<const T>(y.data(), params.len);
-      dot(handle, x_view, y_view, device_out_view);
-      dot(handle, x_view, y_view, host_out_view);
-    }
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if ((params.incx > 1) && (params.incy > 1)) {
+          auto x_view = make_device_vector_view<const T, IndexType, layout_stride>(
+            x.data(), make_vector_strided_layout(params.len, params.incx));
+          auto y_view = make_device_vector_view<const T, IndexType, layout_stride>(
+            y.data(), make_vector_strided_layout(params.len, params.incy));
+          dot(h, x_view, y_view, device_out_view);
+          dot(h, x_view, y_view, host_out_view);
+        } else if (params.incx > 1) {
+          auto x_view = make_device_vector_view<const T, IndexType, layout_stride>(
+            x.data(), make_vector_strided_layout(params.len, params.incx));
+          auto y_view = make_device_vector_view<const T>(y.data(), params.len);
+          dot(h, x_view, y_view, device_out_view);
+          dot(h, x_view, y_view, host_out_view);
+        } else if (params.incy > 1) {
+          auto x_view = make_device_vector_view<const T>(x.data(), params.len);
+          auto y_view = make_device_vector_view<const T, IndexType, layout_stride>(
+            y.data(), make_vector_strided_layout(params.len, params.incy));
+          dot(h, x_view, y_view, device_out_view);
+          dot(h, x_view, y_view, host_out_view);
+        } else {
+          auto x_view = make_device_vector_view<const T>(x.data(), params.len);
+          auto y_view = make_device_vector_view<const T>(y.data(), params.len);
+          dot(h, x_view, y_view, device_out_view);
+          dot(h, x_view, y_view, host_out_view);
+        }
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
     raft::update_host(&device_output, out.data(), 1, stream);
     resource::sync_stream(handle);
   }
diff --git a/cpp/tests/linalg/eig.cu b/cpp/tests/linalg/eig.cu
index c51560dbf0..a0ad7e63e2 100644
--- a/cpp/tests/linalg/eig.cu
+++ b/cpp/tests/linalg/eig.cu
@@ -97,11 +97,23 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
     auto eig_vals_jacobi_view =
       raft::make_device_vector_view<T, std::uint32_t>(eig_vals_jacobi.data(), params.n_row);
 
-    eig_dc(handle, cov_matrix_view, eig_vectors_view, eig_vals_view);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        eig_dc(h, cov_matrix_view, eig_vectors_view, eig_vals_view);
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      sizeof(int));
 
     T tol      = 1.e-7;
     int sweeps = 15;
-    eig_jacobi(handle, cov_matrix_view, eig_vectors_jacobi_view, eig_vals_jacobi_view, tol, sweeps);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        eig_jacobi(h, cov_matrix_view, eig_vectors_jacobi_view, eig_vals_jacobi_view, tol, sweeps);
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      sizeof(int));
 
     // test code for comparing two methods
     len = params.n * params.n;
@@ -158,10 +170,16 @@ TEST(Raft, EigStream)
   auto eig_vals_stream = raft::make_device_vector<float, std::uint32_t>(handle, n_rows);
   uniform(handle, r, cov_matrix_stream.data_handle(), n_rows * n_rows, float(-1.0), float(1.0));
 
-  raft::linalg::eig_dc(handle,
-                       raft::make_const_mdspan(cov_matrix_stream.view()),
-                       eig_vectors_stream.view(),
-                       eig_vals_stream.view());
+  raft::execute_with_dry_run_check(
+    handle,
+    [&](raft::resources const& h) {
+      raft::linalg::eig_dc(h,
+                           raft::make_const_mdspan(cov_matrix_stream.view()),
+                           eig_vectors_stream.view(),
+                           eig_vals_stream.view());
+    },
+    raft::alloc_behavior::ARGUMENT_DRIVEN,
+    sizeof(int));
   raft::resource::sync_stream(handle, raft::resource::get_cuda_stream(handle));
 }
 
diff --git a/cpp/tests/linalg/gemm_basic.cpp b/cpp/tests/linalg/gemm_basic.cpp
index 6ab669c1d0..03db2da0ea 100644
--- a/cpp/tests/linalg/gemm_basic.cpp
+++ b/cpp/tests/linalg/gemm_basic.cpp
@@ -92,8 +92,8 @@ void test_gemm_pointer_mode_host(bool use_alpha, bool use_beta)
   raft::copy(c_device.data_handle(), c_host.data(), c_host.size(), stream);
 
   // Create scalar views for alpha and beta
-  auto alpha_scalar = raft::make_host_scalar(alpha_val);
-  auto beta_scalar  = raft::make_host_scalar(beta_val);
+  auto alpha_scalar = raft::make_host_scalar(res, alpha_val);
+  auto beta_scalar  = raft::make_host_scalar(res, beta_val);
 
   // Perform GEMM: C = alpha * A * B + beta * C
   raft::linalg::gemm(res,
diff --git a/cpp/tests/linalg/gemm_layout.cu b/cpp/tests/linalg/gemm_layout.cu
index 638b5921e3..6f37f5ee1f 100644
--- a/cpp/tests/linalg/gemm_layout.cu
+++ b/cpp/tests/linalg/gemm_layout.cu
@@ -97,23 +97,28 @@ class GemmLayoutTest : public ::testing::TestWithParam<GemmLayoutInputs<T>> {
     auto z_view_col_major =
       raft::make_device_matrix_view<T, int, raft::col_major>(Z, params.M, params.N);
 
-    if (params.xLayout && params.yLayout && params.zLayout) {
-      gemm(handle, x_view_col_major, y_view_col_major, z_view_col_major);
-    } else if (params.xLayout && params.yLayout && !params.zLayout) {
-      gemm(handle, x_view_col_major, y_view_col_major, z_view_row_major);
-    } else if (params.xLayout && !params.yLayout && params.zLayout) {
-      gemm(handle, x_view_col_major, y_view_row_major, z_view_col_major);
-    } else if (!params.xLayout && params.yLayout && params.zLayout) {
-      gemm(handle, x_view_row_major, y_view_col_major, z_view_col_major);
-    } else if (params.xLayout && !params.yLayout && !params.zLayout) {
-      gemm(handle, x_view_col_major, y_view_row_major, z_view_row_major);
-    } else if (!params.xLayout && params.yLayout && !params.zLayout) {
-      gemm(handle, x_view_row_major, y_view_col_major, z_view_row_major);
-    } else if (!params.xLayout && !params.yLayout && params.zLayout) {
-      gemm(handle, x_view_row_major, y_view_row_major, z_view_col_major);
-    } else if (!params.xLayout && !params.yLayout && !params.zLayout) {
-      gemm(handle, x_view_row_major, y_view_row_major, z_view_row_major);
-    }
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if (params.xLayout && params.yLayout && params.zLayout) {
+          gemm(h, x_view_col_major, y_view_col_major, z_view_col_major);
+        } else if (params.xLayout && params.yLayout && !params.zLayout) {
+          gemm(h, x_view_col_major, y_view_col_major, z_view_row_major);
+        } else if (params.xLayout && !params.yLayout && params.zLayout) {
+          gemm(h, x_view_col_major, y_view_row_major, z_view_col_major);
+        } else if (!params.xLayout && params.yLayout && params.zLayout) {
+          gemm(h, x_view_row_major, y_view_col_major, z_view_col_major);
+        } else if (params.xLayout && !params.yLayout && !params.zLayout) {
+          gemm(h, x_view_col_major, y_view_row_major, z_view_row_major);
+        } else if (!params.xLayout && params.yLayout && !params.zLayout) {
+          gemm(h, x_view_row_major, y_view_col_major, z_view_row_major);
+        } else if (!params.xLayout && !params.yLayout && params.zLayout) {
+          gemm(h, x_view_row_major, y_view_row_major, z_view_col_major);
+        } else if (!params.xLayout && !params.yLayout && !params.zLayout) {
+          gemm(h, x_view_row_major, y_view_row_major, z_view_row_major);
+        }
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
 
     resource::sync_stream(handle);
 
diff --git a/cpp/tests/linalg/map.cu b/cpp/tests/linalg/map.cu
index 0cd4434e2f..12d1df1140 100644
--- a/cpp/tests/linalg/map.cu
+++ b/cpp/tests/linalg/map.cu
@@ -108,16 +108,14 @@ struct KVPAddOp {
 };
 
 template <typename InType, typename IdxType, typename OutType>
-void mapLaunch(OutType* out,
+void mapLaunch(const raft::resources& handle,
+               OutType* out,
                const InType* in1,
                const InType* in2,
                const InType* in3,
                InType scalar,
-               IdxType len,
-               cudaStream_t stream)
+               IdxType len)
 {
-  raft::resources handle;
-  resource::set_cuda_stream(handle, stream);
   auto out_view = raft::make_device_vector_view(out, len);
   auto in1_view = raft::make_device_vector_view(in1, len);
   auto in2_view = raft::make_device_vector_view(in2, len);
@@ -289,7 +287,12 @@ class MapTest : public ::testing::TestWithParam<MapInputs<InType, IdxType, OutTy
     }
 
     create_ref(out_ref.data(), in1.data(), in2.data(), in3.data(), params.scalar, len, stream);
-    mapLaunch(out.data(), in1.data(), in2.data(), in3.data(), params.scalar, len, stream);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        mapLaunch(h, out.data(), in1.data(), in2.data(), in3.data(), params.scalar, len);
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
     RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
   }
 
diff --git a/cpp/tests/linalg/matrix_vector_op.cu b/cpp/tests/linalg/matrix_vector_op.cu
index 39bb0ce051..87c27c54a0 100644
--- a/cpp/tests/linalg/matrix_vector_op.cu
+++ b/cpp/tests/linalg/matrix_vector_op.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -161,15 +161,20 @@ class MatVecOpTest : public ::testing::TestWithParam<MatVecOpInputs<IdxType>> {
                   OpT{},
                   stream);
     }
-    matrixVectorOpLaunch<OpT>(handle,
-                              out_ptr,
-                              in_ptr,
-                              vec1.data(),
-                              vec2.data(),
-                              params.cols,
-                              params.rows,
-                              params.rowMajor,
-                              params.bcastAlongRows);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        matrixVectorOpLaunch<OpT>(h,
+                                  out_ptr,
+                                  in_ptr,
+                                  vec1.data(),
+                                  vec2.data(),
+                                  params.cols,
+                                  params.rows,
+                                  params.rowMajor,
+                                  params.bcastAlongRows);
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
     resource::sync_stream(handle);
   }
 
diff --git a/cpp/tests/linalg/mean_squared_error.cu b/cpp/tests/linalg/mean_squared_error.cu
index 17c7105416..38dc53354b 100644
--- a/cpp/tests/linalg/mean_squared_error.cu
+++ b/cpp/tests/linalg/mean_squared_error.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #include "../test_utils.cuh"
@@ -69,11 +69,17 @@ class MeanSquaredErrorTest : public ::testing::TestWithParam<MeanSquaredErrorInp
     uniform(handle, r, b.data(), params.len, T(-1.0), T(1.0));
     resource::sync_stream(handle);
 
-    mean_squared_error<T, std::uint32_t, T>(handle,
-                                            make_device_vector_view<const T>(a.data(), params.len),
-                                            make_device_vector_view<const T>(b.data(), params.len),
-                                            make_device_scalar_view<T>(output.data()),
-                                            params.weight);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        mean_squared_error<T, std::uint32_t, T>(
+          h,
+          make_device_vector_view<const T>(a.data(), params.len),
+          make_device_vector_view<const T>(b.data(), params.len),
+          make_device_scalar_view<T>(output.data()),
+          params.weight);
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
 
     naiveMeanSquaredError<<<256, 256, 0, stream>>>(
       params.len, a.data(), b.data(), params.weight, refoutput.data());
diff --git a/cpp/tests/linalg/multiply.cu b/cpp/tests/linalg/multiply.cu
index eab5bbbab7..ae5a83aae0 100644
--- a/cpp/tests/linalg/multiply.cu
+++ b/cpp/tests/linalg/multiply.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -39,8 +39,13 @@ class MultiplyTest : public ::testing::TestWithParam<UnaryOpInputs<T>> {
     auto out_view    = raft::make_device_vector_view(out.data(), len);
     auto in_view     = raft::make_device_vector_view<const T>(in.data(), len);
     auto scalar_view = raft::make_host_scalar_view<const T>(&params.scalar);
-    multiply_scalar(handle, in_view, out_view, scalar_view);
+    // Go through the dry-run check; NO_ALLOCATIONS presumably means none are expected.
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) { multiply_scalar(h, in_view, out_view, scalar_view); },
+      raft::alloc_behavior::NO_ALLOCATIONS);
+    // Block until the work enqueued on the stream above has completed.
     resource::sync_stream(handle, stream);
   }
 
  protected:
diff --git a/cpp/tests/linalg/norm.cu b/cpp/tests/linalg/norm.cu
index 6abe8aca92..9ef90d585b 100644
--- a/cpp/tests/linalg/norm.cu
+++ b/cpp/tests/linalg/norm.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -99,35 +99,40 @@ class RowNormTest : public ::testing::TestWithParam<NormInputs<OutT, IdxT>> {
       data.data(), params.rows, params.cols);
     auto input_col_major = raft::make_device_matrix_view<const T, IdxT, raft::col_major>(
       data.data(), params.rows, params.cols);
-    if (params.do_sqrt) {
-      if (params.rowMajor) {
-        if (params.type == L2Norm) {
-          norm<L2Norm, Apply::ALONG_ROWS>(handle, input_row_major, output_view, raft::sqrt_op{});
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if (params.do_sqrt) {
+          if (params.rowMajor) {
+            if (params.type == L2Norm) {
+              norm<L2Norm, Apply::ALONG_ROWS>(h, input_row_major, output_view, raft::sqrt_op{});
+            } else {
+              norm<L1Norm, Apply::ALONG_ROWS>(h, input_row_major, output_view, raft::sqrt_op{});
+            }
+          } else {
+            if (params.type == L2Norm) {
+              norm<L2Norm, Apply::ALONG_ROWS>(h, input_col_major, output_view, raft::sqrt_op{});
+            } else {
+              norm<L1Norm, Apply::ALONG_ROWS>(h, input_col_major, output_view, raft::sqrt_op{});
+            }
+          }
         } else {
-          norm<L1Norm, Apply::ALONG_ROWS>(handle, input_row_major, output_view, raft::sqrt_op{});
+          if (params.rowMajor) {
+            if (params.type == L2Norm) {
+              norm<L2Norm, Apply::ALONG_ROWS>(h, input_row_major, output_view);
+            } else {
+              norm<L1Norm, Apply::ALONG_ROWS>(h, input_row_major, output_view);
+            }
+          } else {
+            if (params.type == L2Norm) {
+              norm<L2Norm, Apply::ALONG_ROWS>(h, input_col_major, output_view);
+            } else {
+              norm<L1Norm, Apply::ALONG_ROWS>(h, input_col_major, output_view);
+            }
+          }
         }
-      } else {
-        if (params.type == L2Norm) {
-          norm<L2Norm, Apply::ALONG_ROWS>(handle, input_col_major, output_view, raft::sqrt_op{});
-        } else {
-          norm<L1Norm, Apply::ALONG_ROWS>(handle, input_col_major, output_view, raft::sqrt_op{});
-        }
-      }
-    } else {
-      if (params.rowMajor) {
-        if (params.type == L2Norm) {
-          norm<L2Norm, Apply::ALONG_ROWS>(handle, input_row_major, output_view);
-        } else {
-          norm<L1Norm, Apply::ALONG_ROWS>(handle, input_row_major, output_view);
-        }
-      } else {
-        if (params.type == L2Norm) {
-          norm<L2Norm, Apply::ALONG_ROWS>(handle, input_col_major, output_view);
-        } else {
-          norm<L1Norm, Apply::ALONG_ROWS>(handle, input_col_major, output_view);
-        }
-      }
-    }
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
     resource::sync_stream(handle, stream);
   }
 
@@ -192,35 +197,40 @@ class ColNormTest : public ::testing::TestWithParam<NormInputs<OutT, IdxT>> {
       data.data(), params.rows, params.cols);
     auto input_col_major = raft::make_device_matrix_view<const T, IdxT, raft::col_major>(
       data.data(), params.rows, params.cols);
-    if (params.do_sqrt) {
-      if (params.rowMajor) {
-        if (params.type == L2Norm) {
-          norm<L2Norm, Apply::ALONG_COLUMNS>(handle, input_row_major, output_view, raft::sqrt_op{});
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if (params.do_sqrt) {
+          if (params.rowMajor) {
+            if (params.type == L2Norm) {
+              norm<L2Norm, Apply::ALONG_COLUMNS>(h, input_row_major, output_view, raft::sqrt_op{});
+            } else {
+              norm<L1Norm, Apply::ALONG_COLUMNS>(h, input_row_major, output_view, raft::sqrt_op{});
+            }
+          } else {
+            if (params.type == L2Norm) {
+              norm<L2Norm, Apply::ALONG_COLUMNS>(h, input_col_major, output_view, raft::sqrt_op{});
+            } else {
+              norm<L1Norm, Apply::ALONG_COLUMNS>(h, input_col_major, output_view, raft::sqrt_op{});
+            }
+          }
         } else {
-          norm<L1Norm, Apply::ALONG_COLUMNS>(handle, input_row_major, output_view, raft::sqrt_op{});
+          if (params.rowMajor) {
+            if (params.type == L2Norm) {
+              norm<L2Norm, Apply::ALONG_COLUMNS>(h, input_row_major, output_view);
+            } else {
+              norm<L1Norm, Apply::ALONG_COLUMNS>(h, input_row_major, output_view);
+            }
+          } else {
+            if (params.type == L2Norm) {
+              norm<L2Norm, Apply::ALONG_COLUMNS>(h, input_col_major, output_view);
+            } else {
+              norm<L1Norm, Apply::ALONG_COLUMNS>(h, input_col_major, output_view);
+            }
+          }
         }
-      } else {
-        if (params.type == L2Norm) {
-          norm<L2Norm, Apply::ALONG_COLUMNS>(handle, input_col_major, output_view, raft::sqrt_op{});
-        } else {
-          norm<L1Norm, Apply::ALONG_COLUMNS>(handle, input_col_major, output_view, raft::sqrt_op{});
-        }
-      }
-    } else {
-      if (params.rowMajor) {
-        if (params.type == L2Norm) {
-          norm<L2Norm, Apply::ALONG_COLUMNS>(handle, input_row_major, output_view);
-        } else {
-          norm<L1Norm, Apply::ALONG_COLUMNS>(handle, input_row_major, output_view);
-        }
-      } else {
-        if (params.type == L2Norm) {
-          norm<L2Norm, Apply::ALONG_COLUMNS>(handle, input_col_major, output_view);
-        } else {
-          norm<L1Norm, Apply::ALONG_COLUMNS>(handle, input_col_major, output_view);
-        }
-      }
-    }
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
     resource::sync_stream(handle, stream);
   }
 
diff --git a/cpp/tests/linalg/normalize.cu b/cpp/tests/linalg/normalize.cu
index 9b3c1ddc5b..152068f666 100644
--- a/cpp/tests/linalg/normalize.cu
+++ b/cpp/tests/linalg/normalize.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -82,13 +82,18 @@ class RowNormalizeTest : public ::testing::TestWithParam<RowNormalizeInputs<T, I
       data.data(), params.rows, params.cols);
     auto output_view = raft::make_device_matrix_view<T, IdxT, raft::row_major>(
       out_act.data(), params.rows, params.cols);
-    if (params.norm_type == raft::linalg::L1Norm) {
-      raft::linalg::row_normalize<raft::linalg::L1Norm>(handle, input_view, output_view);
-    } else if (params.norm_type == raft::linalg::L2Norm) {
-      raft::linalg::row_normalize<raft::linalg::L2Norm>(handle, input_view, output_view);
-    } else if (params.norm_type == raft::linalg::LinfNorm) {
-      raft::linalg::row_normalize<raft::linalg::LinfNorm>(handle, input_view, output_view);
-    }
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if (params.norm_type == raft::linalg::L1Norm) {
+          raft::linalg::row_normalize<raft::linalg::L1Norm>(h, input_view, output_view);
+        } else if (params.norm_type == raft::linalg::L2Norm) {
+          raft::linalg::row_normalize<raft::linalg::L2Norm>(h, input_view, output_view);
+        } else if (params.norm_type == raft::linalg::LinfNorm) {
+          raft::linalg::row_normalize<raft::linalg::LinfNorm>(h, input_view, output_view);
+        }
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
 
     resource::sync_stream(handle, stream);
   }
diff --git a/cpp/tests/linalg/power.cu b/cpp/tests/linalg/power.cu
index c3a672f296..4fc80a25e5 100644
--- a/cpp/tests/linalg/power.cu
+++ b/cpp/tests/linalg/power.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -96,10 +96,15 @@ class PowerTest : public ::testing::TestWithParam<PowerInputs<T>> {
     auto const_in2_view = raft::make_device_vector_view<const T>(in2.data(), len);
     const auto scalar   = static_cast<T>(2);
     auto scalar_view    = raft::make_host_scalar_view(&scalar);
-    power(handle, const_in1_view, const_in2_view, out_view);
-    power_scalar(handle, const_out_view, out_view, scalar_view);
-    power(handle, const_in1_view, const_in2_view, in1_view);
-    power_scalar(handle, const_in1_view, in1_view, scalar_view);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        power(h, const_in1_view, const_in2_view, out_view);
+        power_scalar(h, const_out_view, out_view, scalar_view);
+        power(h, const_in1_view, const_in2_view, in1_view);
+        power_scalar(h, const_in1_view, in1_view, scalar_view);
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
 
     resource::sync_stream(handle);
   }
diff --git a/cpp/tests/linalg/reduce.cu b/cpp/tests/linalg/reduce.cu
index 102809957b..dd3f89d40c 100644
--- a/cpp/tests/linalg/reduce.cu
+++ b/cpp/tests/linalg/reduce.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -45,7 +45,8 @@ template <typename InType,
           typename MainLambda,
           typename ReduceLambda,
           typename FinalLambda>
-void reduceLaunch(OutType* dots,
+void reduceLaunch(const raft::resources& handle,
+                  OutType* dots,
                   const InType* data,
                   IdxType cols,
                   IdxType rows,
@@ -53,7 +54,6 @@ void reduceLaunch(OutType* dots,
                   bool alongRows,
                   OutType init,
                   bool inplace,
-                  cudaStream_t stream,
                   MainLambda main_op,
                   ReduceLambda reduce_op,
                   FinalLambda final_op)
@@ -65,9 +65,6 @@ void reduceLaunch(OutType* dots,
   auto input_view_col_major =
     raft::make_device_matrix_view<const InType, IdxType, raft::col_major>(data, rows, cols);
 
-  raft::resources handle;
-  resource::set_cuda_stream(handle, stream);
-
   if (rowMajor and alongRows) {
     reduce<Apply::ALONG_ROWS>(
       handle, input_view_row_major, output_view, init, inplace, main_op, reduce_op, final_op);
@@ -215,30 +212,35 @@ class ReduceTest : public ::testing::TestWithParam<ReduceInputs<InType, OutType,
                    reduce_op,
                    fin_op);
 
-    reduceLaunch(dots_act.data(),
-                 data.data(),
-                 cols,
-                 rows,
-                 params.rowMajor,
-                 params.alongRows,
-                 params.init,
-                 false,
-                 stream,
-                 main_op,
-                 reduce_op,
-                 fin_op);
-    reduceLaunch(dots_act.data(),
-                 data.data(),
-                 cols,
-                 rows,
-                 params.rowMajor,
-                 params.alongRows,
-                 params.init,
-                 true,
-                 stream,
-                 main_op,
-                 reduce_op,
-                 fin_op);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        reduceLaunch(h,
+                     dots_act.data(),
+                     data.data(),
+                     cols,
+                     rows,
+                     params.rowMajor,
+                     params.alongRows,
+                     params.init,
+                     false,
+                     main_op,
+                     reduce_op,
+                     fin_op);
+        reduceLaunch(h,
+                     dots_act.data(),
+                     data.data(),
+                     cols,
+                     rows,
+                     params.rowMajor,
+                     params.alongRows,
+                     params.init,
+                     true,
+                     main_op,
+                     reduce_op,
+                     fin_op);
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN);
 
     resource::sync_stream(handle, stream);
   }
@@ -320,19 +322,19 @@ REDUCE_TEST((ReduceTest<float, float, int64_t>), ReduceTestFFI64, inputsff_i64);
 
 const std::vector<ReduceInputs<float, float, int>> inputsff_thick_i32 =
   raft::util::itertools::product<ReduceInputs<float, float, int>>(
-    {0.0001f}, {3, 9}, {17771, 33333, 100000}, {true}, {true}, {0.0f}, {1234ULL});
+    {0.0001f}, {3, 9}, {17771, 33333, 200000}, {true}, {true}, {0.0f}, {1234ULL});
 const std::vector<ReduceInputs<double, double, int>> inputsdd_thick_i32 =
   raft::util::itertools::product<ReduceInputs<double, double, int>>(
-    {0.000001}, {3, 9}, {17771, 33333, 100000}, {true}, {true}, {0.0}, {1234ULL});
+    {0.000001}, {3, 9}, {17771, 33333, 200000}, {true}, {true}, {0.0}, {1234ULL});
 const std::vector<ReduceInputs<float, double, int>> inputsfd_thick_i32 =
   raft::util::itertools::product<ReduceInputs<float, double, int>>(
-    {0.000001}, {3, 9}, {17771, 33333, 100000}, {true}, {true}, {0.0f}, {1234ULL});
+    {0.000001}, {3, 9}, {17771, 33333, 200000}, {true}, {true}, {0.0f}, {1234ULL});
 const std::vector<ReduceInputs<float, float, uint32_t>> inputsff_thick_u32 =
   raft::util::itertools::product<ReduceInputs<float, float, uint32_t>>(
-    {0.0001f}, {3u, 9u}, {17771u, 33333u, 100000u}, {true}, {true}, {0.0f}, {1234ULL});
+    {0.0001f}, {3u, 9u}, {17771u, 33333u, 200000u}, {true}, {true}, {0.0f}, {1234ULL});
 const std::vector<ReduceInputs<float, float, int64_t>> inputsff_thick_i64 =
   raft::util::itertools::product<ReduceInputs<float, float, int64_t>>(
-    {0.0001f}, {3, 9}, {17771, 33333, 100000}, {true}, {true}, {0.0f}, {1234ULL});
+    {0.0001f}, {3, 9}, {17771, 33333, 200000}, {true}, {true}, {0.0f}, {1234ULL});
 
 REDUCE_TEST((ReduceTest<float, float, int>), ReduceTestFFI32Thick, inputsff_thick_i32);
 REDUCE_TEST((ReduceTest<double, double, int>), ReduceTestDDI32Thick, inputsdd_thick_i32);
diff --git a/cpp/tests/linalg/rsvd.cu b/cpp/tests/linalg/rsvd.cu
index 6f125afa8e..82fd484e04 100644
--- a/cpp/tests/linalg/rsvd.cu
+++ b/cpp/tests/linalg/rsvd.cu
@@ -1,10 +1,12 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
 #include "../test_utils.cuh"
 
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/dry_run_resources.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/rsvd.cuh>
@@ -12,8 +14,6 @@
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
-#include <rmm/device_uvector.hpp>
-
 #include <gtest/gtest.h>
 
 #include <algorithm>
@@ -125,22 +125,28 @@ class RsvdTest : public ::testing::TestWithParam<RsvdInputs<T>> {
     auto S_vec_view = raft::make_device_vector_view(S.data(), params.k);
 
     // RSVD tests
-    if (params.k == 0) {  // Test with PC and upsampling ratio
-      if (params.use_bbt) {
-        rsvd_perc_symmetric(
-          handle, A_view, S_vec_view, params.PC_perc, params.UpS_perc, U_view, V_view);
-      } else {
-        rsvd_perc(handle, A_view, S_vec_view, params.PC_perc, params.UpS_perc, U_view, V_view);
-      }
-    } else {  // Test with directly given fixed rank
-      if (params.use_bbt) {
-        rsvd_fixed_rank_symmetric_jacobi(
-          handle, A_view, S_vec_view, params.p, eig_svd_tol, max_sweeps, U_view, V_view);
-      } else {
-        rsvd_fixed_rank_jacobi(
-          handle, A_view, S_vec_view, params.p, eig_svd_tol, max_sweeps, U_view, V_view);
-      }
-    }
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if (params.k == 0) {  // Test with PC and upsampling ratio
+          if (params.use_bbt) {
+            rsvd_perc_symmetric(
+              h, A_view, S_vec_view, params.PC_perc, params.UpS_perc, U_view, V_view);
+          } else {
+            rsvd_perc(h, A_view, S_vec_view, params.PC_perc, params.UpS_perc, U_view, V_view);
+          }
+        } else {  // Test with directly given fixed rank
+          if (params.use_bbt) {
+            rsvd_fixed_rank_symmetric_jacobi(
+              h, A_view, S_vec_view, params.p, eig_svd_tol, max_sweeps, U_view, V_view);
+          } else {
+            rsvd_fixed_rank_jacobi(
+              h, A_view, S_vec_view, params.p, eig_svd_tol, max_sweeps, U_view, V_view);
+          }
+        }
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      sizeof(int));
     raft::update_device(A.data(), A_backup_cpu.data(), m * n, stream);
   }
 
@@ -312,5 +318,296 @@ INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdTestSquareMatrixNormF, ::testing::ValuesI
 
 INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdTestSquareMatrixNormD, ::testing::ValuesIn(inputs_dx));
 
+// ===================================================================
+// Dry-run tests for RSVD public API functions
+// ===================================================================
+
+TEST(RsvdDryRun, FixedRankQRWithBothVectors)
+{
+  raft::resources res;
+
+  constexpr int n_rows = 256;
+  constexpr int n_cols = 128;
+  constexpr int k      = 50;
+  constexpr int p      = 10;
+
+  // Allocated outside the dry run: M[n_rows x n_cols], S_vec[k], U[n_rows x k], V[k x n_cols]
+  auto M     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto S_vec = raft::make_device_vector<float, int>(res, k);
+  auto U     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, k);
+  auto V     = raft::make_device_matrix<float, int, raft::col_major>(res, k, n_cols);
+
+  // Run rsvd_fixed_rank in dry-run mode (QR, no BBT, no Jacobi, both U and V)
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::rsvd_fixed_rank(
+      handle, raft::make_const_mdspan(M.view()), S_vec.view(), p, U.view(), V.view());
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not stay in dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(RsvdDryRun, FixedRankSymmetricWithBothVectors)
+{
+  raft::resources res;
+
+  constexpr int n_rows = 256;
+  constexpr int n_cols = 128;
+  constexpr int k      = 50;
+  constexpr int p      = 10;
+
+  // Allocated outside the dry run: M[n_rows x n_cols], S_vec[k], U[n_rows x k], V[k x n_cols]
+  auto M     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto S_vec = raft::make_device_vector<float, int>(res, k);
+  auto U     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, k);
+  auto V     = raft::make_device_matrix<float, int, raft::col_major>(res, k, n_cols);
+
+  // Run rsvd_fixed_rank_symmetric in dry-run mode (QR, with BBT, no Jacobi, both U and V)
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::rsvd_fixed_rank_symmetric(
+      handle, raft::make_const_mdspan(M.view()), S_vec.view(), p, U.view(), V.view());
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not stay in dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(RsvdDryRun, FixedRankJacobiWithBothVectors)
+{
+  raft::resources res;
+
+  constexpr int n_rows     = 256;
+  constexpr int n_cols     = 128;
+  constexpr int k          = 50;
+  constexpr int p          = 10;
+  constexpr float tol      = 1e-7f;
+  constexpr int max_sweeps = 100;
+
+  // Allocated outside the dry run: M[n_rows x n_cols], S_vec[k], U[n_rows x k], V[k x n_cols]
+  auto M     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto S_vec = raft::make_device_vector<float, int>(res, k);
+  auto U     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, k);
+  auto V     = raft::make_device_matrix<float, int, raft::col_major>(res, k, n_cols);
+
+  // Run rsvd_fixed_rank_jacobi in dry-run mode (QR, no BBT, with Jacobi, both U and V)
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::rsvd_fixed_rank_jacobi(handle,
+                                         raft::make_const_mdspan(M.view()),
+                                         S_vec.view(),
+                                         p,
+                                         tol,
+                                         max_sweeps,
+                                         U.view(),
+                                         V.view());
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not stay in dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(RsvdDryRun, FixedRankSymmetricJacobiWithBothVectors)
+{
+  raft::resources res;
+
+  constexpr int n_rows     = 256;
+  constexpr int n_cols     = 128;
+  constexpr int k          = 50;
+  constexpr int p          = 10;
+  constexpr float tol      = 1e-7f;
+  constexpr int max_sweeps = 100;
+
+  // Allocated outside the dry run: M[n_rows x n_cols], S_vec[k], U[n_rows x k], V[k x n_cols]
+  auto M     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto S_vec = raft::make_device_vector<float, int>(res, k);
+  auto U     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, k);
+  auto V     = raft::make_device_matrix<float, int, raft::col_major>(res, k, n_cols);
+
+  // Run rsvd_fixed_rank_symmetric_jacobi in dry-run mode (QR, with BBT, with Jacobi, both U and V)
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::rsvd_fixed_rank_symmetric_jacobi(handle,
+                                                   raft::make_const_mdspan(M.view()),
+                                                   S_vec.view(),
+                                                   p,
+                                                   tol,
+                                                   max_sweeps,
+                                                   U.view(),
+                                                   V.view());
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not stay in dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(RsvdDryRun, FixedRankWithOnlyU)
+{
+  raft::resources res;
+
+  constexpr int n_rows = 256;
+  constexpr int n_cols = 128;
+  constexpr int k      = 50;
+  constexpr int p      = 10;
+
+  // Allocated outside the dry run: M[n_rows x n_cols], S_vec[k], U[n_rows x k] (no V buffer)
+  auto M     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto S_vec = raft::make_device_vector<float, int>(res, k);
+  auto U     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, k);
+
+  // Run rsvd_fixed_rank in dry-run mode (only U, no V)
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::rsvd_fixed_rank(
+      handle, raft::make_const_mdspan(M.view()), S_vec.view(), p, U.view(), std::nullopt);
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not stay in dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(RsvdDryRun, FixedRankWithOnlyV)
+{
+  raft::resources res;
+
+  constexpr int n_rows = 256;
+  constexpr int n_cols = 128;
+  constexpr int k      = 50;
+  constexpr int p      = 10;
+
+  // Allocated outside the dry run: M[n_rows x n_cols], S_vec[k], V[k x n_cols] (no U buffer)
+  auto M     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto S_vec = raft::make_device_vector<float, int>(res, k);
+  auto V     = raft::make_device_matrix<float, int, raft::col_major>(res, k, n_cols);
+
+  // Run rsvd_fixed_rank in dry-run mode (only V, no U)
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::rsvd_fixed_rank(
+      handle, raft::make_const_mdspan(M.view()), S_vec.view(), p, std::nullopt, V.view());
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not stay in dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(RsvdDryRun, FixedRankWithNoVectors)
+{
+  raft::resources res;
+
+  constexpr int n_rows = 256;
+  constexpr int n_cols = 128;
+  constexpr int k      = 50;
+  constexpr int p      = 10;
+
+  // Allocated outside the dry run: M[n_rows x n_cols], S_vec[k] (neither U nor V buffer)
+  auto M     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto S_vec = raft::make_device_vector<float, int>(res, k);
+
+  // Run rsvd_fixed_rank in dry-run mode (no U, no V - only singular values)
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::rsvd_fixed_rank(
+      handle, raft::make_const_mdspan(M.view()), S_vec.view(), p, std::nullopt, std::nullopt);
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not stay in dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(RsvdDryRun, PercWithBothVectors)
+{
+  raft::resources res;
+
+  constexpr int n_rows     = 256;
+  constexpr int n_cols     = 128;
+  constexpr float PC_perc  = 0.2f;
+  constexpr float UpS_perc = 0.05f;
+  constexpr int k          = static_cast<int>(std::min(n_rows, n_cols) * PC_perc);  // == 25
+
+  // Allocated outside the dry run: M[n_rows x n_cols], S_vec[k], U[n_rows x k], V[k x n_cols]
+  auto M     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto S_vec = raft::make_device_vector<float, int>(res, k);
+  auto U     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, k);
+  auto V     = raft::make_device_matrix<float, int, raft::col_major>(res, k, n_cols);
+
+  // Run rsvd_perc in dry-run mode (percentage-based, QR, no BBT, no Jacobi, both U and V)
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::rsvd_perc(handle,
+                            raft::make_const_mdspan(M.view()),
+                            S_vec.view(),
+                            PC_perc,
+                            UpS_perc,
+                            U.view(),
+                            V.view());
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not stay in dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(RsvdDryRun, PercSymmetricJacobiWithBothVectors)
+{
+  raft::resources res;
+
+  constexpr int n_rows     = 256;
+  constexpr int n_cols     = 128;
+  constexpr float PC_perc  = 0.2f;
+  constexpr float UpS_perc = 0.05f;
+  constexpr float tol      = 1e-7f;
+  constexpr int max_sweeps = 100;
+  constexpr int k          = static_cast<int>(std::min(n_rows, n_cols) * PC_perc);  // == 25
+
+  // Allocated outside the dry run: M[n_rows x n_cols], S_vec[k], U[n_rows x k], V[k x n_cols]
+  auto M     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto S_vec = raft::make_device_vector<float, int>(res, k);
+  auto U     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, k);
+  auto V     = raft::make_device_matrix<float, int, raft::col_major>(res, k, n_cols);
+
+  // Run rsvd_perc_symmetric_jacobi in dry-run mode (percentage-based, QR, with BBT, with Jacobi,
+  // both U and V)
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::rsvd_perc_symmetric_jacobi(handle,
+                                             raft::make_const_mdspan(M.view()),
+                                             S_vec.view(),
+                                             PC_perc,
+                                             UpS_perc,
+                                             tol,
+                                             max_sweeps,
+                                             U.view(),
+                                             V.view());
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not stay in dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(RsvdDryRun, TallMatrix)
+{
+  raft::resources res;
+
+  constexpr int n_rows = 512;
+  constexpr int n_cols = 128;
+  constexpr int k      = 50;
+  constexpr int p      = 10;
+
+  // Allocated outside the dry run: M[n_rows x n_cols], S_vec[k], U[n_rows x k], V[k x n_cols]
+  auto M     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto S_vec = raft::make_device_vector<float, int>(res, k);
+  auto U     = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, k);
+  auto V     = raft::make_device_matrix<float, int, raft::col_major>(res, k, n_cols);
+
+  // Run rsvd_fixed_rank_jacobi in dry-run mode on a tall matrix (n_rows is 4x n_cols here)
+  constexpr float tol      = 1e-7f;
+  constexpr int max_sweeps = 100;
+  auto stats               = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::rsvd_fixed_rank_jacobi(handle,
+                                         raft::make_const_mdspan(M.view()),
+                                         S_vec.view(),
+                                         p,
+                                         tol,
+                                         max_sweeps,
+                                         U.view(),
+                                         V.view());
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not stay in dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/tests/linalg/strided_reduction.cu b/cpp/tests/linalg/strided_reduction.cu
index 57e4c941fe..ab9a5bdd48 100644
--- a/cpp/tests/linalg/strided_reduction.cu
+++ b/cpp/tests/linalg/strided_reduction.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -27,10 +27,8 @@ struct stridedReductionInputs {
 
 template <typename T>
 void stridedReductionLaunch(
-  T* dots, const T* data, int cols, int rows, bool inplace, cudaStream_t stream)
+  const raft::resources& handle, T* dots, const T* data, int cols, int rows, bool inplace)
 {
-  raft::resources handle;
-  resource::set_cuda_stream(handle, stream);
   auto dots_view = raft::make_device_vector_view(dots, cols);
   auto data_view = raft::make_device_matrix_view(data, rows, cols);
   strided_reduction(handle, data_view, dots_view, (T)0, inplace, raft::sq_op{});
@@ -78,8 +76,13 @@ class stridedReductionTest : public ::testing::TestWithParam<stridedReductionInp
                           raft::sq_op{},
                           raft::add_op{},
                           raft::identity_op{});
-    stridedReductionLaunch(dots_act.data(), data.data(), cols, rows, false, stream);
-    stridedReductionLaunch(dots_act.data(), data.data(), cols, rows, true, stream);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        stridedReductionLaunch(h, dots_act.data(), data.data(), cols, rows, false);
+        stridedReductionLaunch(h, dots_act.data(), data.data(), cols, rows, true);
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
     resource::sync_stream(handle, stream);
   }
 
diff --git a/cpp/tests/linalg/subtract.cu b/cpp/tests/linalg/subtract.cu
index caa52a33f5..a44a655bfc 100644
--- a/cpp/tests/linalg/subtract.cu
+++ b/cpp/tests/linalg/subtract.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -92,11 +92,15 @@ class SubtractTest : public ::testing::TestWithParam<SubtractInputs<T>> {
     const auto scalar   = static_cast<T>(1);
     auto scalar_view    = raft::make_host_scalar_view(&scalar);
 
-    subtract(handle, const_in1_view, const_in2_view, out_view);
-    subtract_scalar(handle, const_out_view, out_view, scalar_view);
-    subtract(handle, const_in1_view, const_in2_view, in1_view);
-    subtract_scalar(handle, const_in1_view, in1_view, scalar_view);
-    resource::sync_stream(handle, stream);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        subtract(h, const_in1_view, const_in2_view, out_view);
+        subtract_scalar(h, const_out_view, out_view, scalar_view);
+        subtract(h, const_in1_view, const_in2_view, in1_view);
+        subtract_scalar(h, const_in1_view, in1_view, scalar_view);
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
   }
 
  protected:
diff --git a/cpp/tests/linalg/svd.cu b/cpp/tests/linalg/svd.cu
index 544263768d..366413e71d 100644
--- a/cpp/tests/linalg/svd.cu
+++ b/cpp/tests/linalg/svd.cu
@@ -1,10 +1,12 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
 #include "../test_utils.cuh"
 
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/dry_run_resources.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/linalg/init.cuh>
 #include <raft/linalg/svd.cuh>
@@ -83,11 +85,17 @@ class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
       std::make_optional(raft::make_device_matrix_view<T, int, raft::col_major>(
         right_eig_vectors_trans_qr.data(), params.n_col, params.n_col));
 
-    svd_qr_transpose_right_vec(handle,
-                               data_view,
-                               sing_vals_qr_view,
-                               left_eig_vectors_qr_view,
-                               right_eig_vectors_trans_qr_view);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        svd_qr_transpose_right_vec(h,
+                                   data_view,
+                                   sing_vals_qr_view,
+                                   left_eig_vectors_qr_view,
+                                   right_eig_vectors_trans_qr_view);
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      sizeof(int));
     resource::sync_stream(handle, stream);
   }
 
@@ -197,11 +205,141 @@ INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecF, ::testing::ValuesIn(inputsf
 
 INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecD, ::testing::ValuesIn(inputsd2));
 
-// INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecF,
-// ::testing::ValuesIn(inputsf2));
+// ===================================================================
+// Dry-run tests for SVD public API functions
+// ===================================================================
 
-// INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecD,
-//::testing::ValuesIn(inputsd2));
+TEST(SvdDryRun, QrWithBothVectors)  // svd_qr under dry_run_execute, U and V both supplied
+{
+  raft::resources res;
+  constexpr int n_rows = 256, n_cols = 128;  // shape of the input matrix `in`
+
+  auto in        = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto sing_vals = raft::make_device_vector<float, int>(res, n_cols);
+  auto U         = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto V         = raft::make_device_matrix<float, int, raft::col_major>(res, n_cols, n_cols);
+
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::svd_qr(
+      handle, raft::make_const_mdspan(in.view()), sing_vals.view(), U.view(), V.view());
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not report dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(SvdDryRun, QrWithOnlyU)  // svd_qr under dry_run_execute, last optional is std::nullopt
+{
+  raft::resources res;
+  constexpr int n_rows = 256, n_cols = 128;  // shape of the input matrix `in`
+
+  auto in        = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto sing_vals = raft::make_device_vector<float, int>(res, n_cols);
+  auto U         = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::svd_qr(
+      handle, raft::make_const_mdspan(in.view()), sing_vals.view(), U.view(), std::nullopt);
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not report dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(SvdDryRun, QrWithOnlyV)  // svd_qr under dry_run_execute, first optional is std::nullopt
+{
+  raft::resources res;
+  constexpr int n_rows = 256, n_cols = 128;  // shape of the input matrix `in`
+
+  auto in        = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto sing_vals = raft::make_device_vector<float, int>(res, n_cols);
+  auto V         = raft::make_device_matrix<float, int, raft::col_major>(res, n_cols, n_cols);
+
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::svd_qr(
+      handle, raft::make_const_mdspan(in.view()), sing_vals.view(), std::nullopt, V.view());
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not report dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(SvdDryRun, QrTransposeRightVecWithBothVectors)  // dry-run svd_qr_transpose_right_vec
+{
+  raft::resources res;
+  constexpr int n_rows = 256, n_cols = 128;  // shape of the input matrix `in`
+
+  auto in        = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto sing_vals = raft::make_device_vector<float, int>(res, n_cols);
+  auto U         = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto V         = raft::make_device_matrix<float, int, raft::col_major>(res, n_cols, n_cols);
+
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::svd_qr_transpose_right_vec(
+      handle, raft::make_const_mdspan(in.view()), sing_vals.view(), U.view(), V.view());
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not report dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(SvdDryRun, EigWithBothVectors)  // svd_eig under dry_run_execute, V and U both supplied
+{
+  raft::resources res;
+  constexpr int n_rows = 256, n_cols = 128;  // shape of the input matrix `in`
+
+  auto in = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto S  = raft::make_device_vector<float, int>(res, n_cols);
+  auto V  = raft::make_device_matrix<float, int, raft::col_major>(res, n_cols, n_cols);
+  auto U  = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::svd_eig(handle, raft::make_const_mdspan(in.view()), S.view(), V.view(), U.view());
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not report dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(SvdDryRun, EigWithOnlyV)  // svd_eig under dry_run_execute, last optional is std::nullopt
+{
+  raft::resources res;
+  constexpr int n_rows = 256, n_cols = 128;  // shape of the input matrix `in`
+
+  auto in = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+  auto S  = raft::make_device_vector<float, int>(res, n_cols);
+  auto V  = raft::make_device_matrix<float, int, raft::col_major>(res, n_cols, n_cols);
+
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::svd_eig(
+      handle, raft::make_const_mdspan(in.view()), S.view(), V.view(), std::nullopt);
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not report dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
+
+TEST(SvdDryRun, Reconstruction)  // svd_reconstruction under dry_run_execute
+{
+  raft::resources res;
+  constexpr int n_rows = 256, n_cols = 128, k = 64;  // U: n_rows x k, S: k x k, V: k x n_cols
+
+  auto U   = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, k);
+  auto S   = raft::make_device_matrix<float, int, raft::col_major>(res, k, k);
+  auto V   = raft::make_device_matrix<float, int, raft::col_major>(res, k, n_cols);
+  auto out = raft::make_device_matrix<float, int, raft::col_major>(res, n_rows, n_cols);
+
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::linalg::svd_reconstruction(handle,
+                                     raft::make_const_mdspan(U.view()),
+                                     raft::make_const_mdspan(S.view()),
+                                     raft::make_const_mdspan(V.view()),
+                                     out.view());
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not report dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device memory allocation";
+}
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/tests/linalg/ternary_op.cu b/cpp/tests/linalg/ternary_op.cu
index 7551800fda..d68c1a89fd 100644
--- a/cpp/tests/linalg/ternary_op.cu
+++ b/cpp/tests/linalg/ternary_op.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -63,8 +63,13 @@ class ternaryOpTest : public ::testing::TestWithParam<BinaryOpInputs<T>> {
     auto in2_view     = raft::make_device_vector_view<const T>(in2.data(), len);
     auto in3_view     = raft::make_device_vector_view<const T>(in3.data(), len);
 
-    ternary_op(handle, in1_view, in2_view, in3_view, out_add_view, add);
-    ternary_op(handle, in1_view, in2_view, in3_view, out_mul_view, mul);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        ternary_op(h, in1_view, in2_view, in3_view, out_add_view, add);
+        ternary_op(h, in1_view, in2_view, in3_view, out_mul_view, mul);
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
   }
 
  protected:
diff --git a/cpp/tests/linalg/transpose.cu b/cpp/tests/linalg/transpose.cu
index 0dcdbbce6b..4c41191392 100644
--- a/cpp/tests/linalg/transpose.cu
+++ b/cpp/tests/linalg/transpose.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -109,7 +109,17 @@ class TransposeTest : public ::testing::TestWithParam<TransposeInputs<T>> {
     raft::update_device(data.data(), data_h.data(), len, stream);
     raft::update_device(data_trans_ref.data(), data_ref_h.data(), len, stream);
 
-    transpose(handle, data.data(), data_trans.data(), params.n_row, params.n_col, stream);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        transpose(h,
+                  data.data(),
+                  data_trans.data(),
+                  params.n_row,
+                  params.n_col,
+                  resource::get_cuda_stream(h));
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
     if (params.n_row == params.n_col) { transpose(data.data(), params.n_col, stream); }
     resource::sync_stream(handle, stream);
   }
diff --git a/cpp/tests/linalg/unary_op.cu b/cpp/tests/linalg/unary_op.cu
index f41fbc2219..d426b3fbea 100644
--- a/cpp/tests/linalg/unary_op.cu
+++ b/cpp/tests/linalg/unary_op.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -46,10 +46,15 @@ class UnaryOpTest : public ::testing::TestWithParam<UnaryOpInputs<InType, IdxTyp
 
     auto in_view  = raft::make_device_vector_view<const InType>(in.data(), len);
     auto out_view = raft::make_device_vector_view(out.data(), len);
-    unary_op(handle,
-             in_view,
-             out_view,
-             raft::compose_op(raft::cast_op<OutType>(), raft::mul_const_op<InType>(scalar)));
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        unary_op(h,
+                 in_view,
+                 out_view,
+                 raft::compose_op(raft::cast_op<OutType>(), raft::mul_const_op<InType>(scalar)));
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
     resource::sync_stream(handle, stream);
   }
 
diff --git a/cpp/tests/matrix/gather.cu b/cpp/tests/matrix/gather.cu
index 603676ac5d..32841ebb4b 100644
--- a/cpp/tests/matrix/gather.cu
+++ b/cpp/tests/matrix/gather.cu
@@ -151,46 +151,50 @@ class GatherTest : public ::testing::TestWithParam<GatherInputs<IdxT>> {
     auto stencil_view =
       raft::make_device_vector_view<const MatrixT, IdxT>(d_stencil.data(), map_length);
 
-    if (params.ncols_margin == 0) {
-      auto in_view = raft::make_device_matrix_view<const MatrixT, IdxT, row_major>(
-        d_in.data(), params.nrows, params.ncols);
-      auto inout_view = raft::make_device_matrix_view<MatrixT, IdxT, row_major>(
-        d_in.data(), params.nrows, params.ncols);
-      if (Conditional && MapTransform) {
-        raft::matrix::gather_if(
-          handle, in_view, out_view, map_view, stencil_view, pred_op, transform_op);
-      } else if (Conditional) {
-        raft::matrix::gather_if(handle, in_view, out_view, map_view, stencil_view, pred_op);
-      } else if (MapTransform && Inplace) {
-        raft::matrix::gather(handle, inout_view, map_view, params.col_batch_size, transform_op);
-      } else if (MapTransform) {
-        raft::matrix::gather(handle, in_view, map_view, out_view, transform_op);
-      } else if (Inplace) {
-        raft::matrix::gather(handle, inout_view, map_view, params.col_batch_size);
-      } else {
-        raft::matrix::gather(handle, in_view, map_view, out_view);
-      }
-    } else {
-      // Test for a view with specifying the leading dimension
-      auto in_view = raft::make_device_strided_matrix_view<const MatrixT, IdxT, row_major>(
-        d_in.data(), params.nrows, params.ncols, ld_in);
-      auto inout_view = raft::make_device_strided_matrix_view<MatrixT, IdxT, row_major>(
-        d_in.data(), params.nrows, params.ncols, ld_in);
-      if (Conditional && MapTransform) {
-        raft::matrix::gather_if(
-          handle, in_view, out_view, map_view, stencil_view, pred_op, transform_op);
-      } else if (Conditional) {
-        raft::matrix::gather_if(handle, in_view, out_view, map_view, stencil_view, pred_op);
-      } else if (MapTransform && Inplace) {
-        raft::matrix::gather(handle, inout_view, map_view, params.col_batch_size, transform_op);
-      } else if (MapTransform) {
-        raft::matrix::gather(handle, in_view, map_view, out_view, transform_op);
-      } else if (Inplace) {
-        raft::matrix::gather(handle, inout_view, map_view, params.col_batch_size);
-      } else {
-        raft::matrix::gather(handle, in_view, map_view, out_view);
-      }
-    }
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if (params.ncols_margin == 0) {  // no margin: plain (non-strided) matrix views
+          auto in_view = raft::make_device_matrix_view<const MatrixT, IdxT, row_major>(
+            d_in.data(), params.nrows, params.ncols);
+          auto inout_view = raft::make_device_matrix_view<MatrixT, IdxT, row_major>(
+            d_in.data(), params.nrows, params.ncols);
+          if (Conditional && MapTransform) {
+            raft::matrix::gather_if(
+              h, in_view, out_view, map_view, stencil_view, pred_op, transform_op);
+          } else if (Conditional) {
+            raft::matrix::gather_if(h, in_view, out_view, map_view, stencil_view, pred_op);
+          } else if (MapTransform && Inplace) {
+            raft::matrix::gather(h, inout_view, map_view, params.col_batch_size, transform_op);
+          } else if (MapTransform) {
+            raft::matrix::gather(h, in_view, map_view, out_view, transform_op);
+          } else if (Inplace) {
+            raft::matrix::gather(h, inout_view, map_view, params.col_batch_size);
+          } else {
+            raft::matrix::gather(h, in_view, map_view, out_view);
+          }
+        } else {  // strided views: test a view that specifies the leading dimension (ld_in)
+          auto in_view = raft::make_device_strided_matrix_view<const MatrixT, IdxT, row_major>(
+            d_in.data(), params.nrows, params.ncols, ld_in);
+          auto inout_view = raft::make_device_strided_matrix_view<MatrixT, IdxT, row_major>(
+            d_in.data(), params.nrows, params.ncols, ld_in);
+          if (Conditional && MapTransform) {
+            raft::matrix::gather_if(
+              h, in_view, out_view, map_view, stencil_view, pred_op, transform_op);
+          } else if (Conditional) {
+            raft::matrix::gather_if(h, in_view, out_view, map_view, stencil_view, pred_op);
+          } else if (MapTransform && Inplace) {
+            raft::matrix::gather(h, inout_view, map_view, params.col_batch_size, transform_op);
+          } else if (MapTransform) {
+            raft::matrix::gather(h, in_view, map_view, out_view, transform_op);
+          } else if (Inplace) {
+            raft::matrix::gather(h, inout_view, map_view, params.col_batch_size);
+          } else {
+            raft::matrix::gather(h, in_view, map_view, out_view);
+          }
+        }
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN);
 
     if (Inplace) {
       RAFT_CUDA_TRY(cudaMemcpy2DAsync(d_out_act.data(),
diff --git a/cpp/tests/matrix/sample_rows.cu b/cpp/tests/matrix/sample_rows.cu
index 58dd1327bf..81229f5199 100644
--- a/cpp/tests/matrix/sample_rows.cu
+++ b/cpp/tests/matrix/sample_rows.cu
@@ -1,14 +1,16 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
 #include "../test_utils.cuh"
 
 #include <raft/core/device_mdarray.hpp>
+#include <raft/core/dry_run_resources.hpp>
 #include <raft/core/host_mdarray.hpp>
 #include <raft/core/mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/matrix/sample_rows.cuh>
 #include <raft/random/rng.cuh>
@@ -144,5 +146,42 @@ using SampleRowsTestInt64 = SampleRowsTest<float>;
 TEST_P(SampleRowsTestInt64, SamplingTest) { check(); }
 INSTANTIATE_TEST_SUITE_P(SampleRowsTests, SampleRowsTestInt64, ::testing::ValuesIn(inputs1));
 
+// ===== Dry-run tests =====
+
+TEST(SampleRowsDryRun, VoidOverload)  // sample_rows writing into a caller-provided output view
+{
+  raft::resources res;
+  constexpr int64_t n_rows = 1000, n_cols = 64, n_samples = 100;  // output: n_samples x n_cols
+
+  auto dataset = raft::make_device_matrix<float, int64_t>(res, n_rows, n_cols);
+  auto output  = raft::make_device_matrix<float, int64_t>(res, n_samples, n_cols);
+
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    raft::matrix::sample_rows(handle,
+                              raft::random::RngState{42ULL},
+                              raft::make_const_mdspan(dataset.view()),
+                              output.view());
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not report dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device allocation";
+}
+
+TEST(SampleRowsDryRun, ReturningOverload)  // sample_rows overload taking n_samples
+{
+  raft::resources res;
+  constexpr int64_t n_rows = 1000, n_cols = 64, n_samples = 100;
+
+  auto dataset = raft::make_device_matrix<float, int64_t>(res, n_rows, n_cols);
+
+  auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) {
+    auto result = raft::matrix::sample_rows<float, int64_t>(  // result is not inspected
+      handle, raft::random::RngState{42ULL}, raft::make_const_mdspan(dataset.view()), n_samples);
+  });
+
+  EXPECT_FALSE(raft::resource::get_dry_run_flag(res));  // res must not report dry-run mode
+  EXPECT_GT(stats.device_global, 0) << "Expected non-zero peak device allocation";
+}
+
 }  // namespace matrix
 }  // namespace raft
diff --git a/cpp/tests/sparse/convert_csr.cu b/cpp/tests/sparse/convert_csr.cu
index a529041068..3a5670ad59 100644
--- a/cpp/tests/sparse/convert_csr.cu
+++ b/cpp/tests/sparse/convert_csr.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -8,6 +8,7 @@
 #include <raft/core/bitmap.cuh>
 #include <raft/core/bitset.cuh>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
 #include <raft/sparse/convert/csr.cuh>
 #include <raft/sparse/coo.hpp>
 #include <raft/util/cuda_utils.cuh>
@@ -79,6 +80,69 @@ TEST_P(SortedCOOToCSR, Result)
 
 INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, ::testing::ValuesIn(inputsf));
 
+/**************************** COO to CSR ****************************/
+
+typedef SparseConvertCSRTest<float> COOToCSRTest;
+TEST_P(COOToCSRTest, Result)  // coo_to_csr on a fixed COO input with 4 rows and 8 non-zeros
+{
+  raft::resources handle;
+  auto stream = resource::get_cuda_stream(handle);
+
+  int nnz = 8;
+  int m   = 4;
+
+  int rows_h[]      = {3, 0, 1, 0, 2, 1, 3, 2};  // COO row indices, not sorted
+  int cols_h[]      = {1, 0, 1, 1, 0, 0, 0, 1};
+  float vals_h[]    = {8.0f, 1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 7.0f, 6.0f};
+  int exp_offsets[] = {0, 2, 4, 6, 8};  // expected CSR row offsets: two entries per row
+  int exp_cols[]    = {0, 1, 0, 1, 0, 1, 0, 1};
+  float exp_vals[]  = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+
+  rmm::device_uvector<int> rows_d(nnz, stream);
+  rmm::device_uvector<int> cols_d(nnz, stream);
+  rmm::device_uvector<float> vals_d(nnz, stream);
+  rmm::device_uvector<int> dst_offsets_d(m + 1, stream);
+  rmm::device_uvector<int> dst_cols_d(nnz, stream);
+  rmm::device_uvector<float> dst_vals_d(nnz, stream);
+
+  raft::update_device(rows_d.data(), rows_h, nnz, stream);
+  raft::update_device(cols_d.data(), cols_h, nnz, stream);
+  raft::update_device(vals_d.data(), vals_h, nnz, stream);
+
+  raft::execute_with_dry_run_check(
+    handle,
+    [&](raft::resources const& h) {
+      convert::coo_to_csr(h,
+                          rows_d.data(),
+                          cols_d.data(),
+                          vals_d.data(),
+                          nnz,
+                          m,
+                          dst_offsets_d.data(),
+                          dst_cols_d.data(),
+                          dst_vals_d.data());
+    },
+    raft::alloc_behavior::ARGUMENT_DRIVEN,
+    2 * nnz * sizeof(int));
+
+  rmm::device_uvector<int> exp_offsets_d(m + 1, stream);
+  rmm::device_uvector<int> exp_cols_d(nnz, stream);
+  rmm::device_uvector<float> exp_vals_d(nnz, stream);
+
+  raft::update_device(exp_offsets_d.data(), exp_offsets, m + 1, stream);
+  raft::update_device(exp_cols_d.data(), exp_cols, nnz, stream);
+  raft::update_device(exp_vals_d.data(), exp_vals, nnz, stream);
+
+  ASSERT_TRUE(raft::devArrMatch<int>(
+    dst_offsets_d.data(), exp_offsets_d.data(), m + 1, raft::Compare<int>(), stream));
+  ASSERT_TRUE(raft::devArrMatch<int>(
+    dst_cols_d.data(), exp_cols_d.data(), nnz, raft::Compare<int>(), stream));
+  ASSERT_TRUE(raft::devArrMatch<float>(
+    dst_vals_d.data(), exp_vals_d.data(), nnz, raft::Compare<float>(), stream));
+}
+
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, COOToCSRTest, ::testing::ValuesIn(inputsf));
+
 /******************************** adj graph ********************************/
 
 template <typename index_t>
@@ -145,13 +209,18 @@ class CSRAdjGraphTest : public ::testing::TestWithParam<CSRAdjGraphInputs<index_
 
   void Run()
   {
-    convert::adj_to_csr<index_t>(handle,
-                                 adj.data(),
-                                 row_ind.data(),
-                                 params.n_rows,
-                                 params.n_cols,
-                                 row_counters.data(),
-                                 col_ind.data());
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        convert::adj_to_csr<index_t>(h,
+                                     adj.data(),
+                                     row_ind.data(),
+                                     params.n_rows,
+                                     params.n_cols,
+                                     row_counters.data(),
+                                     col_ind.data());
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
 
     std::vector<index_t> col_ind_host(col_ind.size());
     raft::update_host(col_ind_host.data(), col_ind.data(), col_ind.size(), stream);
@@ -355,23 +424,31 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
     auto bitmap =
       raft::core::bitmap_view<bitmap_t, index_t>(bitmap_d.data(), params.n_rows, params.n_cols);
 
-    if (params.owning) {
-      auto csr =
-        raft::make_device_csr_matrix<value_t, index_t>(handle, params.n_rows, params.n_cols, nnz);
-      auto csr_view = csr.structure_view();
-
-      bitmap.to_csr(handle, csr);
-      raft::copy(indptr_d.data(), csr_view.get_indptr().data(), indptr_d.size(), stream);
-      raft::copy(indices_d.data(), csr_view.get_indices().data(), indices_d.size(), stream);
-      raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
-    } else {
-      auto csr_view = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
-        indptr_d.data(), indices_d.data(), params.n_rows, params.n_cols, nnz);
-      auto csr = raft::make_device_csr_matrix<value_t, index_t>(handle, csr_view);
-
-      bitmap.to_csr(handle, csr);
-      raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
-    }
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if (params.owning) {  // CSR matrix created from dimensions and nnz
+          auto csr =
+            raft::make_device_csr_matrix<value_t, index_t>(h, params.n_rows, params.n_cols, nnz);
+          bitmap.to_csr(h, csr);
+          if (!resource::get_dry_run_flag(h)) {  // skip the copies while h is in dry-run mode
+            auto csr_view = csr.structure_view();
+            raft::copy(indptr_d.data(), csr_view.get_indptr().data(), indptr_d.size(), stream);
+            raft::copy(indices_d.data(), csr_view.get_indices().data(), indices_d.size(), stream);
+            raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
+          }
+        } else {  // CSR matrix built over the existing indptr_d / indices_d buffers
+          auto csr_view = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
+            indptr_d.data(), indices_d.data(), params.n_rows, params.n_cols, nnz);
+          auto csr = raft::make_device_csr_matrix<value_t, index_t>(h, csr_view);
+          bitmap.to_csr(h, csr);
+          if (!resource::get_dry_run_flag(h)) {  // skip the copy while h is in dry-run mode
+            raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
+          }
+        }
+      },
+      raft::alloc_behavior::DATA_DRIVEN,
+      sizeof(value_t) * nnz);  // bytes for nnz CSR values of type value_t (was sizeof(float))
     resource::sync_stream(handle);
 
     std::vector<index_t> indices_h(indices_expected_d.size(), 0);
@@ -645,23 +722,31 @@ class BitsetToCSRTest : public ::testing::TestWithParam<BitsetToCSRInputs<index_
   {
     auto bitset = raft::core::bitset_view<bitset_t, index_t>(bitset_d.data(), params.n_cols);
 
-    if (params.owning) {
-      auto csr =
-        raft::make_device_csr_matrix<value_t, index_t>(handle, params.n_repeat, params.n_cols, nnz);
-      auto csr_view = csr.structure_view();
-
-      bitset.to_csr(handle, csr);
-      raft::copy(indptr_d.data(), csr_view.get_indptr().data(), indptr_d.size(), stream);
-      raft::copy(indices_d.data(), csr_view.get_indices().data(), indices_d.size(), stream);
-      raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
-    } else {
-      auto csr_view = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
-        indptr_d.data(), indices_d.data(), params.n_repeat, params.n_cols, nnz);
-      auto csr = raft::make_device_csr_matrix<value_t, index_t>(handle, csr_view);
-
-      bitset.to_csr(handle, csr);
-      raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
-    }
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if (params.owning) {  // CSR matrix created from dimensions and nnz
+          auto csr =
+            raft::make_device_csr_matrix<value_t, index_t>(h, params.n_repeat, params.n_cols, nnz);
+          bitset.to_csr(h, csr);
+          if (!resource::get_dry_run_flag(h)) {  // skip the copies while h is in dry-run mode
+            auto csr_view = csr.structure_view();
+            raft::copy(indptr_d.data(), csr_view.get_indptr().data(), indptr_d.size(), stream);
+            raft::copy(indices_d.data(), csr_view.get_indices().data(), indices_d.size(), stream);
+            raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
+          }
+        } else {  // CSR matrix built over the existing indptr_d / indices_d buffers
+          auto csr_view = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
+            indptr_d.data(), indices_d.data(), params.n_repeat, params.n_cols, nnz);
+          auto csr = raft::make_device_csr_matrix<value_t, index_t>(h, csr_view);
+          bitset.to_csr(h, csr);
+          if (!resource::get_dry_run_flag(h)) {  // skip the copy while h is in dry-run mode
+            raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
+          }
+        }
+      },
+      raft::alloc_behavior::DATA_DRIVEN,
+      sizeof(value_t) * nnz);  // bytes for nnz CSR values of type value_t (was sizeof(float))
     resource::sync_stream(handle);
 
     std::vector<index_t> indices_h(indices_expected_d.size(), 0);
diff --git a/cpp/tests/sparse/csr_transpose.cu b/cpp/tests/sparse/csr_transpose.cu
index f4fc2c3a8b..0857dbfe8d 100644
--- a/cpp/tests/sparse/csr_transpose.cu
+++ b/cpp/tests/sparse/csr_transpose.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -94,17 +94,23 @@ class CSRTransposeTest : public ::testing::TestWithParam<CSRTransposeInputs<valu
 
     make_data();
 
-    raft::sparse::linalg::csr_transpose(handle,
-                                        indptr.data(),
-                                        indices.data(),
-                                        data.data(),
-                                        out_indptr.data(),
-                                        out_indices.data(),
-                                        out_data.data(),
-                                        params.nrows,
-                                        params.ncols,
-                                        params.nnz,
-                                        stream);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        raft::sparse::linalg::csr_transpose(h,
+                                            indptr.data(),
+                                            indices.data(),
+                                            data.data(),
+                                            out_indptr.data(),
+                                            out_indices.data(),
+                                            out_data.data(),
+                                            params.nrows,
+                                            params.ncols,
+                                            params.nnz,
+                                            resource::get_cuda_stream(h));
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      1);
 
     resource::sync_stream(handle, stream);
   }
diff --git a/cpp/tests/sparse/diagonal.cu b/cpp/tests/sparse/diagonal.cu
index b66cffe676..a05531cd1e 100644
--- a/cpp/tests/sparse/diagonal.cu
+++ b/cpp/tests/sparse/diagonal.cu
@@ -1,8 +1,10 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#include "../test_utils.cuh"
+
 #include <raft/core/device_coo_matrix.hpp>
 #include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_mdarray.hpp>
@@ -81,8 +83,10 @@ TEST(SparseMatrixDiagonal, GetDiagonalVectorFromCSR)
   // Create diagonal output vector
   auto diagonal_vec = raft::make_device_vector<float, int>(res, 4);
 
-  // Get diagonal (function initializes to zero internally)
-  diagonal(res, matrix.view(), diagonal_vec.view());
+  raft::execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& h) { diagonal(h, matrix.view(), diagonal_vec.view()); },
+    raft::alloc_behavior::NO_ALLOCATIONS);
 
   // Copy result back to host
   auto diagonal_host = std::vector<float>(4);
@@ -110,8 +114,12 @@ TEST(SparseMatrixDiagonal, ScaleCSRByDiagonalSymmetric)
              diagonal_data.size(),
              raft::resource::get_cuda_stream(res));
 
-  // Scale matrix by diagonal
-  scale_by_diagonal_symmetric(res, diagonal_vec.view(), matrix.view());
+  raft::execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& h) {
+      scale_by_diagonal_symmetric(h, diagonal_vec.view(), matrix.view());
+    },
+    raft::alloc_behavior::NO_ALLOCATIONS);
 
   // Copy result back to host
   auto matrix_structure = matrix.structure_view();
@@ -150,8 +158,10 @@ TEST(SparseMatrixDiagonal, SetCSRDiagonalToOnes)
   auto res    = raft::resources{};
   auto matrix = create_test_csr_matrix(res);
 
-  // Set diagonal to ones
-  set_diagonal(res, matrix.view(), 1.0f);
+  raft::execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& h) { set_diagonal(h, matrix.view(), 1.0f); },
+    raft::alloc_behavior::NO_ALLOCATIONS);
 
   // Copy result back to host
   auto matrix_structure = matrix.structure_view();
@@ -178,15 +188,15 @@ TEST(SparseMatrixDiagonal, CompleteWorkflow)
   auto res    = raft::resources{};
   auto matrix = create_test_csr_matrix(res);
 
-  // 1. Get diagonal
   auto diagonal_vec = raft::make_device_vector<float, int>(res, 4);
-  diagonal(res, matrix.view(), diagonal_vec.view());
-
-  // 2. Scale matrix by diagonal
-  scale_by_diagonal_symmetric(res, diagonal_vec.view(), matrix.view());
-
-  // 3. Set diagonal to ones
-  set_diagonal(res, matrix.view(), 1.0f);
+  raft::execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& h) {
+      diagonal(h, matrix.view(), diagonal_vec.view());
+      scale_by_diagonal_symmetric(h, diagonal_vec.view(), matrix.view());
+      set_diagonal(h, matrix.view(), 1.0f);
+    },
+    raft::alloc_behavior::NO_ALLOCATIONS);
 
   // Copy results back to host
   auto matrix_structure = matrix.structure_view();
@@ -237,11 +247,12 @@ TEST(SparseMatrixDiagonal, GetDiagonalVectorFromCOO)
   auto res    = raft::resources{};
   auto matrix = create_test_coo_matrix(res);
 
-  // Create diagonal output vector
   auto diagonal_vec = raft::make_device_vector<float, int>(res, 4);
 
-  // Get diagonal (function initializes to zero internally)
-  diagonal(res, matrix.view(), diagonal_vec.view());
+  raft::execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& h) { diagonal(h, matrix.view(), diagonal_vec.view()); },
+    raft::alloc_behavior::NO_ALLOCATIONS);
 
   // Copy result back to host
   auto diagonal_host = std::vector<float>(4);
@@ -261,7 +272,6 @@ TEST(SparseMatrixDiagonal, ScaleCOOByDiagonalSymmetric)
   auto res    = raft::resources{};
   auto matrix = create_test_coo_matrix(res);
 
-  // Create diagonal with values [2, 4, 2, 4]
   auto diagonal_data = std::vector<float>{2, 4, 2, 4};
   auto diagonal_vec  = raft::make_device_vector<float, int>(res, 4);
   raft::copy(diagonal_vec.data_handle(),
@@ -269,8 +279,12 @@ TEST(SparseMatrixDiagonal, ScaleCOOByDiagonalSymmetric)
              diagonal_data.size(),
              raft::resource::get_cuda_stream(res));
 
-  // Scale matrix by diagonal
-  scale_by_diagonal_symmetric(res, diagonal_vec.view(), matrix.view());
+  raft::execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& h) {
+      scale_by_diagonal_symmetric(h, diagonal_vec.view(), matrix.view());
+    },
+    raft::alloc_behavior::NO_ALLOCATIONS);
 
   // Copy result back to host
   auto matrix_structure = matrix.structure_view();
@@ -309,8 +323,10 @@ TEST(SparseMatrixDiagonal, SetCOODiagonalToOnes)
   auto res    = raft::resources{};
   auto matrix = create_test_coo_matrix(res);
 
-  // Set diagonal to ones
-  set_diagonal(res, matrix.view(), 1.0f);
+  raft::execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& h) { set_diagonal(h, matrix.view(), 1.0f); },
+    raft::alloc_behavior::NO_ALLOCATIONS);
 
   // Copy result back to host
   auto matrix_structure = matrix.structure_view();
@@ -337,15 +353,15 @@ TEST(SparseMatrixDiagonal, CompleteWorkflowCOO)
   auto res    = raft::resources{};
   auto matrix = create_test_coo_matrix(res);
 
-  // 1. Get diagonal
   auto diagonal_vec = raft::make_device_vector<float, int>(res, 4);
-  diagonal(res, matrix.view(), diagonal_vec.view());
-
-  // 2. Scale matrix by diagonal
-  scale_by_diagonal_symmetric(res, diagonal_vec.view(), matrix.view());
-
-  // 3. Set diagonal to ones
-  set_diagonal(res, matrix.view(), 1.0f);
+  raft::execute_with_dry_run_check(
+    res,
+    [&](raft::resources const& h) {
+      diagonal(h, matrix.view(), diagonal_vec.view());
+      scale_by_diagonal_symmetric(h, diagonal_vec.view(), matrix.view());
+      set_diagonal(h, matrix.view(), 1.0f);
+    },
+    raft::alloc_behavior::NO_ALLOCATIONS);
 
   // Copy results back to host
   auto matrix_structure = matrix.structure_view();
diff --git a/cpp/tests/sparse/filter.cu b/cpp/tests/sparse/filter.cu
index 5771f25c51..b5863d4852 100644
--- a/cpp/tests/sparse/filter.cu
+++ b/cpp/tests/sparse/filter.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -151,7 +151,13 @@ TEST_P(COORemoveScalarView, ResultView)
 
   auto scalar = raft::make_host_scalar<float>(0.0f);
 
-  op::coo_remove_scalar<128, float, int, int>(h, in_view, scalar.view(), out_matrix);
+  raft::execute_with_dry_run_check(
+    h,
+    [&](raft::resources const& h) {
+      op::coo_remove_scalar<128, float, int, int>(h, in_view, scalar.view(), out_matrix);
+    },
+    raft::alloc_behavior::DATA_DRIVEN,
+    2 * 5 * sizeof(int));
   RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
 
   auto out_nnz = out_matrix.structure_view().get_nnz();
diff --git a/cpp/tests/sparse/masked_matmul.cu b/cpp/tests/sparse/masked_matmul.cu
index 6972d76997..737b207e5e 100644
--- a/cpp/tests/sparse/masked_matmul.cu
+++ b/cpp/tests/sparse/masked_matmul.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -338,12 +338,18 @@ class MaskedMatmulTest
 
     if constexpr (bits_layout == BitsLayout::Bitmap) {
       auto mask = raft::core::bitmap_view<const bits_t, index_t>(bits_d.data(), params.m, params.n);
-      raft::sparse::linalg::masked_matmul(handle, A, B, mask, C);
+      raft::execute_with_dry_run_check(
+        handle,
+        [&](raft::resources const& h) { raft::sparse::linalg::masked_matmul(h, A, B, mask, C); },
+        raft::alloc_behavior::ARGUMENT_DRIVEN,
+        c_data_d.size() * sizeof(output_t));
     } else if constexpr (bits_layout == BitsLayout::Bitset) {
       auto mask = raft::core::bitset_view<const bits_t, index_t>(bits_d.data(), params.n);
-      raft::sparse::linalg::masked_matmul(handle, A, B, mask, C);
-    } else {
-      GTEST_SKIP() << "Unsupported BitsLayout!";
+      raft::execute_with_dry_run_check(
+        handle,
+        [&](raft::resources const& h) { raft::sparse::linalg::masked_matmul(h, A, B, mask, C); },
+        raft::alloc_behavior::ARGUMENT_DRIVEN,
+        c_data_d.size() * sizeof(output_t));
     }
 
     resource::sync_stream(handle);
diff --git a/cpp/tests/sparse/norm.cu b/cpp/tests/sparse/norm.cu
index 89fff8acba..0d73d69c2e 100644
--- a/cpp/tests/sparse/norm.cu
+++ b/cpp/tests/sparse/norm.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -52,7 +52,12 @@ class CSRRowNormTest : public ::testing::TestWithParam<CSRRowNormInputs<Type_f,
     raft::update_device(data.data(), params.data.data(), nnz, stream);
     raft::update_device(verify.data(), params.verify.data(), n_rows, stream);
 
-    linalg::rowNormCsr(handle, indptr.data(), data.data(), nnz, n_rows, result.data(), params.norm);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        linalg::rowNormCsr(h, indptr.data(), data.data(), nnz, n_rows, result.data(), params.norm);
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
     RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
 
     ASSERT_TRUE(
diff --git a/cpp/tests/sparse/preprocess.cu b/cpp/tests/sparse/preprocess.cu
index 0ad6051f66..a955a6ecc6 100644
--- a/cpp/tests/sparse/preprocess.cu
+++ b/cpp/tests/sparse/preprocess.cu
@@ -137,9 +137,21 @@ class SparsePreprocessCSR
       auto bm25_vals = raft::make_device_vector<Type_f, int64_t>(handle, int(coo_a.nnz));
       raft::util::calc_tfidf_bm25<Index_, Type_f>(handle, csr_matrix.view(), bm25_vals.view());
       if (coo_on) {
-        raft::sparse::matrix::encode_bm25<float, int>(handle, coo_a_matrix, result.view());
+        raft::execute_with_dry_run_check(
+          handle,
+          [&](raft::resources const& h) {
+            raft::sparse::matrix::encode_bm25<float, int>(h, coo_a_matrix, result.view());
+          },
+          raft::alloc_behavior::DATA_DRIVEN,
+          sizeof(float) * coo_a.nnz);
       } else {
-        raft::sparse::matrix::encode_bm25<float, int>(handle, csr_matrix, result.view());
+        raft::execute_with_dry_run_check(
+          handle,
+          [&](raft::resources const& h) {
+            raft::sparse::matrix::encode_bm25<float, int>(h, csr_matrix, result.view());
+          },
+          raft::alloc_behavior::DATA_DRIVEN,
+          sizeof(float) * coo_a.nnz);
       }
       ASSERT_TRUE(raft::devArrMatch<Type_f>(bm25_vals.data_handle(),
                                             result.data_handle(),
@@ -151,9 +163,21 @@ class SparsePreprocessCSR
       raft::util::calc_tfidf_bm25<Index_, Type_f>(
         handle, csr_matrix.view(), tfidf_vals.view(), true);
       if (coo_on) {
-        raft::sparse::matrix::encode_tfidf<float, int>(handle, coo_a_matrix, result.view());
+        raft::execute_with_dry_run_check(
+          handle,
+          [&](raft::resources const& h) {
+            raft::sparse::matrix::encode_tfidf<float, int>(h, coo_a_matrix, result.view());
+          },
+          raft::alloc_behavior::ARGUMENT_DRIVEN,
+          1);
       } else {
-        raft::sparse::matrix::encode_tfidf<float, int>(handle, csr_matrix, result.view());
+        raft::execute_with_dry_run_check(
+          handle,
+          [&](raft::resources const& h) {
+            raft::sparse::matrix::encode_tfidf<float, int>(h, csr_matrix, result.view());
+          },
+          raft::alloc_behavior::ARGUMENT_DRIVEN,
+          1);
       }
       ASSERT_TRUE(raft::devArrMatch<Type_f>(tfidf_vals.data_handle(),
                                             result.data_handle(),
diff --git a/cpp/tests/sparse/reduce.cu b/cpp/tests/sparse/reduce.cu
index 2d5fa3e041..1204edd214 100644
--- a/cpp/tests/sparse/reduce.cu
+++ b/cpp/tests/sparse/reduce.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -63,14 +63,23 @@ class SparseReduceTest : public ::testing::TestWithParam<SparseReduceInputs<valu
     raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream);
 
     raft::sparse::COO<value_t, value_idx, value_idx> out(stream);
-    raft::sparse::op::max_duplicates(handle,
-                                     out,
-                                     in_rows.data(),
-                                     in_cols.data(),
-                                     in_vals.data(),
-                                     (value_idx)params.in_rows.size(),
-                                     (value_idx)params.m,
-                                     (value_idx)params.n);
+    // min_alloc: internal workspace of max_duplicates (diff array + CUB scan workspace).
+    // The COO output itself is not tracked because `out` was created outside the wrapper.
+    auto min_alloc = (params.in_rows.size() + 1) * sizeof(value_idx);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        raft::sparse::op::max_duplicates(h,
+                                         out,
+                                         in_rows.data(),
+                                         in_cols.data(),
+                                         in_vals.data(),
+                                         (value_idx)params.in_rows.size(),
+                                         (value_idx)params.m,
+                                         (value_idx)params.n);
+      },
+      raft::alloc_behavior::DATA_DRIVEN,
+      min_alloc);
     RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
     ASSERT_TRUE(raft::devArrMatch<value_idx>(
       out_rows.data(), out.rows(), out.nnz, raft::Compare<value_idx>()));
diff --git a/cpp/tests/sparse/sddmm.cu b/cpp/tests/sparse/sddmm.cu
index 781bb423f9..bff4add929 100644
--- a/cpp/tests/sparse/sddmm.cu
+++ b/cpp/tests/sparse/sddmm.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -313,14 +313,20 @@ class SDDMMTest : public ::testing::TestWithParam<SDDMMInputs<ValueType, IndexTy
     auto op_b = params.transpose_b ? raft::linalg::Operation::TRANSPOSE
                                    : raft::linalg::Operation::NON_TRANSPOSE;
 
-    raft::sparse::linalg::sddmm(handle,
-                                a,
-                                b,
-                                c,
-                                op_a,
-                                op_b,
-                                raft::make_host_scalar_view<OutputType>(&params.alpha),
-                                raft::make_host_scalar_view<OutputType>(&params.beta));
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        raft::sparse::linalg::sddmm(h,
+                                    a,
+                                    b,
+                                    c,
+                                    op_a,
+                                    op_b,
+                                    raft::make_host_scalar_view<OutputType>(&params.alpha),
+                                    raft::make_host_scalar_view<OutputType>(&params.beta));
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      1);
 
     resource::sync_stream(handle);
 
diff --git a/cpp/tests/sparse/select_k_csr.cu b/cpp/tests/sparse/select_k_csr.cu
index 88bd20b95c..968c21e718 100644
--- a/cpp/tests/sparse/select_k_csr.cu
+++ b/cpp/tests/sparse/select_k_csr.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -287,8 +287,13 @@ class SelectKCsrTest : public ::testing::TestWithParam<SelectKCsrInputs<index_t>
     auto out_idx = raft::make_device_matrix_view<index_t, index_t, raft::row_major>(
       dst_indices_d.data(), params.n_rows, params.top_k);
 
-    raft::sparse::matrix::select_k(
-      handle, in_val, in_idx, out_val, out_idx, params.select_min, true);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        raft::sparse::matrix::select_k(
+          h, in_val, in_idx, out_val, out_idx, params.select_min, true);
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN);
 
     ASSERT_TRUE(raft::devArrMatch<index_t>(dst_indices_expected_d.data(),
                                            out_idx.data_handle(),
diff --git a/cpp/tests/sparse/solver/lanczos.cu b/cpp/tests/sparse/solver/lanczos.cu
index eaa5ffaa9c..ec1391628b 100644
--- a/cpp/tests/sparse/solver/lanczos.cu
+++ b/cpp/tests/sparse/solver/lanczos.cu
@@ -192,13 +192,19 @@ class rmat_lanczos_tests
     auto csr_matrix = raft::make_device_csr_matrix_view<ValueType, IndexType, IndexType, IndexType>(
       const_cast<ValueType*>(symmetric_coo.vals()), csr_structure);
 
-    std::get<0>(stats) = raft::sparse::solver::lanczos_compute_eigenpairs<IndexType, ValueType>(
+    raft::execute_with_dry_run_check(
       handle,
-      config,
-      csr_matrix,
-      std::make_optional(v0.view()),
-      eigenvalues.view(),
-      eigenvectors.view());
+      [&](raft::resources const& h) {
+        std::get<0>(stats) = raft::sparse::solver::lanczos_compute_eigenpairs<IndexType, ValueType>(
+          h,
+          config,
+          csr_matrix,
+          std::make_optional(v0.view()),
+          eigenvalues.view(),
+          eigenvectors.view());
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      sizeof(ValueType) * symmetric_coo.n_rows * config.ncv);
 
     ASSERT_TRUE(raft::devArrMatch<ValueType>(eigenvalues.data_handle(),
                                              expected_eigenvalues.data_handle(),
@@ -340,13 +346,19 @@ class lanczos_tests : public ::testing::TestWithParam<lanczos_inputs<IndexType,
     auto csr_matrix = raft::make_device_csr_matrix_view<ValueType, IndexType, IndexType, IndexType>(
       const_cast<ValueType*>(vals.data_handle()), csr_structure);
 
-    std::get<0>(stats) = raft::sparse::solver::lanczos_compute_eigenpairs<IndexType, ValueType>(
+    raft::execute_with_dry_run_check(
       handle,
-      config,
-      csr_matrix,
-      std::make_optional(v0.view()),
-      eigenvalues.view(),
-      eigenvectors.view());
+      [&](raft::resources const& h) {
+        std::get<0>(stats) = raft::sparse::solver::lanczos_compute_eigenpairs<IndexType, ValueType>(
+          h,
+          config,
+          csr_matrix,
+          std::make_optional(v0.view()),
+          eigenvalues.view(),
+          eigenvectors.view());
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      sizeof(ValueType) * n * config.ncv);
 
     ASSERT_TRUE(raft::devArrMatch<ValueType>(
       eigenvalues.data_handle(),
diff --git a/cpp/tests/sparse/solver/randomized_svds.cu b/cpp/tests/sparse/solver/randomized_svds.cu
index 978f836eac..763e291b5f 100644
--- a/cpp/tests/sparse/solver/randomized_svds.cu
+++ b/cpp/tests/sparse/solver/randomized_svds.cu
@@ -5,21 +5,25 @@
 
 #include "../../test_utils.cuh"
 
+#include <raft/core/copy.hpp>
 #include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
+#include <raft/core/host_mdspan.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/linalg/gemm.cuh>
+#include <raft/linalg/gemv.cuh>
+#include <raft/linalg/map.cuh>
 #include <raft/linalg/svd.cuh>
 #include <raft/sparse/solver/detail/csr_linear_operator.cuh>
 #include <raft/sparse/solver/randomized_svds.cuh>
 #include <raft/util/cudart_utils.hpp>
 
-#include <rmm/device_uvector.hpp>
-
 #include <gtest/gtest.h>
 
 #include <cstdint>
@@ -27,6 +31,13 @@
 
 namespace raft::sparse::solver {
 
+// Loose extra-byte floor: one m×n dense buffer (ignores block_size, workspaces, optional outputs).
+template <typename ValueType>
+auto randomized_svds_loose_min_alloc(int m, int n)
+{
+  return sizeof(ValueType) * static_cast<std::size_t>(m) * static_cast<std::size_t>(n);
+}
+
 // ============================================================================
 // Golden data: 20x15 sparse matrix (nnz=120), generated with cupy seed=42
 // Expected singular values computed via cupy.linalg.svd
@@ -128,7 +139,13 @@ class RandomizedSvdsTest : public ::testing::Test {
     auto U  = raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(handle, m, k);
     auto Vt = raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(handle, k, n);
 
-    sparse_randomized_svd(handle, config, csr_matrix, S.view(), U.view(), Vt.view());
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        sparse_randomized_svd(h, config, csr_matrix, S.view(), U.view(), Vt.view());
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      randomized_svds_loose_min_alloc<ValueType>(m, n));
 
     // Singular values must match golden ground truth
     ASSERT_TRUE(raft::devArrMatch<ValueType>(
@@ -250,7 +267,13 @@ class OptionalUVtTest : public ::testing::Test {
     config.seed          = 42;
 
     auto S = raft::make_device_vector<ValueType, uint32_t>(handle, k);
-    sparse_randomized_svd(handle, config, csr_matrix, S.view(), U_opt, Vt_opt);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        sparse_randomized_svd(h, config, csr_matrix, S.view(), U_opt, Vt_opt);
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      randomized_svds_loose_min_alloc<ValueType>(m, n));
     raft::update_host(S_out, S.data_handle(), k, stream);
     resource::sync_stream(handle);
   }
@@ -404,7 +427,13 @@ struct ReconstructionErrorTest : public ::testing::Test {
     auto S  = raft::make_device_vector<ValueType, uint32_t>(handle, k);
     auto U  = raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(handle, m, k);
     auto Vt = raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(handle, k, n);
-    sparse_randomized_svd(handle, config, csr_matrix, S.view(), U.view(), Vt.view());
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        sparse_randomized_svd(h, config, csr_matrix, S.view(), U.view(), Vt.view());
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      randomized_svds_loose_min_alloc<ValueType>(m, n));
 
     // Reconstruct: recon = U @ diag(S) @ Vt
     // First: US = U * S (scale columns of U by S)
@@ -495,43 +524,26 @@ struct mean_centered_operator {
              raft::device_matrix_view<const ValueType, uint32_t, raft::col_major> X,
              raft::device_matrix_view<ValueType, uint32_t, raft::col_major> Y) const
   {
-    auto stream = raft::resource::get_cuda_stream(handle);
-    auto cublas = raft::resource::get_cublas_handle(handle);
-    int bk      = X.extent(1);
+    auto bk = static_cast<uint32_t>(X.extent(1));
     base_op_.apply(handle, X, Y);
-    rmm::device_uvector<ValueType> corr(bk, stream);
-    rmm::device_uvector<ValueType> ones(m_, stream);
-    std::vector<ValueType> h(m_, 1);
-    raft::update_device(ones.data(), h.data(), m_, stream);
-    ValueType a1 = 1, a0 = 0, am1 = -1;
-    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas,
-                                                     CUBLAS_OP_T,
-                                                     n_,
-                                                     bk,
-                                                     &a1,
-                                                     X.data_handle(),
-                                                     n_,
-                                                     col_means_,
-                                                     1,
-                                                     &a0,
-                                                     corr.data(),
-                                                     1,
-                                                     stream));
-    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas,
-                                                     CUBLAS_OP_N,
-                                                     CUBLAS_OP_N,
-                                                     m_,
-                                                     bk,
-                                                     1,
-                                                     &am1,
-                                                     ones.data(),
-                                                     m_,
-                                                     corr.data(),
-                                                     1,
-                                                     &a1,
-                                                     Y.data_handle(),
-                                                     m_,
-                                                     stream));
+    auto corr = raft::make_device_vector<ValueType, uint32_t>(handle, bk);
+    auto ones = raft::make_device_vector<ValueType, uint32_t>(handle, m_);
+    raft::linalg::map(handle, ones.view(), raft::const_op{ValueType{1}});
+
+    ValueType neg_one = -1, one = 1;
+    // Same col-major buffer as X; row_major tag selects gemv transpose (mean^T @ X).
+    auto X_t = raft::make_device_matrix_view<const ValueType, uint32_t, raft::row_major>(
+      X.data_handle(), X.extent(0), X.extent(1));
+    raft::linalg::gemv(handle,
+                       X_t,
+                       raft::make_device_vector_view<const ValueType, uint32_t>(col_means_, n_),
+                       corr.view());
+    raft::linalg::gemm(handle,
+                       raft::reshape(ones.view(), raft::make_extents<uint32_t>(m_, 1)),
+                       raft::reshape(corr.view(), raft::make_extents<uint32_t>(1, bk)),
+                       Y,
+                       std::make_optional(raft::make_host_scalar_view(&neg_one)),
+                       std::make_optional(raft::make_host_scalar_view(&one)));
   }
 
   // Z = (A - 1*mean^T)^T @ X = A^T@X - mean * (1^T @ X)
@@ -539,43 +551,24 @@ struct mean_centered_operator {
                        raft::device_matrix_view<const ValueType, uint32_t, raft::col_major> X,
                        raft::device_matrix_view<ValueType, uint32_t, raft::col_major> Z) const
   {
-    auto stream = raft::resource::get_cuda_stream(handle);
-    auto cublas = raft::resource::get_cublas_handle(handle);
-    int bk      = X.extent(1);
+    auto bk = static_cast<uint32_t>(X.extent(1));
     base_op_.apply_transpose(handle, X, Z);
-    rmm::device_uvector<ValueType> sums(bk, stream);
-    rmm::device_uvector<ValueType> ones(m_, stream);
-    std::vector<ValueType> h(m_, 1);
-    raft::update_device(ones.data(), h.data(), m_, stream);
-    ValueType a1 = 1, a0 = 0, am1 = -1;
-    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas,
-                                                     CUBLAS_OP_T,
-                                                     m_,
-                                                     bk,
-                                                     &a1,
-                                                     X.data_handle(),
-                                                     m_,
-                                                     ones.data(),
-                                                     1,
-                                                     &a0,
-                                                     sums.data(),
-                                                     1,
-                                                     stream));
-    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas,
-                                                     CUBLAS_OP_N,
-                                                     CUBLAS_OP_N,
-                                                     n_,
-                                                     bk,
-                                                     1,
-                                                     &am1,
-                                                     col_means_,
-                                                     n_,
-                                                     sums.data(),
-                                                     1,
-                                                     &a1,
-                                                     Z.data_handle(),
-                                                     n_,
-                                                     stream));
+    auto sums = raft::make_device_vector<ValueType, uint32_t>(handle, bk);
+    auto ones = raft::make_device_vector<ValueType, uint32_t>(handle, m_);
+    raft::linalg::map(handle, ones.view(), raft::const_op{ValueType{1}});
+
+    ValueType neg_one = -1, one = 1;
+    // Same col-major buffer as X; row_major tag selects gemv transpose (ones^T @ X).
+    auto X_t = raft::make_device_matrix_view<const ValueType, uint32_t, raft::row_major>(
+      X.data_handle(), X.extent(0), X.extent(1));
+    raft::linalg::gemv(handle, X_t, raft::make_const_mdspan(ones.view()), sums.view());
+    raft::linalg::gemm(handle,
+                       raft::reshape(raft::make_device_vector_view(col_means_, n_),
+                                     raft::make_extents<uint32_t>(n_, 1)),
+                       raft::reshape(sums.view(), raft::make_extents<uint32_t>(1, bk)),
+                       Z,
+                       std::make_optional(raft::make_host_scalar_view(&neg_one)),
+                       std::make_optional(raft::make_host_scalar_view(&one)));
   }
 };
 
@@ -663,7 +656,13 @@ class MeanCenteredOperatorTest : public ::testing::Test {
     auto S  = raft::make_device_vector<ValueType, uint32_t>(handle, k);
     auto U  = raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(handle, m, k);
     auto Vt = raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(handle, k, n);
-    sparse_randomized_svd(handle, config, op, S.view(), U.view(), Vt.view());
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        sparse_randomized_svd(h, config, op, S.view(), U.view(), Vt.view());
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      randomized_svds_loose_min_alloc<ValueType>(m, n));
 
     // Singular values must match dense centered ground truth
     ASSERT_TRUE(raft::devArrMatch<ValueType>(
diff --git a/cpp/tests/sparse/spmm.cu b/cpp/tests/sparse/spmm.cu
index d3df89aecb..72f61e5d2c 100644
--- a/cpp/tests/sparse/spmm.cu
+++ b/cpp/tests/sparse/spmm.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -189,8 +189,16 @@ class SpmmTest : public ::testing::TestWithParam<SpmmInputs<T>> {
                                               ldz,
                                               params.row_major);
 
-    spmm(
-      handle, params.trans_x, params.trans_y, &alpha, X_csr, y_stride_view, &beta, z_stride_view);
+    // min_alloc: the actual contiguous span of the strided z matrix (what spmm allocates for z_tmp)
+    auto z_span = params.row_major ? (size_t(params.M) - 1) * ldz + params.N
+                                   : (size_t(params.N) - 1) * ldz + params.M;
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        spmm(h, params.trans_x, params.trans_y, &alpha, X_csr, y_stride_view, &beta, z_stride_view);
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      z_span * sizeof(T));
 
     resource::sync_stream(handle, stream);
 
diff --git a/cpp/tests/sparse/symmetrize.cu b/cpp/tests/sparse/symmetrize.cu
index 17c3390f2a..ba4f898cfe 100644
--- a/cpp/tests/sparse/symmetrize.cu
+++ b/cpp/tests/sparse/symmetrize.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -94,8 +94,14 @@ class SparseSymmetrizeTest
 
     raft::sparse::COO<value_t, value_idx, nnz_t> out(stream);
 
-    raft::sparse::linalg::symmetrize(
-      handle, coo_rows.data(), indices.data(), data.data(), m, n, coo_rows.size(), out);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        raft::sparse::linalg::symmetrize(
+          h, coo_rows.data(), indices.data(), data.data(), m, n, coo_rows.size(), out);
+      },
+      raft::alloc_behavior::DATA_DRIVEN,
+      nnz * 2 * (2 * sizeof(value_idx) + sizeof(value_t)));
 
     rmm::device_scalar<value_idx> sum(stream);
     sum.set_value_to_zero_async(stream);
diff --git a/cpp/tests/stats/cov.cu b/cpp/tests/stats/cov.cu
index 761fbb073a..b7ca455a97 100644
--- a/cpp/tests/stats/cov.cu
+++ b/cpp/tests/stats/cov.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -64,21 +64,31 @@ class CovTest : public ::testing::TestWithParam<CovInputs<T>> {
     if (params.rowMajor) {
       using layout = raft::row_major;
       raft::stats::mean<true>(mean_act.data(), data.data(), cols, rows, stream);
-      cov(handle,
-          raft::make_device_matrix_view<T, std::uint32_t, layout>(data.data(), rows, cols),
-          raft::make_device_vector_view<const T, std::uint32_t>(mean_act.data(), cols),
-          raft::make_device_matrix_view<T, std::uint32_t, layout>(cov_act.data(), cols, cols),
-          params.sample,
-          params.stable);
+      raft::execute_with_dry_run_check(
+        handle,
+        [&](raft::resources const& h) {
+          cov(h,
+              raft::make_device_matrix_view<T, std::uint32_t, layout>(data.data(), rows, cols),
+              raft::make_device_vector_view<const T, std::uint32_t>(mean_act.data(), cols),
+              raft::make_device_matrix_view<T, std::uint32_t, layout>(cov_act.data(), cols, cols),
+              params.sample,
+              params.stable);
+        },
+        raft::alloc_behavior::NO_ALLOCATIONS);
     } else {
       using layout = raft::col_major;
       raft::stats::mean<false>(mean_act.data(), data.data(), cols, rows, stream);
-      cov(handle,
-          raft::make_device_matrix_view<T, std::uint32_t, layout>(data.data(), rows, cols),
-          raft::make_device_vector_view<const T, std::uint32_t>(mean_act.data(), cols),
-          raft::make_device_matrix_view<T, std::uint32_t, layout>(cov_act.data(), cols, cols),
-          params.sample,
-          params.stable);
+      raft::execute_with_dry_run_check(
+        handle,
+        [&](raft::resources const& h) {
+          cov(h,
+              raft::make_device_matrix_view<T, std::uint32_t, layout>(data.data(), rows, cols),
+              raft::make_device_vector_view<const T, std::uint32_t>(mean_act.data(), cols),
+              raft::make_device_matrix_view<T, std::uint32_t, layout>(cov_act.data(), cols, cols),
+              params.sample,
+              params.stable);
+        },
+        raft::alloc_behavior::NO_ALLOCATIONS);
     }
 
     T data_h[6]       = {1.0, 2.0, 5.0, 4.0, 2.0, 1.0};
diff --git a/cpp/tests/stats/homogeneity_score.cu b/cpp/tests/stats/homogeneity_score.cu
index 90b456d32a..0590913af2 100644
--- a/cpp/tests/stats/homogeneity_score.cu
+++ b/cpp/tests/stats/homogeneity_score.cu
@@ -1,10 +1,11 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #include "../test_utils.cuh"
 
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/stats/entropy.cuh>
 #include <raft/stats/homogeneity_score.cuh>
 #include <raft/stats/mutual_info_score.cuh>
 #include <raft/util/cudart_utils.hpp>
diff --git a/cpp/tests/stats/mean.cu b/cpp/tests/stats/mean.cu
index 4323d2f0bd..72d5b3c4ea 100644
--- a/cpp/tests/stats/mean.cu
+++ b/cpp/tests/stats/mean.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -59,17 +59,22 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
   void meanSGtest(T* data, cudaStream_t stream)
   {
     int rows = params.rows, cols = params.cols;
-    if (params.rowMajor) {
-      using layout = raft::row_major;
-      mean(handle,
-           raft::make_device_matrix_view<const T, int, layout>(data, rows, cols),
-           raft::make_device_vector_view<T, int>(mean_act.data(), cols));
-    } else {
-      using layout = raft::col_major;
-      mean(handle,
-           raft::make_device_matrix_view<const T, int, layout>(data, rows, cols),
-           raft::make_device_vector_view<T, int>(mean_act.data(), cols));
-    }
+    raft::execute_with_dry_run_check(  // mean() goes through a dry-run pass and a real pass
+      handle,
+      [&](raft::resources const& h) {
+        if (params.rowMajor) {
+          using layout = raft::row_major;
+          mean(h,
+               raft::make_device_matrix_view<const T, int, layout>(data, rows, cols),
+               raft::make_device_vector_view<T, int>(mean_act.data(), cols));
+        } else {
+          using layout = raft::col_major;
+          mean(h,
+               raft::make_device_matrix_view<const T, int, layout>(data, rows, cols),
+               raft::make_device_vector_view<T, int>(mean_act.data(), cols));
+        }
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN);  // dry-run counters must equal the real peaks
   }
 
  protected:
diff --git a/cpp/tests/stats/meanvar.cu b/cpp/tests/stats/meanvar.cu
index 92f133e9c2..6cd6603890 100644
--- a/cpp/tests/stats/meanvar.cu
+++ b/cpp/tests/stats/meanvar.cu
@@ -59,23 +59,31 @@ class MeanVarTest : public ::testing::TestWithParam<MeanVarInputs<T>> {
     random::RngState r(params.seed);
     normal(handle, r, data.data(), params.cols * params.rows, params.mean, params.stddev);
 
-    if (params.rowMajor) {
-      using layout = raft::row_major;
-      meanvar(
-        handle,
-        raft::make_device_matrix_view<const T, int, layout>(data.data(), params.rows, params.cols),
-        raft::make_device_vector_view<T, int>(mean_act.data(), params.cols),
-        raft::make_device_vector_view<T, int>(vars_act.data(), params.cols),
-        params.sample);
-    } else {
-      using layout = raft::col_major;
-      meanvar(
-        handle,
-        raft::make_device_matrix_view<const T, int, layout>(data.data(), params.rows, params.cols),
-        raft::make_device_vector_view<T, int>(mean_act.data(), params.cols),
-        raft::make_device_vector_view<T, int>(vars_act.data(), params.cols),
-        params.sample);
-    }
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if (params.rowMajor) {
+          using layout = raft::row_major;
+          meanvar(h,
+                  raft::make_device_matrix_view<const T, int, layout>(
+                    data.data(), params.rows, params.cols),
+                  raft::make_device_vector_view<T, int>(mean_act.data(), params.cols),
+                  raft::make_device_vector_view<T, int>(vars_act.data(), params.cols),
+                  params.sample);
+        } else {
+          using layout = raft::col_major;
+          meanvar(h,
+                  raft::make_device_matrix_view<const T, int, layout>(
+                    data.data(), params.rows, params.cols),
+                  raft::make_device_vector_view<T, int>(mean_act.data(), params.cols),
+                  raft::make_device_vector_view<T, int>(vars_act.data(), params.cols),
+                  params.sample);
+        }
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN,
+      params.rowMajor ? sizeof(raft::stats::detail::mean_var<T>) * params.cols +
+                          sizeof(int) * raft::ceildiv(params.cols, 32)
+                      : 0);
     RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
   }
 
diff --git a/cpp/tests/stats/minmax.cu b/cpp/tests/stats/minmax.cu
index 8d17b13e01..0322c41c30 100644
--- a/cpp/tests/stats/minmax.cu
+++ b/cpp/tests/stats/minmax.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -116,15 +116,20 @@ class MinMaxTest : public ::testing::TestWithParam<MinMaxInputs<T>> {
                 minmax_ref.data(),
                 minmax_ref.data() + params.cols,
                 stream);
-    raft::stats::minmax<T, int>(
+    raft::execute_with_dry_run_check(
       handle,
-      raft::make_device_matrix_view<const T, int, raft::layout_f_contiguous>(
-        data.data(), params.rows, params.cols),
-      std::nullopt,
-      std::nullopt,
-      raft::make_device_vector_view<T, int>(minmax_act.data(), params.cols),
-      raft::make_device_vector_view<T, int>(minmax_act.data() + params.cols, params.cols),
-      std::nullopt);
+      [&](raft::resources const& h) {
+        raft::stats::minmax<T, int>(
+          h,
+          raft::make_device_matrix_view<const T, int, raft::layout_f_contiguous>(
+            data.data(), params.rows, params.cols),
+          std::nullopt,
+          std::nullopt,
+          raft::make_device_vector_view<T, int>(minmax_act.data(), params.cols),
+          raft::make_device_vector_view<T, int>(minmax_act.data() + params.cols, params.cols),
+          std::nullopt);
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
   }
 
  protected:
diff --git a/cpp/tests/stats/stddev.cu b/cpp/tests/stats/stddev.cu
index fcdb4baca2..c8031747ca 100644
--- a/cpp/tests/stats/stddev.cu
+++ b/cpp/tests/stats/stddev.cu
@@ -66,41 +66,46 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
   {
     int rows = params.rows, cols = params.cols;
 
-    if (params.rowMajor) {
-      using layout_t = raft::row_major;
-      mean(handle,
-           raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
-           raft::make_device_vector_view<T, int>(mean_act.data(), cols));
-
-      stddev(handle,
-             raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
-             raft::make_device_vector_view<const T, int>(mean_act.data(), cols),
-             raft::make_device_vector_view<T, int>(stddev_act.data(), cols),
-             params.sample);
-
-      vars(handle,
-           raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
-           raft::make_device_vector_view<const T, int>(mean_act.data(), cols),
-           raft::make_device_vector_view<T, int>(vars_act.data(), cols),
-           params.sample);
-    } else {
-      using layout_t = raft::col_major;
-      mean(handle,
-           raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
-           raft::make_device_vector_view<T>(mean_act.data(), cols));
-
-      stddev(handle,
-             raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
-             raft::make_device_vector_view<const T, int>(mean_act.data(), cols),
-             raft::make_device_vector_view<T, int>(stddev_act.data(), cols),
-             params.sample);
-
-      vars(handle,
-           raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
-           raft::make_device_vector_view<const T, int>(mean_act.data(), cols),
-           raft::make_device_vector_view<T, int>(vars_act.data(), cols),
-           params.sample);
-    }
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if (params.rowMajor) {
+          using layout_t = raft::row_major;
+          mean(h,
+               raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
+               raft::make_device_vector_view<T, int>(mean_act.data(), cols));
+
+          stddev(h,
+                 raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
+                 raft::make_device_vector_view<const T, int>(mean_act.data(), cols),
+                 raft::make_device_vector_view<T, int>(stddev_act.data(), cols),
+                 params.sample);
+
+          vars(h,
+               raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
+               raft::make_device_vector_view<const T, int>(mean_act.data(), cols),
+               raft::make_device_vector_view<T, int>(vars_act.data(), cols),
+               params.sample);
+        } else {
+          using layout_t = raft::col_major;
+          mean(h,
+               raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
+               raft::make_device_vector_view<T>(mean_act.data(), cols));
+
+          stddev(h,
+                 raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
+                 raft::make_device_vector_view<const T, int>(mean_act.data(), cols),
+                 raft::make_device_vector_view<T, int>(stddev_act.data(), cols),
+                 params.sample);
+
+          vars(h,
+               raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
+               raft::make_device_vector_view<const T, int>(mean_act.data(), cols),
+               raft::make_device_vector_view<T, int>(vars_act.data(), cols),
+               params.sample);
+        }
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN);
     T scalar = T(1);
     raft::matrix::weighted_sqrt(
       handle,
diff --git a/cpp/tests/stats/sum.cu b/cpp/tests/stats/sum.cu
index 711c90acf7..58f949854a 100644
--- a/cpp/tests/stats/sum.cu
+++ b/cpp/tests/stats/sum.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -76,17 +76,22 @@ class SumTest : public ::testing::TestWithParam<SumInputs<T>> {
 
     raft::update_device(data.data(), data_h.data(), len, stream);
 
-    if (params.rowMajor) {
-      using layout = raft::row_major;
-      sum(handle,
-          raft::make_device_matrix_view<const T, int, layout>(data.data(), rows, cols),
-          raft::make_device_vector_view(sum_act.data(), cols));
-    } else {
-      using layout = raft::col_major;
-      sum(handle,
-          raft::make_device_matrix_view<const T, int, layout>(data.data(), rows, cols),
-          raft::make_device_vector_view(sum_act.data(), cols));
-    }
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if (params.rowMajor) {
+          using layout = raft::row_major;
+          sum(h,
+              raft::make_device_matrix_view<const T, int, layout>(data.data(), rows, cols),
+              raft::make_device_vector_view(sum_act.data(), cols));
+        } else {
+          using layout = raft::col_major;
+          sum(h,
+              raft::make_device_matrix_view<const T, int, layout>(data.data(), rows, cols),
+              raft::make_device_vector_view(sum_act.data(), cols));
+        }
+      },
+      raft::alloc_behavior::ARGUMENT_DRIVEN);
     resource::sync_stream(handle, stream);
 
     double expected = double(params.rows) * params.value;
diff --git a/cpp/tests/stats/weighted_mean.cu b/cpp/tests/stats/weighted_mean.cu
index 05640b530e..78a919eb49 100644
--- a/cpp/tests/stats/weighted_mean.cu
+++ b/cpp/tests/stats/weighted_mean.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -89,17 +89,20 @@ class RowWeightedMeanTest : public ::testing::TestWithParam<WeightedMeanInputs<T
     auto weights =
       raft::make_device_vector_view<const T, std::uint32_t>(dweights.data().get(), cols);
 
-    if (params.row_major) {
-      auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::row_major>(
-        din.data().get(), rows, cols);
-      // compute result
-      row_weighted_mean(handle, input, weights, output);
-    } else {
-      auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::col_major>(
-        din.data().get(), rows, cols);
-      // compute result
-      row_weighted_mean(handle, input, weights, output);
-    }
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if (params.row_major) {
+          auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::row_major>(
+            din.data().get(), rows, cols);
+          row_weighted_mean(h, input, weights, output);
+        } else {
+          auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::col_major>(
+            din.data().get(), rows, cols);
+          row_weighted_mean(h, input, weights, output);
+        }
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
 
     // adjust tolerance to account for round-off accumulation
     params.tolerance *= params.N;
@@ -164,17 +167,20 @@ class ColWeightedMeanTest : public ::testing::TestWithParam<WeightedMeanInputs<T
     auto output = raft::make_device_vector_view<T, std::uint32_t>(dact.data().get(), cols);
     auto weights =
       raft::make_device_vector_view<const T, std::uint32_t>(dweights.data().get(), rows);
-    if (params.row_major) {
-      auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::row_major>(
-        din.data().get(), rows, cols);
-      // compute result
-      col_weighted_mean(handle, input, weights, output);
-    } else {
-      auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::col_major>(
-        din.data().get(), rows, cols);
-      // compute result
-      col_weighted_mean(handle, input, weights, output);
-    }
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if (params.row_major) {
+          auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::row_major>(
+            din.data().get(), rows, cols);
+          col_weighted_mean(h, input, weights, output);
+        } else {
+          auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::col_major>(
+            din.data().get(), rows, cols);
+          col_weighted_mean(h, input, weights, output);
+        }
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
     // adjust tolerance to account for round-off accumulation
     params.tolerance *= params.M;
   }
@@ -222,25 +228,28 @@ class WeightedMeanTest : public ::testing::TestWithParam<WeightedMeanInputs<T>>
     auto output = raft::make_device_vector_view<T, std::uint32_t>(dact.data().get(), mean_size);
     auto weights =
       raft::make_device_vector_view<const T, std::uint32_t>(dweights.data().get(), weight_size);
-    if (params.row_major) {
-      auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::row_major>(
-        din.data().get(), rows, cols);
-      // compute result
-      if (params.along_rows) {
-        weighted_mean<Apply::ALONG_ROWS>(handle, input, weights, output);
-      } else {
-        weighted_mean<Apply::ALONG_COLUMNS>(handle, input, weights, output);
-      }
-    } else {
-      auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::col_major>(
-        din.data().get(), rows, cols);
-      // compute result
-      if (params.along_rows) {
-        weighted_mean<Apply::ALONG_ROWS>(handle, input, weights, output);
-      } else {
-        weighted_mean<Apply::ALONG_COLUMNS>(handle, input, weights, output);
-      }
-    }
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) {
+        if (params.row_major) {
+          auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::row_major>(
+            din.data().get(), rows, cols);
+          if (params.along_rows) {
+            weighted_mean<Apply::ALONG_ROWS>(h, input, weights, output);
+          } else {
+            weighted_mean<Apply::ALONG_COLUMNS>(h, input, weights, output);
+          }
+        } else {
+          auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::col_major>(
+            din.data().get(), rows, cols);
+          if (params.along_rows) {
+            weighted_mean<Apply::ALONG_ROWS>(h, input, weights, output);
+          } else {
+            weighted_mean<Apply::ALONG_COLUMNS>(h, input, weights, output);
+          }
+        }
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
     // adjust tolerance to account for round-off accumulation
     params.tolerance *= params.N;
   }
diff --git a/cpp/tests/test_utils.cuh b/cpp/tests/test_utils.cuh
index 646eb46a54..e7283be176 100644
--- a/cpp/tests/test_utils.cuh
+++ b/cpp/tests/test_utils.cuh
@@ -7,11 +7,17 @@
 
 #include "test_utils.h"
 
+#include <raft/core/dry_run_resources.hpp>
+#include <raft/core/memory_stats_resources.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
 #include <raft/random/rng.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
 #include <rmm/exec_policy.hpp>
+#include <rmm/mr/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/iterator>
 #include <thrust/for_each.h>
@@ -327,4 +333,96 @@ inline std::vector<float> read_csv(std::string filename, bool skip_first_n_colum
   return result;
 }
 
+enum class alloc_behavior {
+  NO_ALLOCATIONS,   // every per-resource counter must be zero in both the dry run and real run
+  ARGUMENT_DRIVEN,  // every dry-run counter must equal the corresponding real-run peak
+  DATA_DRIVEN,      // every dry-run counter must be >= the corresponding real-run peak
+};
+
+/**
+ * @brief Run @p action in dry-run mode and for real, then compare the recorded memory usage.
+ *
+ * The dry-run totals come from dry_run_execute; the real peaks from a memory_stats_resources
+ * wrapper around @p res, read after a stream sync. Mismatches are printed before the checks.
+ *
+ * @tparam Action callable with signature void(raft::resources const&)
+ * @param behavior how dry-run and real counters are compared (zero / equal / dry >= real)
+ * @param min_alloc lower bound in bytes that both the dry-run and the real total must reach
+ */
+template <typename Action>
+void execute_with_dry_run_check(raft::resources const& res,
+                                Action&& action,
+                                alloc_behavior behavior,
+                                std::size_t min_alloc = 0)
+{
+  auto dry = raft::util::dry_run_execute(res, action);  // pass 1: predicted usage
+
+  raft::memory_stats_resources stat_res(res);
+  std::forward<Action>(action)(static_cast<const raft::resources&>(stat_res));  // pass 2: real
+  resource::sync_stream(stat_res);
+  auto actual = stat_res.get_bytes_peak();  // per-resource peaks recorded during pass 2
+
+  auto total_dry    = dry.total();
+  auto total_actual = actual.total();
+
+  if (dry.device_workspace != actual.device_workspace ||
+      dry.device_large_workspace != actual.device_large_workspace ||
+      dry.device_global != actual.device_global || dry.device_managed != actual.device_managed ||
+      dry.host != actual.host || dry.host_pinned != actual.host_pinned) {
+    printf(  // diagnostic only; the EXPECT_* checks below decide pass/fail
+      "  dry-run: ws=%zu large_ws=%zu global=%zu managed=%zu host=%zu pinned=%zu (total=%zu)\n"
+      "  actual:  ws=%zu large_ws=%zu global=%zu managed=%zu host=%zu pinned=%zu (total=%zu)\n",
+      dry.device_workspace,
+      dry.device_large_workspace,
+      dry.device_global,
+      dry.device_managed,
+      dry.host,
+      dry.host_pinned,
+      total_dry,
+      actual.device_workspace,
+      actual.device_large_workspace,
+      actual.device_global,
+      actual.device_managed,
+      actual.host,
+      actual.host_pinned,
+      total_actual);
+  }
+
+  EXPECT_GE(total_actual, min_alloc);  // applied for every behavior, before the switch
+  EXPECT_GE(total_dry, min_alloc);
+
+  switch (behavior) {
+    case alloc_behavior::NO_ALLOCATIONS:
+      EXPECT_EQ(dry.device_workspace, std::size_t{0});
+      EXPECT_EQ(dry.device_large_workspace, std::size_t{0});
+      EXPECT_EQ(dry.device_global, std::size_t{0});
+      EXPECT_EQ(dry.device_managed, std::size_t{0});
+      EXPECT_EQ(dry.host, std::size_t{0});
+      EXPECT_EQ(dry.host_pinned, std::size_t{0});
+      EXPECT_EQ(actual.device_workspace, std::size_t{0});
+      EXPECT_EQ(actual.device_large_workspace, std::size_t{0});
+      EXPECT_EQ(actual.device_global, std::size_t{0});
+      EXPECT_EQ(actual.device_managed, std::size_t{0});
+      EXPECT_EQ(actual.host, std::size_t{0});
+      EXPECT_EQ(actual.host_pinned, std::size_t{0});
+      break;
+    case alloc_behavior::ARGUMENT_DRIVEN:
+      EXPECT_EQ(dry.device_workspace, actual.device_workspace);
+      EXPECT_EQ(dry.device_large_workspace, actual.device_large_workspace);
+      EXPECT_EQ(dry.device_global, actual.device_global);
+      EXPECT_EQ(dry.device_managed, actual.device_managed);
+      EXPECT_EQ(dry.host, actual.host);
+      EXPECT_EQ(dry.host_pinned, actual.host_pinned);
+      break;
+    case alloc_behavior::DATA_DRIVEN:  // the dry run may over-estimate but never under-estimate
+      EXPECT_GE(dry.device_workspace, actual.device_workspace);
+      EXPECT_GE(dry.device_large_workspace, actual.device_large_workspace);
+      EXPECT_GE(dry.device_global, actual.device_global);
+      EXPECT_GE(dry.device_managed, actual.device_managed);
+      EXPECT_GE(dry.host, actual.host);
+      EXPECT_GE(dry.host_pinned, actual.host_pinned);
+      break;
+  }
+}
+
 };  // end namespace raft
diff --git a/cpp/tests/util/bitonic_sort.cu b/cpp/tests/util/bitonic_sort.cu
index 742cf8aeda..5f0658643f 100644
--- a/cpp/tests/util/bitonic_sort.cu
+++ b/cpp/tests/util/bitonic_sort.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -143,8 +143,13 @@ class BitonicTest : public testing::TestWithParam<test_spec> {  // NOLINT
     fill_random(arr_d);
     update_host(in.data(), arr_d.data(), arr_d.size(), stream);
 
-    // calculate the results
-    bitonic_launch<kMaxCapacity>::run(spec, arr_d.data(), stream);
+    // calculate the results (verify dry-run compliance of bitonic sort launch)
+    raft::execute_with_dry_run_check(
+      handle_,
+      [&](raft::resources const&) {
+        bitonic_launch<kMaxCapacity>::run(spec, arr_d.data(), stream);
+      },
+      raft::alloc_behavior::NO_ALLOCATIONS);
     update_host(out.data(), arr_d.data(), arr_d.size(), stream);
 
     // make sure the results are available on host
diff --git a/cpp/tests/util/dry_run_guards.cu b/cpp/tests/util/dry_run_guards.cu
new file mode 100644
index 0000000000..e07e8f1ff0
--- /dev/null
+++ b/cpp/tests/util/dry_run_guards.cu
@@ -0,0 +1,268 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <raft/core/copy.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/dry_run_resources.hpp>
+#include <raft/core/host_mdspan.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/linalg/add.cuh>
+#include <raft/linalg/map.cuh>
+#include <raft/linalg/norm.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/stats/mean.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <gtest/gtest.h>
+
+#include <cstddef>
+
+namespace raft::util {
+
+// ===================================================================
+// Test Category 1: No CUDA stream activity during dry-run
+// ===================================================================
+
+/**
+ * @brief Verify that dry-run guards prevent actual kernel execution.
+ *
+ * Strategy: fill a device array with a known value, run a RAFT function
+ * under dry-run mode that would overwrite it, then read back and confirm
+ * the original value is untouched.
+ */
+TEST(DryRunGuard, AddDoesNotExecute)
+{
+  raft::resources res;
+  constexpr int n = 256;
+
+  auto a   = raft::make_device_vector<float>(res, n);
+  auto b   = raft::make_device_vector<float>(res, n);
+  auto out = raft::make_device_vector<float>(res, n);
+
+  // Operands are 1 and 2; `out` holds a sentinel (99) that a real add() would replace with 3.
+  raft::linalg::map(res, a.view(), raft::const_op<float>{1.0f});
+  raft::linalg::map(res, b.view(), raft::const_op<float>{2.0f});
+  raft::linalg::map(res, out.view(), raft::const_op<float>{99.0f});
+  resource::sync_stream(res);
+
+  // Enable dry-run; add would write 3.0 to out
+  resource::set_dry_run_flag(res, true);
+  raft::linalg::add(
+    res, raft::make_const_mdspan(a.view()), raft::make_const_mdspan(b.view()), out.view());
+  resource::set_dry_run_flag(res, false);
+  resource::sync_stream(res);
+
+  // Copy the output back to the host and verify the sentinel survived in every element
+  std::vector<float> h_out(n);
+  auto out_src_view = raft::make_const_mdspan(out.view());
+  auto out_dst_view = raft::make_host_vector_view<float, int>(h_out.data(), n);
+  raft::copy(res, out_dst_view, out_src_view);
+  resource::sync_stream(res);
+  for (int i = 0; i < n; ++i) {
+    EXPECT_FLOAT_EQ(h_out[i], 99.0f) << "at index " << i;
+  }
+}
+
+TEST(DryRunGuard, RngDoesNotExecute)
+{
+  raft::resources res;
+  constexpr int n = 1024;
+
+  // Zero-fill the output; a uniform(-1, 1) draw that actually ran would overwrite the zeros.
+  auto out = raft::make_device_vector<float>(res, n);
+  raft::linalg::map(res, out.view(), raft::const_op<float>{0.0f});
+  resource::sync_stream(res);
+
+  raft::random::RngState state(42);
+
+  resource::set_dry_run_flag(res, true);
+  raft::random::uniform(res, state, out.view(), -1.0f, 1.0f);
+  resource::set_dry_run_flag(res, false);
+  resource::sync_stream(res);
+
+  std::vector<float> h_out(n);
+  auto out_src_view = raft::make_const_mdspan(out.view());
+  auto out_dst_view = raft::make_host_vector_view<float, int>(h_out.data(), n);
+  raft::copy(res, out_dst_view, out_src_view);
+  resource::sync_stream(res);
+  for (int i = 0; i < n; ++i) {
+    EXPECT_FLOAT_EQ(h_out[i], 0.0f) << "at index " << i;
+  }
+}
+
+// ===================================================================
+// Test Category 2: Accurate allocation tracking
+// ===================================================================
+
+TEST(DryRunAllocTracking, DeviceUvectorTracked)
+{
+  constexpr std::size_t kBytes = std::size_t{16} << 20;  // 16 MiB
+  constexpr std::size_t kCount = kBytes / sizeof(float);
+
+  raft::resources handle;
+  auto peak = dry_run_execute(handle, [&](raft::resources const& dry) {
+    // A plain rmm::device_uvector is expected to be counted by the dry-run memory resource.
+    rmm::device_uvector<float> scratch(kCount, resource::get_cuda_stream(dry));
+  });
+
+  // Lower bound only: the recorded size may have been rounded up by rmm.
+  EXPECT_GE(peak.device_global, kBytes);
+}
+
+TEST(DryRunAllocTracking, MakeDeviceArrayTracked)
+{
+  constexpr int kRows = 1024;
+  constexpr int kCols = 512;
+
+  raft::resources handle;
+  auto peak = dry_run_execute(handle, [&](raft::resources const& dry) {
+    auto matrix = raft::make_device_matrix<float>(dry, kRows, kCols);
+  });
+
+  // The matrix payload is rows * cols floats; the tracked peak must be at least that large.
+  constexpr std::size_t kMinBytes = kRows * kCols * sizeof(float);
+  EXPECT_GE(peak.device_global, kMinBytes);
+}
+
+TEST(DryRunAllocTracking, MultipleAllocationsSum)
+{
+  constexpr std::size_t kFirstBytes  = std::size_t{4} << 20;  // 4 MiB
+  constexpr std::size_t kSecondBytes = std::size_t{8} << 20;  // 8 MiB
+
+  raft::resources handle;
+  auto peak = dry_run_execute(handle, [&](raft::resources const& dry) {
+    // Both buffers live in the same scope, so they are alive together and the peak covers both.
+    auto dry_stream = resource::get_cuda_stream(dry);
+    rmm::device_uvector<char> first(kFirstBytes, dry_stream);
+    rmm::device_uvector<char> second(kSecondBytes, dry_stream);
+  });
+
+  constexpr std::size_t kCombined = kFirstBytes + kSecondBytes;
+  EXPECT_GE(peak.device_global, kCombined);
+}
+
+TEST(DryRunAllocTracking, DeallocReducesCurrent)
+{
+  raft::resources res;
+
+  constexpr std::size_t kSize1 = 4UL * 1024UL * 1024UL;
+  constexpr std::size_t kSize2 = 8UL * 1024UL * 1024UL;
+
+  auto stats = dry_run_execute(res, [&](raft::resources const& r) {
+    auto stream = resource::get_cuda_stream(r);
+    {
+      rmm::device_uvector<char> buf1(kSize1, stream);
+    }
+    // buf1 is freed now
+    rmm::device_uvector<char> buf2(kSize2, stream);
+    // Peak should be max(kSize1, kSize2) = kSize2 since they don't overlap
+  });
+
+  // Peak should be at least kSize2 (the larger single allocation)
+  EXPECT_GE(stats.device_global, kSize2);
+  // buf1 is released before buf2 is requested, so the two must never be counted together
+  EXPECT_LT(stats.device_global, kSize1 + kSize2);
+}
+
+// ===================================================================
+// Test Category 3: End-to-end integration tests for composite functions
+// ===================================================================
+
+TEST(DryRunE2E, StatsComposite)
+{
+  raft::resources res;
+  constexpr int rows = 256;
+  constexpr int cols = 64;
+
+  // Pre-allocate input and output (outside dry-run)
+  auto input  = raft::make_device_matrix<float>(res, rows, cols);
+  auto output = raft::make_device_vector<float>(res, cols);
+
+  // All-ones input would give means of 1.0, so -1.0 in `output` marks "never written".
+  raft::linalg::map(res, input.view(), raft::const_op<float>{1.0f});
+  raft::linalg::map(res, output.view(), raft::const_op<float>{-1.0f});
+  resource::sync_stream(res);
+
+  // Dry-run: compute column means
+  auto stats = dry_run_execute(res, [&](raft::resources const& r) {
+    raft::stats::mean(r, raft::make_const_mdspan(input.view()), output.view(), false);
+  });
+
+  // The output should NOT be modified (still -1.0)
+  std::vector<float> h_output(cols);
+  auto output_src_view = raft::make_const_mdspan(output.view());
+  auto output_dst_view = raft::make_host_vector_view<float, int>(h_output.data(), cols);
+  raft::copy(res, output_dst_view, output_src_view);
+  resource::sync_stream(res);
+  for (int i = 0; i < cols; ++i) {
+    EXPECT_FLOAT_EQ(h_output[i], -1.0f) << "at index " << i;
+  }
+
+  // Verify dry-run flag is restored
+  EXPECT_FALSE(resource::get_dry_run_flag(res));
+  // Stats should be non-negative
+  EXPECT_GE(stats.device_global, 0);
+}
+
+TEST(DryRunE2E, DryRunExecuteWithArgs)
+{
+  raft::resources handle;
+
+  // The action takes one argument beyond the resources handle; dry_run_execute must pass it on.
+  auto peak = dry_run_execute(
+    handle,
+    [](raft::resources const& dry, int forwarded) {
+      // The trailing argument given to dry_run_execute has to arrive here unchanged.
+      EXPECT_EQ(forwarded, 512);
+      // A small allocation so the returned statistics have something to report.
+      rmm::device_uvector<float> scratch(1024, resource::get_cuda_stream(dry));
+    },
+    512);
+
+  EXPECT_FALSE(resource::get_dry_run_flag(handle));
+  EXPECT_GE(peak.device_global, 1024 * sizeof(float));
+}
+
+TEST(DryRunE2E, NestedDryRunIsNoop)
+{
+  constexpr std::size_t kCount = 256;
+  raft::resources handle;
+
+  // Re-asserting the flag from inside an active dry run must not disturb the outer run.
+  auto peak = dry_run_execute(handle, [&](raft::resources const& dry) {
+    EXPECT_TRUE(resource::get_dry_run_flag(dry));
+    resource::set_dry_run_flag(dry, true);  // redundant on purpose
+    EXPECT_TRUE(resource::get_dry_run_flag(dry));
+
+    rmm::device_uvector<float> scratch(kCount, resource::get_cuda_stream(dry));
+  });
+
+  EXPECT_GE(peak.device_global, kCount * sizeof(float));
+  EXPECT_FALSE(resource::get_dry_run_flag(handle));
+}
+
+TEST(DryRunE2E, ExceptionRestoresResources)
+{
+  raft::resources res;
+  auto device_mr_before = rmm::mr::get_current_device_resource_ref();
+  auto host_mr_before   = raft::mr::get_default_host_resource();
+
+  // The action fails right away; the exception has to reach the caller of dry_run_execute.
+  EXPECT_THROW(dry_run_execute(
+                 res, [](raft::resources const&) { throw std::runtime_error("test exception"); }),
+               std::runtime_error);
+  EXPECT_FALSE(resource::get_dry_run_flag(res));
+  EXPECT_EQ(raft::mr::get_default_host_resource(), host_mr_before);
+  EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), device_mr_before);
+}
+
+}  // namespace raft::util
diff --git a/cpp/tests/util/dry_run_resources.cpp b/cpp/tests/util/dry_run_resources.cpp
new file mode 100644
index 0000000000..55c1c31113
--- /dev/null
+++ b/cpp/tests/util/dry_run_resources.cpp
@@ -0,0 +1,389 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <raft/core/dry_run_resources.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resource/dry_run_flag.hpp>
+#include <raft/core/resource/managed_memory_resource.hpp>
+#include <raft/core/resource/pinned_memory_resource.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/mr/dry_run_resource.hpp>
+#include <raft/mr/host_device_resource.hpp>
+#include <raft/mr/host_memory_resource.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/mr/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
+
+#include <cuda/stream_ref>
+
+#include <gtest/gtest.h>
+
+#include <cstddef>
+#include <memory>
+#include <stdexcept>
+
+namespace raft::util {
+
+// ===== dry_run_resource tests (device async) =====
+
+TEST(DryRunResource, DeviceAsyncPeakTracking)
+{
+  constexpr std::size_t kSmall = std::size_t{100} << 20;  // 100 MiB
+  constexpr std::size_t kLarge = std::size_t{200} << 20;  // 200 MiB
+
+  auto upstream = rmm::mr::get_current_device_resource_ref();
+  raft::mr::dry_run_resource<rmm::device_async_resource_ref> tracker{upstream};
+  auto usage = tracker.get_counter();
+  cuda::stream_ref legacy{cudaStreamLegacy};
+
+  void* first = tracker.allocate(legacy, kSmall);
+  ASSERT_NE(first, nullptr);
+  EXPECT_EQ(usage->get_allocated_bytes(), kSmall);
+  EXPECT_EQ(usage->get_peak_bytes(), kSmall);
+
+  void* second = tracker.allocate(legacy, kLarge);
+  EXPECT_EQ(second, first);  // same probed pointer for all allocations
+  EXPECT_EQ(usage->get_peak_bytes(), kSmall + kLarge);
+  // Releasing memory lowers the current usage but must leave the recorded peak untouched.
+  tracker.deallocate(legacy, first, kSmall);
+  EXPECT_EQ(usage->get_allocated_bytes(), kLarge);
+  EXPECT_EQ(usage->get_peak_bytes(), kSmall + kLarge);
+
+  tracker.deallocate(legacy, second, kLarge);
+  EXPECT_EQ(usage->get_allocated_bytes(), 0UL);
+}
+
+TEST(DryRunResource, DeviceAsyncLargeAllocation)
+{
+  constexpr std::size_t kGiB = std::size_t{1} << 30;
+  auto upstream              = rmm::mr::get_current_device_resource_ref();
+  raft::mr::dry_run_resource<rmm::device_async_resource_ref> tracker{upstream};
+  auto usage = tracker.get_counter();
+
+  void* probe = tracker.allocate(cuda::stream_ref{cudaStreamLegacy}, kGiB);
+  ASSERT_NE(probe, nullptr);
+  EXPECT_EQ(usage->get_allocated_bytes(), kGiB);
+  // After the release the current usage is back to zero while the peak still shows 1 GiB.
+  tracker.deallocate(cuda::stream_ref{cudaStreamLegacy}, probe, kGiB);
+  EXPECT_EQ(usage->get_allocated_bytes(), 0UL);
+  EXPECT_EQ(usage->get_peak_bytes(), kGiB);
+}
+
+// ===== dry_run_resource tests (host sync) =====
+
+TEST(DryRunResource, HostSyncPeakTracking)
+{
+  constexpr std::size_t kSmall = std::size_t{100} << 20;  // 100 MiB
+  constexpr std::size_t kLarge = std::size_t{200} << 20;  // 200 MiB
+
+  auto upstream = raft::mr::get_default_host_resource();
+  raft::mr::dry_run_resource<raft::mr::host_resource_ref> tracker{upstream};
+  auto usage = tracker.get_counter();
+
+  void* first  = tracker.allocate_sync(kSmall);
+  void* second = tracker.allocate_sync(kLarge);
+  EXPECT_EQ(first, second);  // same probed pointer
+  EXPECT_EQ(usage->get_peak_bytes(), kSmall + kLarge);
+  // After everything is released the current usage is zero while the peak is retained.
+  tracker.deallocate_sync(first, kSmall);
+  tracker.deallocate_sync(second, kLarge);
+  EXPECT_EQ(usage->get_allocated_bytes(), 0UL);
+  EXPECT_EQ(usage->get_peak_bytes(), kSmall + kLarge);
+}
+
+// ===== dry_run_flag resource tests =====
+
+TEST(DryRunFlag, DefaultIsFalse)
+{
+  raft::resources fresh;  // the flag is never set on this handle
+  EXPECT_FALSE(resource::get_dry_run_flag(fresh));
+}
+
+TEST(DryRunFlag, SetAndGet)
+{
+  raft::resources handle;
+  resource::set_dry_run_flag(handle, true);
+  EXPECT_TRUE(resource::get_dry_run_flag(handle));
+  // Clearing the flag must be observable as well.
+  resource::set_dry_run_flag(handle, false);
+  EXPECT_FALSE(resource::get_dry_run_flag(handle));
+}
+
+// ===== dry_run_resources tests =====
+
+TEST(DryRunResources, SetsFlag)
+{
+  raft::resources handle;
+  EXPECT_FALSE(resource::get_dry_run_flag(handle));
+  {
+    dry_run_resources scoped(handle);
+    EXPECT_TRUE(resource::get_dry_run_flag(handle));  // visible on the wrapped handle
+    EXPECT_TRUE(resource::get_dry_run_flag(scoped));  // and on the wrapper itself
+  }
+  EXPECT_FALSE(resource::get_dry_run_flag(handle));  // cleared once the wrapper is gone
+}
+
+TEST(DryRunResources, RestoresGlobalDeviceResource)
+{
+  auto mr_before = rmm::mr::get_current_device_resource_ref();
+  raft::resources handle;
+  {
+    // While the wrapper is alive the current device resource compares unequal to the saved one.
+    dry_run_resources scoped(handle);
+    EXPECT_NE(rmm::mr::get_current_device_resource_ref(), mr_before);
+  }
+  EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), mr_before);
+}
+
+TEST(DryRunResources, RestoresGlobalHostResource)
+{
+  auto host_before = raft::mr::get_default_host_resource();
+  raft::resources handle;
+  {
+    // While the wrapper is alive the default host resource compares unequal to the saved one.
+    dry_run_resources scoped(handle);
+    EXPECT_NE(raft::mr::get_default_host_resource(), host_before);
+  }
+  EXPECT_EQ(raft::mr::get_default_host_resource(), host_before);
+}
+
+TEST(DryRunResources, StatsAccuracy)
+{
+  constexpr std::size_t kBytes = std::size_t{64} << 20;  // 64 MiB
+  raft::resources handle;
+  dry_run_resources scoped(handle);
+
+  // Allocate and free once through the current device resource, then read the peak.
+  cuda::stream_ref legacy{cudaStreamLegacy};
+  auto device_mr = rmm::mr::get_current_device_resource_ref();
+  void* probe    = device_mr.allocate(legacy, kBytes);
+  device_mr.deallocate(legacy, probe, kBytes);
+
+  EXPECT_EQ(scoped.get_bytes_peak().device_global, kBytes);
+}
+
+TEST(DryRunResources, PinnedStatsAccuracy)
+{
+  constexpr std::size_t kBytes = std::size_t{64} << 20;  // 64 MiB
+  raft::resources handle;
+  dry_run_resources scoped(handle);
+
+  // One synchronous allocate/free through the pinned resource obtained from the wrapper.
+  auto pinned_mr = resource::get_pinned_memory_resource_ref(scoped);
+  void* probe    = pinned_mr.allocate_sync(kBytes);
+  pinned_mr.deallocate_sync(probe, kBytes);
+
+  // The peak is expected to keep the full size after the memory has been handed back.
+  EXPECT_EQ(scoped.get_bytes_peak().host_pinned, kBytes);
+}
+
+TEST(DryRunResources, ManagedStatsAccuracy)
+{
+  constexpr std::size_t kBytes = std::size_t{64} << 20;  // 64 MiB
+  raft::resources handle;
+  dry_run_resources scoped(handle);
+
+  // One synchronous allocate/free through the managed resource obtained from the wrapper.
+  auto managed_mr = resource::get_managed_memory_resource_ref(scoped);
+  void* probe     = managed_mr.allocate_sync(kBytes);
+  managed_mr.deallocate_sync(probe, kBytes);
+
+  // The peak is expected to keep the full size after the memory has been handed back.
+  EXPECT_EQ(scoped.get_bytes_peak().device_managed, kBytes);
+}
+
+// ===== dry_run_execute tests =====
+
+TEST(DryRunExecute, BasicExecution)
+{
+  constexpr std::size_t kBytes = std::size_t{32} << 20;  // 32 MiB
+  raft::resources handle;
+  bool invoked = false;
+
+  auto peak = dry_run_execute(handle, [&](raft::resources const& dry) {
+    invoked = true;
+    EXPECT_TRUE(resource::get_dry_run_flag(dry));
+    // One allocate/deallocate round trip through the current device resource.
+    auto device_mr = rmm::mr::get_current_device_resource_ref();
+    void* probe    = device_mr.allocate(cuda::stream_ref{cudaStreamLegacy}, kBytes);
+    device_mr.deallocate(cuda::stream_ref{cudaStreamLegacy}, probe, kBytes);
+  });
+
+  EXPECT_TRUE(invoked);
+  EXPECT_FALSE(resource::get_dry_run_flag(handle));
+  EXPECT_EQ(peak.device_global, kBytes);
+}
+
+TEST(DryRunExecute, ExceptionSafety)
+{
+  raft::resources res;
+  auto device_mr_before = rmm::mr::get_current_device_resource_ref();
+  auto host_mr_before   = raft::mr::get_default_host_resource();
+
+  // A throwing action must propagate its exception and still leave global state as it was.
+  EXPECT_THROW(dry_run_execute(
+                 res, [](raft::resources const&) { throw std::runtime_error("test exception"); }),
+               std::runtime_error);
+  EXPECT_FALSE(resource::get_dry_run_flag(res));
+  EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), device_mr_before);
+  EXPECT_EQ(raft::mr::get_default_host_resource(), host_mr_before);
+}
+
+// ===== Independent-counting tests for dry_run_resources =====
+
+TEST(DryRunResources, IndependentCounting_DefaultWorkspace)
+{
+  constexpr std::size_t kWorkspaceBytes = 1024;
+  constexpr std::size_t kGlobalBytes    = 2048;
+  raft::resources handle;
+  dry_run_resources scoped(handle);
+  cuda::stream_ref legacy{cudaStreamLegacy};
+
+  // One allocation through the workspace resource, one through the current device resource.
+  auto workspace_mr     = resource::get_workspace_resource_ref(scoped);
+  void* workspace_probe = workspace_mr.allocate(legacy, kWorkspaceBytes);
+  auto global_mr        = rmm::mr::get_current_device_resource_ref();
+  void* global_probe    = global_mr.allocate(legacy, kGlobalBytes);
+
+  // Each category must report only its own allocation; the total is their sum.
+  auto usage = scoped.get_bytes_peak();
+  EXPECT_EQ(usage.device_workspace, kWorkspaceBytes);
+  EXPECT_EQ(usage.device_global, kGlobalBytes);
+  EXPECT_EQ(usage.total(), kWorkspaceBytes + kGlobalBytes);
+
+  workspace_mr.deallocate(legacy, workspace_probe, kWorkspaceBytes);
+  global_mr.deallocate(legacy, global_probe, kGlobalBytes);
+}
+
+TEST(DryRunResources, IndependentCounting_WorkspaceSetToGlobal)
+{
+  constexpr std::size_t kWorkspaceBytes = 1024;
+  constexpr std::size_t kGlobalBytes    = 2048;
+  raft::resources handle;
+  resource::set_workspace_to_global_resource(handle);
+  dry_run_resources scoped(handle);
+  cuda::stream_ref legacy{cudaStreamLegacy};
+
+  // One allocation through the workspace resource, one through the current device resource.
+  auto workspace_mr     = resource::get_workspace_resource_ref(scoped);
+  void* workspace_probe = workspace_mr.allocate(legacy, kWorkspaceBytes);
+  auto global_mr        = rmm::mr::get_current_device_resource_ref();
+  void* global_probe    = global_mr.allocate(legacy, kGlobalBytes);
+
+  // The workspace was pointed at the global resource, yet the two must be counted separately.
+  auto usage = scoped.get_bytes_peak();
+  EXPECT_EQ(usage.device_workspace, kWorkspaceBytes);
+  EXPECT_EQ(usage.device_global, kGlobalBytes);
+  EXPECT_EQ(usage.total(), kWorkspaceBytes + kGlobalBytes);
+
+  workspace_mr.deallocate(legacy, workspace_probe, kWorkspaceBytes);
+  global_mr.deallocate(legacy, global_probe, kGlobalBytes);
+}
+
+// ===== Independent-counting tests for memory_stats_resources =====
+
+TEST(MemoryStatsResources, IndependentCounting_DefaultWorkspace)
+{
+  constexpr std::size_t kWorkspaceBytes = 1024;
+  constexpr std::size_t kGlobalBytes    = 2048;
+  raft::resources handle;
+  memory_stats_resources tracked(handle);
+  cuda::stream_ref legacy{cudaStreamLegacy};
+
+  // One allocation through the workspace resource, one through the current device resource.
+  auto workspace_mr     = resource::get_workspace_resource_ref(tracked);
+  void* workspace_probe = workspace_mr.allocate(legacy, kWorkspaceBytes);
+  auto global_mr        = rmm::mr::get_current_device_resource_ref();
+  void* global_probe    = global_mr.allocate(legacy, kGlobalBytes);
+
+  // Each category must report only its own allocation; the total is their sum.
+  auto usage = tracked.get_bytes_peak();
+  EXPECT_EQ(usage.device_workspace, kWorkspaceBytes);
+  EXPECT_EQ(usage.device_global, kGlobalBytes);
+  EXPECT_EQ(usage.total(), kWorkspaceBytes + kGlobalBytes);
+
+  workspace_mr.deallocate(legacy, workspace_probe, kWorkspaceBytes);
+  global_mr.deallocate(legacy, global_probe, kGlobalBytes);
+}
+
+TEST(MemoryStatsResources, IndependentCounting_WorkspaceSetToGlobal)
+{
+  constexpr std::size_t kWorkspaceBytes = 1024;
+  constexpr std::size_t kGlobalBytes    = 2048;
+  raft::resources handle;
+  resource::set_workspace_to_global_resource(handle);
+  memory_stats_resources tracked(handle);
+  cuda::stream_ref legacy{cudaStreamLegacy};
+
+  // One allocation through the workspace resource, one through the current device resource.
+  auto workspace_mr     = resource::get_workspace_resource_ref(tracked);
+  void* workspace_probe = workspace_mr.allocate(legacy, kWorkspaceBytes);
+  auto global_mr        = rmm::mr::get_current_device_resource_ref();
+  void* global_probe    = global_mr.allocate(legacy, kGlobalBytes);
+
+  // The workspace was pointed at the global resource, yet the two must be counted separately.
+  auto usage = tracked.get_bytes_peak();
+  EXPECT_EQ(usage.device_workspace, kWorkspaceBytes);
+  EXPECT_EQ(usage.device_global, kGlobalBytes);
+  EXPECT_EQ(usage.total(), kWorkspaceBytes + kGlobalBytes);
+
+  workspace_mr.deallocate(legacy, workspace_probe, kWorkspaceBytes);
+  global_mr.deallocate(legacy, global_probe, kGlobalBytes);
+}
+
+TEST(MemoryStatsResources, IndependentCounting_PoolWorkspace)
+{
+  constexpr std::size_t kPoolBytes      = std::size_t{64} << 20;  // 64 MiB
+  constexpr std::size_t kWorkspaceBytes = 1024;
+  constexpr std::size_t kGlobalBytes    = 2048;
+  raft::resources handle;
+  resource::set_workspace_to_pool_resource(handle, kPoolBytes);
+  memory_stats_resources tracked(handle);
+  cuda::stream_ref legacy{cudaStreamLegacy};
+
+  // One allocation through the workspace resource, one through the current device resource.
+  auto workspace_mr     = resource::get_workspace_resource_ref(tracked);
+  void* workspace_probe = workspace_mr.allocate(legacy, kWorkspaceBytes);
+  auto global_mr        = rmm::mr::get_current_device_resource_ref();
+  void* global_probe    = global_mr.allocate(legacy, kGlobalBytes);
+
+  // Each category must report only its own allocation; the total is their sum.
+  auto usage = tracked.get_bytes_peak();
+  EXPECT_EQ(usage.device_workspace, kWorkspaceBytes);
+  EXPECT_EQ(usage.device_global, kGlobalBytes);
+  EXPECT_EQ(usage.total(), kWorkspaceBytes + kGlobalBytes);
+
+  workspace_mr.deallocate(legacy, workspace_probe, kWorkspaceBytes);
+  global_mr.deallocate(legacy, global_probe, kGlobalBytes);
+}
+
+// ===== Nested wrappers test =====
+
+TEST(IndependentCounting, NestedDryRunInStats)
+{
+  constexpr std::size_t kWorkspaceBytes = 1024;
+  constexpr std::size_t kGlobalBytes    = 2048;
+  raft::resources handle;
+  memory_stats_resources tracked(handle);
+  dry_run_resources scoped(tracked);
+  cuda::stream_ref legacy{cudaStreamLegacy};
+
+  // One allocation through the workspace resource, one through the current device resource.
+  auto workspace_mr     = resource::get_workspace_resource_ref(scoped);
+  void* workspace_probe = workspace_mr.allocate(legacy, kWorkspaceBytes);
+  auto global_mr        = rmm::mr::get_current_device_resource_ref();
+  void* global_probe    = global_mr.allocate(legacy, kGlobalBytes);
+
+  // Read the peak from the inner (dry-run) wrapper: categories stay separate, total is the sum.
+  auto usage = scoped.get_bytes_peak();
+  EXPECT_EQ(usage.device_workspace, kWorkspaceBytes);
+  EXPECT_EQ(usage.device_global, kGlobalBytes);
+  EXPECT_EQ(usage.total(), kWorkspaceBytes + kGlobalBytes);
+
+  workspace_mr.deallocate(legacy, workspace_probe, kWorkspaceBytes);
+  global_mr.deallocate(legacy, global_probe, kGlobalBytes);
+}
+
+}  // namespace raft::util
diff --git a/cpp/tests/util/memory_type_dispatcher.cu b/cpp/tests/util/memory_type_dispatcher.cu
index 36ab4b143a..41ff8f0b50 100644
--- a/cpp/tests/util/memory_type_dispatcher.cu
+++ b/cpp/tests/util/memory_type_dispatcher.cu
@@ -1,9 +1,9 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
@@ -220,172 +220,176 @@ auto generate_input(raft::resources const& res)
 template <memory_type input_memory_type>
 void test_memory_type_dispatcher()
 {
-  auto res          = raft::device_resources{};
-  auto data         = generate_input<input_memory_type>(res);
-  auto data_float   = generate_input<input_memory_type, float>(res);
-  auto data_f       = generate_input<input_memory_type, double, layout_f_contiguous>(res);
-  auto data_f_float = generate_input<input_memory_type, float, layout_f_contiguous>(res);
+  execute_with_dry_run_check(  // NOTE(review): presumably declared in ../test_utils.cuh - confirm
+    raft::device_resources{},
+    [&](raft::resources const& res) {
+      auto data         = generate_input<input_memory_type>(res);
+      auto data_float   = generate_input<input_memory_type, float>(res);
+      auto data_f       = generate_input<input_memory_type, double, layout_f_contiguous>(res);
+      auto data_f_float = generate_input<input_memory_type, float, layout_f_contiguous>(res);
 
-  EXPECT_EQ(memory_type_dispatcher(res, functor_h{}, data.view()),
-            functor_h::expected_output<input_memory_type>());
-  EXPECT_EQ(memory_type_dispatcher(res, functor_d{}, data.view()),
-            functor_d::expected_output<input_memory_type>());
-  EXPECT_EQ(memory_type_dispatcher(res, functor_m{}, data.view()),
-            functor_m::expected_output<input_memory_type>());
-  EXPECT_EQ(memory_type_dispatcher(res, functor_p{}, data.view()),
-            functor_p::expected_output<input_memory_type>());
-  EXPECT_EQ(memory_type_dispatcher(res, functor_hd{}, data.view()),
-            functor_hd::expected_output<input_memory_type>());
-  EXPECT_EQ(memory_type_dispatcher(res, functor_hm{}, data.view()),
-            functor_hm::expected_output<input_memory_type>());
-  EXPECT_EQ(memory_type_dispatcher(res, functor_hp{}, data.view()),
-            functor_hp::expected_output<input_memory_type>());
-  EXPECT_EQ(memory_type_dispatcher(res, functor_dm{}, data.view()),
-            functor_dm::expected_output<input_memory_type>());
-  EXPECT_EQ(memory_type_dispatcher(res, functor_dp{}, data.view()),
-            functor_dp::expected_output<input_memory_type>());
-  EXPECT_EQ(memory_type_dispatcher(res, functor_mp{}, data.view()),
-            functor_mp::expected_output<input_memory_type>());
-  EXPECT_EQ(memory_type_dispatcher(res, functor_hdm{}, data.view()),
-            functor_hdm::expected_output<input_memory_type>());
-  EXPECT_EQ(memory_type_dispatcher(res, functor_hdp{}, data.view()),
-            functor_hdp::expected_output<input_memory_type>());
-  EXPECT_EQ(memory_type_dispatcher(res, functor_dmp{}, data.view()),
-            functor_dmp::expected_output<input_memory_type>());
-  EXPECT_EQ(memory_type_dispatcher(res, functor_hdmp{}, data.view()),
-            functor_hdmp::expected_output<input_memory_type>());
+      EXPECT_EQ(memory_type_dispatcher(res, functor_h{}, data.view()),
+                functor_h::expected_output<input_memory_type>());
+      EXPECT_EQ(memory_type_dispatcher(res, functor_d{}, data.view()),
+                functor_d::expected_output<input_memory_type>());
+      EXPECT_EQ(memory_type_dispatcher(res, functor_m{}, data.view()),
+                functor_m::expected_output<input_memory_type>());
+      EXPECT_EQ(memory_type_dispatcher(res, functor_p{}, data.view()),
+                functor_p::expected_output<input_memory_type>());
+      EXPECT_EQ(memory_type_dispatcher(res, functor_hd{}, data.view()),
+                functor_hd::expected_output<input_memory_type>());
+      EXPECT_EQ(memory_type_dispatcher(res, functor_hm{}, data.view()),
+                functor_hm::expected_output<input_memory_type>());
+      EXPECT_EQ(memory_type_dispatcher(res, functor_hp{}, data.view()),
+                functor_hp::expected_output<input_memory_type>());
+      EXPECT_EQ(memory_type_dispatcher(res, functor_dm{}, data.view()),
+                functor_dm::expected_output<input_memory_type>());
+      EXPECT_EQ(memory_type_dispatcher(res, functor_dp{}, data.view()),
+                functor_dp::expected_output<input_memory_type>());
+      EXPECT_EQ(memory_type_dispatcher(res, functor_mp{}, data.view()),
+                functor_mp::expected_output<input_memory_type>());
+      EXPECT_EQ(memory_type_dispatcher(res, functor_hdm{}, data.view()),
+                functor_hdm::expected_output<input_memory_type>());
+      EXPECT_EQ(memory_type_dispatcher(res, functor_hdp{}, data.view()),
+                functor_hdp::expected_output<input_memory_type>());
+      EXPECT_EQ(memory_type_dispatcher(res, functor_dmp{}, data.view()),
+                functor_dmp::expected_output<input_memory_type>());
+      EXPECT_EQ(memory_type_dispatcher(res, functor_hdmp{}, data.view()),
+                functor_hdmp::expected_output<input_memory_type>());
 
-  // Functor expects double; input is float
-  auto out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_h{}, data_float.view());
-  EXPECT_EQ(out, functor_h::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_d{}, data_float.view());
-  EXPECT_EQ(out, functor_d::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_m{}, data_float.view());
-  EXPECT_EQ(out, functor_m::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_p{}, data_float.view());
-  EXPECT_EQ(out, functor_p::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hd{}, data_float.view());
-  EXPECT_EQ(out, functor_hd::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hm{}, data_float.view());
-  EXPECT_EQ(out, functor_hm::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hp{}, data_float.view());
-  EXPECT_EQ(out, functor_hp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_dm{}, data_float.view());
-  EXPECT_EQ(out, functor_dm::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_dp{}, data_float.view());
-  EXPECT_EQ(out, functor_dp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_mp{}, data_float.view());
-  EXPECT_EQ(out, functor_mp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hdm{}, data_float.view());
-  EXPECT_EQ(out, functor_hdm::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hdp{}, data_float.view());
-  EXPECT_EQ(out, functor_hdp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_dmp{}, data_float.view());
-  EXPECT_EQ(out, functor_dmp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hdmp{}, data_float.view());
-  EXPECT_EQ(out, functor_hdmp::expected_output<input_memory_type>());
+      // Functor expects double; input is float (data_float); result type is requested explicitly
+      auto out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_h{}, data_float.view());
+      EXPECT_EQ(out, functor_h::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_d{}, data_float.view());
+      EXPECT_EQ(out, functor_d::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_m{}, data_float.view());
+      EXPECT_EQ(out, functor_m::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_p{}, data_float.view());
+      EXPECT_EQ(out, functor_p::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hd{}, data_float.view());
+      EXPECT_EQ(out, functor_hd::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hm{}, data_float.view());
+      EXPECT_EQ(out, functor_hm::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hp{}, data_float.view());
+      EXPECT_EQ(out, functor_hp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_dm{}, data_float.view());
+      EXPECT_EQ(out, functor_dm::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_dp{}, data_float.view());
+      EXPECT_EQ(out, functor_dp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_mp{}, data_float.view());
+      EXPECT_EQ(out, functor_mp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hdm{}, data_float.view());
+      EXPECT_EQ(out, functor_hdm::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hdp{}, data_float.view());
+      EXPECT_EQ(out, functor_hdp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_dmp{}, data_float.view());
+      EXPECT_EQ(out, functor_dmp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hdmp{}, data_float.view());
+      EXPECT_EQ(out, functor_hdmp::expected_output<input_memory_type>());
 
-  // Functor expects C-contiguous; input is F-contiguous
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_h{}, data_f.view());
-  EXPECT_EQ(out, functor_h::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_d{}, data_f.view());
-  EXPECT_EQ(out, functor_d::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_m{}, data_f.view());
-  EXPECT_EQ(out, functor_m::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_p{}, data_f.view());
-  EXPECT_EQ(out, functor_p::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hd{}, data_f.view());
-  EXPECT_EQ(out, functor_hd::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hm{}, data_f.view());
-  EXPECT_EQ(out, functor_hm::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hp{}, data_f.view());
-  EXPECT_EQ(out, functor_hp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_dm{}, data_f.view());
-  EXPECT_EQ(out, functor_dm::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_dp{}, data_f.view());
-  EXPECT_EQ(out, functor_dp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_mp{}, data_f.view());
-  EXPECT_EQ(out, functor_mp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hdm{}, data_f.view());
-  EXPECT_EQ(out, functor_hdm::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hdp{}, data_f.view());
-  EXPECT_EQ(out, functor_hdp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_dmp{}, data_f.view());
-  EXPECT_EQ(out, functor_dmp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hdmp{}, data_f.view());
-  EXPECT_EQ(out, functor_hdmp::expected_output<input_memory_type>());
+      // Functor expects C-contiguous; input is F-contiguous (data_f, layout_f_contiguous)
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_h{}, data_f.view());
+      EXPECT_EQ(out, functor_h::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_d{}, data_f.view());
+      EXPECT_EQ(out, functor_d::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_m{}, data_f.view());
+      EXPECT_EQ(out, functor_m::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_p{}, data_f.view());
+      EXPECT_EQ(out, functor_p::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hd{}, data_f.view());
+      EXPECT_EQ(out, functor_hd::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hm{}, data_f.view());
+      EXPECT_EQ(out, functor_hm::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hp{}, data_f.view());
+      EXPECT_EQ(out, functor_hp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_dm{}, data_f.view());
+      EXPECT_EQ(out, functor_dm::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_dp{}, data_f.view());
+      EXPECT_EQ(out, functor_dp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_mp{}, data_f.view());
+      EXPECT_EQ(out, functor_mp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hdm{}, data_f.view());
+      EXPECT_EQ(out, functor_hdm::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hdp{}, data_f.view());
+      EXPECT_EQ(out, functor_hdp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_dmp{}, data_f.view());
+      EXPECT_EQ(out, functor_dmp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hdmp{}, data_f.view());
+      EXPECT_EQ(out, functor_hdmp::expected_output<input_memory_type>());
 
-  // Functor expects C-contiguous double; input is F-contiguous float
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_h{}, data_f_float.view());
-  EXPECT_EQ(out, functor_h::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_d{}, data_f_float.view());
-  EXPECT_EQ(out, functor_d::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_m{}, data_f_float.view());
-  EXPECT_EQ(out, functor_m::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_p{}, data_f_float.view());
-  EXPECT_EQ(out, functor_p::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hd{}, data_f_float.view());
-  EXPECT_EQ(out, functor_hd::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hm{}, data_f_float.view());
-  EXPECT_EQ(out, functor_hm::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hp{}, data_f_float.view());
-  EXPECT_EQ(out, functor_hp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_dm{}, data_f_float.view());
-  EXPECT_EQ(out, functor_dm::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_dp{}, data_f_float.view());
-  EXPECT_EQ(out, functor_dp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_mp{}, data_f_float.view());
-  EXPECT_EQ(out, functor_mp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hdm{}, data_f_float.view());
-  EXPECT_EQ(out, functor_hdm::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hdp{}, data_f_float.view());
-  EXPECT_EQ(out, functor_hdp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_dmp{}, data_f_float.view());
-  EXPECT_EQ(out, functor_dmp::expected_output<input_memory_type>());
-  out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
-    res, functor_hdmp{}, data_f_float.view());
-  EXPECT_EQ(out, functor_hdmp::expected_output<input_memory_type>());
+      // Functor expects C-contiguous double; input is F-contiguous float (data_f_float)
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_h{}, data_f_float.view());
+      EXPECT_EQ(out, functor_h::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_d{}, data_f_float.view());
+      EXPECT_EQ(out, functor_d::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_m{}, data_f_float.view());
+      EXPECT_EQ(out, functor_m::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_p{}, data_f_float.view());
+      EXPECT_EQ(out, functor_p::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hd{}, data_f_float.view());
+      EXPECT_EQ(out, functor_hd::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hm{}, data_f_float.view());
+      EXPECT_EQ(out, functor_hm::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hp{}, data_f_float.view());
+      EXPECT_EQ(out, functor_hp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_dm{}, data_f_float.view());
+      EXPECT_EQ(out, functor_dm::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_dp{}, data_f_float.view());
+      EXPECT_EQ(out, functor_dp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_mp{}, data_f_float.view());
+      EXPECT_EQ(out, functor_mp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hdm{}, data_f_float.view());
+      EXPECT_EQ(out, functor_hdm::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hdp{}, data_f_float.view());
+      EXPECT_EQ(out, functor_hdp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_dmp{}, data_f_float.view());
+      EXPECT_EQ(out, functor_dmp::expected_output<input_memory_type>());
+      out = memory_type_dispatcher<mdbuffer<double, matrix_extent<std::uint32_t>>>(
+        res, functor_hdmp{}, data_f_float.view());
+      EXPECT_EQ(out, functor_hdmp::expected_output<input_memory_type>());
+    },
+    alloc_behavior::ARGUMENT_DRIVEN);
 }
 
 }  // namespace dispatch_test
diff --git a/cpp/tests/util/popc.cu b/cpp/tests/util/popc.cu
index f309f15efb..2b2ccbf7ba 100644
--- a/cpp/tests/util/popc.cu
+++ b/cpp/tests/util/popc.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -96,7 +96,10 @@ class PopcTest : public ::testing::TestWithParam<PopcInputs<index_t>> {
     rmm::device_scalar<index_t> nnz_actual_d(0, stream);
     auto nnz_actual_view = raft::make_device_scalar_view<index_t>(nnz_actual_d.data());
 
-    raft::popc(handle, bits_view, max_len_view, nnz_actual_view);
+    raft::execute_with_dry_run_check(
+      handle,
+      [&](raft::resources const& h) { raft::popc(h, bits_view, max_len_view, nnz_actual_view); },
+      raft::alloc_behavior::ARGUMENT_DRIVEN);
     raft::copy(&nnz_actual_h, nnz_actual_d.data(), 1, stream);
     resource::sync_stream(handle);
 
diff --git a/docs/source/cpp_api/core_resources.rst b/docs/source/cpp_api/core_resources.rst
index d4d28394bd..34678d89ec 100644
--- a/docs/source/cpp_api/core_resources.rst
+++ b/docs/source/cpp_api/core_resources.rst
@@ -222,6 +222,22 @@ namespace *raft*
     :project: RAFT
     :members:
 
+Dry-Run Resources
+~~~~~~~~~~~~~~~~~
+
+``#include <raft/core/dry_run_resources.hpp>``
+
+namespace *raft*
+
+.. doxygenclass:: raft::dry_run_resources
+    :project: RAFT
+    :members:
+
+namespace *raft::util*
+
+.. doxygenfunction:: raft::util::dry_run_execute
+    :project: RAFT
+
 Device Properties
 ~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index cd1e426707..9db69fb908 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -315,6 +315,27 @@ When writing new code, re-using functionality, or reviewing changes, prefer:
    deprecated function; when adding a replacement, deprecate the old one per
    [API stability](#api-stability).
 
+## Dry Run Protocol
+
+The dry run protocol defines a mechanism to simulate the execution of algorithms to get a precise estimate of the memory requirements for a real execution with the same parameters.
+
+In dry run mode:
+- no CUDA work happens in any CUDA stream
+- no expensive CPU algorithms are allowed to run
+- no real allocations happen in any of:
+  - `rmm` default device resource (device mdarrays and `rmm::device_uvector`)
+  - `cuda::mr` (host/managed/pinned) resources (all mdarray types)
+  - workspace memory resources managed by `raft::resources`.
+
+All attempted allocations in the above resources are tracked and reported, which makes it possible to plan memory usage at the relatively small cost of a simulated execution.
+
+To keep dry run mode functional, developers must follow the protocol:
+- Any function that takes a `raft::resources` handle as an argument can run in dry run mode. It's always safe to call such functions without any precautions.
+- Any other expensive function, or any function involving CUDA calls, must be guarded by a check of `resource::get_dry_run_flag(res)`.
+- Allocations through rmm or raft memory resources must NOT be guarded, so that the allocation statistics are tracked accurately.
+
+See the full [Dry Run Protocol](dry_run_protocol.md) guide for rules, patterns, and common mistakes.
+
 ## Header organization of expensive function templates
 
 RAFT is a heavily templated library. Several core functions are expensive to compile and we want to prevent duplicate compilation of this functionality. To limit build time, RAFT provides a precompiled library (libraft.so) where expensive function templates are instantiated for the most commonly used template parameters. To prevent (1) accidental instantiation of these templates and (2) unnecessary dependency on the internals of these templates, we use a split header structure and define macros to control template instantiation. This section describes the macros and header structure.
diff --git a/docs/source/dry_run_protocol.md b/docs/source/dry_run_protocol.md
new file mode 100644
index 0000000000..310dae9d6c
--- /dev/null
+++ b/docs/source/dry_run_protocol.md
@@ -0,0 +1,92 @@
+# Dry Run Protocol
+
+The dry run protocol lets callers estimate an algorithm's memory footprint without executing it. When enabled, the runtime swaps memory resources for lightweight trackers that record every allocation and deallocation, producing peak-usage statistics at the end.
+
+## Using Dry Run Mode
+
+```cpp
+#include <raft/core/dry_run_resources.hpp>
+
+raft::resources res;
+// auto my_function(const raft::resources& res, my_args...);
+auto stats = raft::util::dry_run_execute(res, my_function, my_args...);
+// stats.device_global  – peak device memory (bytes)
+```
+
+`dry_run_execute` swaps the memory resources, sets the dry-run flag, runs the callable, restores everything, and returns a `raft::memory_stats` snapshot of peak allocation usage.
+
+You can also construct `raft::dry_run_resources` directly for finer control (e.g. reading `get_bytes_current()` in addition to `get_bytes_peak()`).
+
+## Three Rules
+
+1. **Allocations must not be guarded.** Every `rmm::device_uvector`, `rmm::device_scalar`, `rmm::device_buffer`, `raft::make_(device|host|pinned|managed)_(mdarray|matrix|vector|scalar)` allocation must execute in both modes so the tracker sees it.
+
+2. **CUDA work must be guarded.** Kernel launches, Thrust algorithms, cuBLAS/cuSOLVER/cuSPARSE compute calls, `cudaMemcpyAsync`, `cudaMemsetAsync`, and `raft::interruptible::synchronize` must not run in dry-run mode.
+
+3. **Every function taking `raft::resources` must be callable in dry-run mode.** If it only delegates to other compliant functions, it needs no guard at all. If it performs raw CUDA work, it must guard that work internally.
+
+## What Needs Guarding
+
+| Must guard | Safe in dry-run (no guard needed) |
+|---|---|
+| Kernel launches (`<<<>>>`) | Allocations (`rmm::device_uvector`, `make_device_*`, …) |
+| `thrust::reduce`, `thrust::for_each`, … | Workspace-size queries (`cub::…(nullptr, &size, …)`, `cusparse…_bufferSize`) |
+| cuBLAS / cuSOLVER / cuSPARSE compute calls | cuSPARSE descriptor create/destroy |
+| CUB compute calls (second pass) | `resource::sync_stream()` |
+| `cudaMemcpyAsync`, `cudaMemsetAsync` | `raft::copy` (takes `raft::resources`) |
+| `raft::interruptible::synchronize()` | `raft::linalg::map`, `raft::linalg::reduce`, and other compliant RAFT APIs |
+
+## Patterns
+
+### Basic: allocate, then guard
+
+```cpp
+void algo(raft::resources const& handle, int n, cudaStream_t stream)
+{
+  rmm::device_uvector<float> buf(n, stream);           // tracked
+  if (resource::get_dry_run_flag(handle)) { return; }
+  kernel<<<grid, block, 0, stream>>>(buf.data(), n);   // skipped in dry-run
+}
+```
+
+### Workspace-size query before guard
+
+_We assume_ CUB and cuSPARSE workspace queries do not launch device work when the workspace pointer is `nullptr`, so they are safe to run in dry-run mode.
+
+```cpp
+size_t ws_bytes = 0;
+cub::DeviceRadixSort::SortPairs(nullptr, &ws_bytes, ...);   // query only
+rmm::device_uvector<char> workspace(ws_bytes, stream);       // tracked
+if (resource::get_dry_run_flag(handle)) { return; }
+cub::DeviceRadixSort::SortPairs(workspace.data(), &ws_bytes, ...);  // real work
+```
+
+### Guard individual operations (not the whole body)
+
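+When cleanup or descriptor destruction must always run, guard each operation instead of returning early. In the snippet below, `is_dry_run` stands for the value of `resource::get_dry_run_flag(...)` read earlier in the function.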
+
+```cpp
+cusparseSpMV_bufferSize(handle, ..., &buf_size);         // safe
+rmm::device_uvector<char> tmp(buf_size, stream);         // tracked
+if (!is_dry_run) {
+  cusparseSpMV(handle, ..., tmp.data());                 // guarded
+}
+cusparseDestroyDnVec(descr);                             // always runs
+```
+
+### Public wrappers: delegate without guards
+
+A wrapper that only calls compliant functions must **not** add an early return—doing so hides allocations made by the callee.
+
+```cpp
+// WRONG – hides allocations inside detail::foo
+void foo(raft::resources const& handle, ...) {
+  if (resource::get_dry_run_flag(handle)) { return; }
+  detail::foo(handle, ...);
+}
+
+// CORRECT
+void foo(raft::resources const& handle, ...) {
+  detail::foo(handle, ...);
+}
+```