From 91f4bcb8e1ea4f1286814a4c1429d5b236328a96 Mon Sep 17 00:00:00 2001 From: solos Date: Fri, 5 Jun 2026 18:34:34 +0800 Subject: [PATCH 1/2] Refactor(barrier): Optimize affine direction handling with GPU snapshot - Eliminated redundant CPU-GPU data transfers in Mehrotra step. - Introduced GPU-side snapshot to retain affine directions (dx, dy, etc.). - Removed costly synchronous copies and stream synchronizations. - Simplified code by deprecating manual host-side vector management. --- cpp/src/barrier/barrier.cu | 73 ++++++++++++++++++++++--------------- cpp/src/barrier/barrier.hpp | 3 +- 2 files changed, 45 insertions(+), 31 deletions(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index a9b3aa891..12a2d6765 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -2707,6 +2707,30 @@ f_t barrier_solver_t::compute_nonnegative_step_length(iteration_data_t stream_view_); } +// Copy the current device search direction (dw, dx, dy, dv, dz) into d_*_aff_ buffers. +// Called from gpu_compute_search_direction when snapshot_affine_direction is true, immediately +// after the direction is fully formed and before the function returns. Mehrotra uses two separate +// calls to gpu_compute_search_direction (affine then corrector); the corrector call overwrites +// d_dx_ / d_dy_ / etc. but must not refresh d_*_aff_. +template +void copy_affine_direction_to_device_buffers(iteration_data_t& data, + rmm::cuda_stream_view stream) +{ + raft::common::nvtx::range fun_scope("Barrier: copy_affine_direction_to_device_buffers"); + + auto copy_device_vec = [&](rmm::device_uvector& dst, + const rmm::device_uvector& src) { + dst.resize(src.size(), stream); + if (src.size() > 0) { raft::copy(dst.data(), src.data(), src.size(), stream); } + }; + + copy_device_vec(data.d_dw_aff_, data.d_dw_); + copy_device_vec(data.d_dx_aff_, data.d_dx_); + copy_device_vec(data.d_dy_aff_, data.d_dy_); + copy_device_vec(data.d_dv_aff_, data.d_dv_); + copy_device_vec(data.d_dz_aff_, data.d_dz_); +} + template i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t& data, pinned_dense_vector_t& dw, @@ -2716,7 +2740,8 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t& dz, f_t& dual_perturb, f_t& primal_perturb, - f_t& max_residual) + f_t& max_residual, + bool snapshot_affine_direction) { raft::common::nvtx::range fun_scope("Barrier: compute_search_direction"); @@ -3017,9 +3042,6 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t(data.d_dz_.data(), linear_dz_size), raft::device_span(data.d_is_direct_free_linear_.data(), linear_dz_size), stream_view_); - raft::copy(dz.data(), data.d_dz_.data(), data.d_dz_.size(), stream_view_); } if (debug) { @@ -3332,7 +3349,6 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::compute_target_mu( const bool has_soc = data.has_cones(); f_t complementarity_aff_sum = 0.0; - // TMP no copy and data should always be on the GPU - data.d_dw_aff_.resize(data.dw_aff.size(), stream_view_); - data.d_dx_aff_.resize(data.dx_aff.size(), stream_view_); - data.d_dv_aff_.resize(data.dv_aff.size(), stream_view_); - data.d_dz_aff_.resize(data.dz_aff.size(), stream_view_); - - raft::copy(data.d_dw_aff_.data(), data.dw_aff.data(), data.dw_aff.size(), stream_view_); - raft::copy(data.d_dx_aff_.data(), data.dx_aff.data(), data.dx_aff.size(), stream_view_); - raft::copy(data.d_dv_aff_.data(), data.dv_aff.data(), data.dv_aff.size(), stream_view_); - raft::copy(data.d_dz_aff_.data(), data.dz_aff.data(), data.dz_aff.size(), stream_view_); f_t step_primal_aff = std::min(compute_nonnegative_step_length(data, data.d_w_, data.d_dw_aff_), compute_nonnegative_step_length(data, data.d_x_, data.d_dx_aff_)); @@ -3799,16 +3814,17 @@ template void barrier_solver_t::compute_final_direction(iteration_data_t& data) { raft::common::nvtx::range fun_scope("Barrier: compute_final_direction"); - data.d_dy_aff_.resize(data.dy_aff.size(), stream_view_); - raft::copy(data.d_dy_aff_.data(), data.dy_aff.data(), data.dy_aff.size(), stream_view_); #ifdef FINITE_CHECK for (i_t i = 0; i < (int)data.y.size(); i++) { cuopt_assert(std::isfinite(data.y[i]), "data.d_y_[i] is not finite"); } - for (i_t i = 0; i < (int)data.dy_aff.size(); i++) { - cuopt_assert(std::isfinite(data.dy_aff[i]), "data.dy_aff_[i] is not finite"); + if (!data.d_dy_aff_.empty()) { + const auto dy_aff_host = host_copy(data.d_dy_aff_, stream_view_); + for (i_t i = 0; i < static_cast(dy_aff_host.size()); ++i) { + cuopt_assert(std::isfinite(dy_aff_host[i]), "data.d_dy_aff_[i] is not finite"); + } } #endif @@ -4437,13 +4453,12 @@ lp_status_t barrier_solver_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t& dz, f_t& dual_perturb, f_t& primal_perturb, - f_t& max_residual); + f_t& max_residual, + bool snapshot_affine_direction = false); private: lp_status_t check_for_suboptimal_solution(iteration_data_t& data, From 6857d1a0945df8c94439892104e27364ac3276d8 Mon Sep 17 00:00:00 2001 From: solos Date: Fri, 5 Jun 2026 18:47:39 +0800 Subject: [PATCH 2/2] Add docs Add docs Fix bug --- cpp/src/barrier/barrier.cu | 24 +++++++++++++++--------- cpp/src/barrier/barrier.hpp | 9 +++++++++ 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 12a2d6765..0bf39e468 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -2707,11 +2707,17 @@ f_t barrier_solver_t::compute_nonnegative_step_length(iteration_data_t stream_view_); } -// Copy the current device search direction (dw, dx, dy, dv, dz) into d_*_aff_ buffers. -// Called from gpu_compute_search_direction when snapshot_affine_direction is true, immediately -// after the direction is fully formed and before the function returns. Mehrotra uses two separate -// calls to gpu_compute_search_direction (affine then corrector); the corrector call overwrites -// d_dx_ / d_dy_ / etc. but must not refresh d_*_aff_. +/** + * @brief Copy the current device search direction into Mehrotra affine buffers. + * + * Device-to-device snapshot of (dw, dx, dy, dv, dz) into d_*_aff_. Called from + * gpu_compute_search_direction when snapshot_affine_direction is true, immediately + * after the direction is fully formed. The corrector step reuses d_dx_/d_dy_/etc. + * and must not refresh d_*_aff_. + * + * @param data Per-iteration device state (d_dw_, d_dx_, ..., d_*_aff_). + * @param stream CUDA stream for resize and copy operations. + */ template void copy_affine_direction_to_device_buffers(iteration_data_t& data, rmm::cuda_stream_view stream) @@ -2720,6 +2726,8 @@ void copy_affine_direction_to_device_buffers(iteration_data_t& data, auto copy_device_vec = [&](rmm::device_uvector& dst, const rmm::device_uvector& src) { + cuopt_assert(dst.empty() || dst.size() == src.size(), + "Buffer size mismatch in affine snapshot"); dst.resize(src.size(), stream); if (src.size() > 0) { raft::copy(dst.data(), src.data(), src.size(), stream); } }; @@ -3485,9 +3493,6 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::compute_final_direction(iteration_data_t 0) { const auto dy_aff_host = host_copy(data.d_dy_aff_, stream_view_); + stream_view_.synchronize(); for (i_t i = 0; i < static_cast(dy_aff_host.size()); ++i) { cuopt_assert(std::isfinite(dy_aff_host[i]), "data.d_dy_aff_[i] is not finite"); } diff --git a/cpp/src/barrier/barrier.hpp b/cpp/src/barrier/barrier.hpp index 275f75b8e..c758d4bb3 100644 --- a/cpp/src/barrier/barrier.hpp +++ b/cpp/src/barrier/barrier.hpp @@ -100,6 +100,15 @@ class barrier_solver_t { f_t compute_nonnegative_step_length(iteration_data_t& data, const rmm::device_uvector& x, const rmm::device_uvector& dx); + /** + * @brief Solve for a Mehrotra search direction on the GPU. + * + * When snapshot_affine_direction is true (affine predictor step), copies the completed + * direction into data.d_*_aff_ for compute_target_mu and compute_final_direction. + * + * @param snapshot_affine_direction If true, snapshot (dw, dx, dy, dv, dz) into d_*_aff_. + * @return 0 on success, a negative value on failure, or CONCURRENT_HALT_RETURN if halted. + */ i_t gpu_compute_search_direction(iteration_data_t& data, pinned_dense_vector_t& dw, pinned_dense_vector_t& dx,