diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index a9b3aa891..0bf39e468 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -2707,6 +2707,38 @@ f_t barrier_solver_t::compute_nonnegative_step_length(iteration_data_t stream_view_); } +/** + * @brief Copy the current device search direction into Mehrotra affine buffers. + * + * Device-to-device snapshot of (dw, dx, dy, dv, dz) into d_*_aff_. Called from + * gpu_compute_search_direction when snapshot_affine_direction is true, immediately + * after the direction is fully formed. The corrector step reuses d_dx_/d_dy_/etc. + * and must not refresh d_*_aff_. + * + * @param data Per-iteration device state (d_dw_, d_dx_, ..., d_*_aff_). + * @param stream CUDA stream for resize and copy operations. + */ +template +void copy_affine_direction_to_device_buffers(iteration_data_t& data, + rmm::cuda_stream_view stream) +{ + raft::common::nvtx::range fun_scope("Barrier: copy_affine_direction_to_device_buffers"); + + auto copy_device_vec = [&](rmm::device_uvector& dst, + const rmm::device_uvector& src) { + cuopt_assert(dst.empty() || dst.size() == src.size(), + "Buffer size mismatch in affine snapshot"); + dst.resize(src.size(), stream); + if (src.size() > 0) { raft::copy(dst.data(), src.data(), src.size(), stream); } + }; + + copy_device_vec(data.d_dw_aff_, data.d_dw_); + copy_device_vec(data.d_dx_aff_, data.d_dx_); + copy_device_vec(data.d_dy_aff_, data.d_dy_); + copy_device_vec(data.d_dv_aff_, data.d_dv_); + copy_device_vec(data.d_dz_aff_, data.d_dz_); +} + template i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t& data, pinned_dense_vector_t& dw, @@ -2716,7 +2748,8 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t& dz, f_t& dual_perturb, f_t& primal_perturb, - f_t& max_residual) + f_t& max_residual, + bool snapshot_affine_direction) { raft::common::nvtx::range fun_scope("Barrier: compute_search_direction"); @@ -3017,9 +3050,6 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t(data.d_dz_.data(), linear_dz_size), raft::device_span(data.d_is_direct_free_linear_.data(), linear_dz_size), stream_view_); - raft::copy(dz.data(), data.d_dz_.data(), data.d_dz_.size(), stream_view_); } if (debug) { @@ -3332,7 +3357,6 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::compute_target_mu( const bool has_soc = data.has_cones(); f_t complementarity_aff_sum = 0.0; - // TMP no copy and data should always be on the GPU - data.d_dw_aff_.resize(data.dw_aff.size(), stream_view_); - data.d_dx_aff_.resize(data.dx_aff.size(), stream_view_); - data.d_dv_aff_.resize(data.dv_aff.size(), stream_view_); - data.d_dz_aff_.resize(data.dz_aff.size(), stream_view_); - - raft::copy(data.d_dw_aff_.data(), data.dw_aff.data(), data.dw_aff.size(), stream_view_); - raft::copy(data.d_dx_aff_.data(), data.dx_aff.data(), data.dx_aff.size(), stream_view_); - raft::copy(data.d_dv_aff_.data(), data.dv_aff.data(), data.dv_aff.size(), stream_view_); - raft::copy(data.d_dz_aff_.data(), data.dz_aff.data(), data.dz_aff.size(), stream_view_); f_t step_primal_aff = std::min(compute_nonnegative_step_length(data, data.d_w_, data.d_dw_aff_), compute_nonnegative_step_length(data, data.d_x_, data.d_dx_aff_)); @@ -3799,16 +3819,18 @@ template void barrier_solver_t::compute_final_direction(iteration_data_t& data) { raft::common::nvtx::range fun_scope("Barrier: compute_final_direction"); - data.d_dy_aff_.resize(data.dy_aff.size(), stream_view_); - raft::copy(data.d_dy_aff_.data(), data.dy_aff.data(), data.dy_aff.size(), stream_view_); #ifdef FINITE_CHECK for (i_t i = 0; i < (int)data.y.size(); i++) { cuopt_assert(std::isfinite(data.y[i]), "data.d_y_[i] is not finite"); } - for (i_t i = 0; i < (int)data.dy_aff.size(); i++) { - cuopt_assert(std::isfinite(data.dy_aff[i]), "data.dy_aff_[i] is not finite"); + if (data.d_dy_aff_.size() > 0) { + const auto dy_aff_host = host_copy(data.d_dy_aff_, stream_view_); + stream_view_.synchronize(); + for (i_t i = 0; i < static_cast(dy_aff_host.size()); ++i) { + cuopt_assert(std::isfinite(dy_aff_host[i]), "data.d_dy_aff_[i] is not finite"); + } } #endif @@ -4437,13 +4459,12 @@ lp_status_t barrier_solver_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t& data, const rmm::device_uvector& x, const rmm::device_uvector& dx); + /** + * @brief Solve for a Mehrotra search direction on the GPU. + * + * When snapshot_affine_direction is true (affine predictor step), copies the completed + * direction into data.d_*_aff_ for compute_target_mu and compute_final_direction. + * + * @param snapshot_affine_direction If true, snapshot (dw, dx, dy, dv, dz) into d_*_aff_. + * @return 0 on success, a negative value on failure, or CONCURRENT_HALT_RETURN if halted. + */ i_t gpu_compute_search_direction(iteration_data_t& data, pinned_dense_vector_t& dw, pinned_dense_vector_t& dx, @@ -108,7 +117,8 @@ class barrier_solver_t { pinned_dense_vector_t& dz, f_t& dual_perturb, f_t& primal_perturb, - f_t& max_residual); + f_t& max_residual, + bool snapshot_affine_direction = false); private: lp_status_t check_for_suboptimal_solution(iteration_data_t& data,