diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index a9b3aa891..1cbeed5cf 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -244,27 +245,7 @@ class iteration_data_t { A(lp.A), Q(Qin), cusparse_Q_view_(lp.handle_ptr, Q), - primal_residual(lp.num_rows), - bound_residual(num_upper_bounds), - dual_residual(lp.num_cols), - complementarity_xz_residual(lp.num_cols), - complementarity_wv_residual(num_upper_bounds), cusparse_view_(lp.handle_ptr, lp.A), - primal_rhs(lp.num_rows), - bound_rhs(num_upper_bounds), - dual_rhs(lp.num_cols), - complementarity_xz_rhs(lp.num_cols), - complementarity_wv_rhs(num_upper_bounds), - dw_aff(num_upper_bounds), - dx_aff(lp.num_cols), - dy_aff(lp.num_rows), - dv_aff(num_upper_bounds), - dz_aff(lp.num_cols), - dw(num_upper_bounds), - dx(lp.num_cols), - dy(lp.num_rows), - dv(num_upper_bounds), - dz(lp.num_cols), cusparse_info(lp.handle_ptr), device_AD(lp.num_cols, lp.num_rows, 0, lp.handle_ptr->get_stream()), device_A(lp.num_cols, lp.num_rows, 0, lp.handle_ptr->get_stream()), @@ -304,6 +285,7 @@ class iteration_data_t { d_augmented_rhs_(0, lp.handle_ptr->get_stream()), d_augmented_soln_(0, lp.handle_ptr->get_stream()), d_c_(lp.num_cols, lp.handle_ptr->get_stream()), + d_b_(lp.num_rows, lp.handle_ptr->get_stream()), d_upper_(0, lp.handle_ptr->get_stream()), d_u_(lp.A.n, lp.handle_ptr->get_stream()), d_upper_bounds_(0, lp.handle_ptr->get_stream()), @@ -336,6 +318,7 @@ class iteration_data_t { d_Q_diag_(0, lp.handle_ptr->get_stream()), d_Qx_(Qin.m, lp.handle_ptr->get_stream()), restrict_u_(0), + d_restrict_u_(0, lp.handle_ptr->get_stream()), transform_reduce_helper_(lp.handle_ptr->get_stream()), sum_reduce_helper_(lp.handle_ptr->get_stream()), indefinite_Q(false), @@ -364,6 +347,8 @@ class iteration_data_t { } } + raft::copy(d_c_.data(), c.data(), c.size(), stream_view_); + raft::copy(d_b_.data(), b.data(), b.size(), stream_view_); bool has_Q = Q.x.size() > 0; indefinite_Q = false; { @@ -1398,7 +1383,8 @@ class iteration_data_t { // Solve ADAT * x = b return chol->solve(d_b, d_x); } else { - // TMP until this is ported to the GPU + raft::copy(inv_diag.data(), d_inv_diag.data(), d_inv_diag.size(), stream_view_); + stream_view_.synchronize(); dense_vector_t b = host_copy(d_b, stream_view_); dense_vector_t x = host_copy(d_x, stream_view_); @@ -1419,7 +1405,6 @@ class iteration_data_t { f_t objective, f_t user_objective, f_t primal_residual, - f_t dual_residual, cusparse_view_t& cusparse_view, lp_solution_t& solution) { @@ -1897,15 +1882,15 @@ class iteration_data_t { raft::handle_t const* handle_ptr; i_t n_upper_bounds; - pinned_dense_vector_t upper_bounds; - pinned_dense_vector_t c; - pinned_dense_vector_t b; + dense_vector_t upper_bounds; + dense_vector_t c; + dense_vector_t b; - pinned_dense_vector_t w; - pinned_dense_vector_t x; + dense_vector_t w; + dense_vector_t x; dense_vector_t y; - pinned_dense_vector_t v; - pinned_dense_vector_t z; + dense_vector_t v; + dense_vector_t z; dense_vector_t w_save; dense_vector_t x_save; @@ -1919,9 +1904,9 @@ class iteration_data_t { f_t dual_residual_norm_save; f_t complementarity_residual_norm_save; - pinned_dense_vector_t diag; + dense_vector_t diag; pinned_dense_vector_t inv_diag; - pinned_dense_vector_t inv_sqrt_diag; + dense_vector_t inv_sqrt_diag; rmm::device_uvector d_original_A_values; @@ -1981,29 +1966,6 @@ class iteration_data_t { bool has_solve_info; i_t num_factorizations; - pinned_dense_vector_t primal_residual; - pinned_dense_vector_t bound_residual; - pinned_dense_vector_t dual_residual; - pinned_dense_vector_t complementarity_xz_residual; - pinned_dense_vector_t complementarity_wv_residual; - - pinned_dense_vector_t primal_rhs; - pinned_dense_vector_t bound_rhs; - pinned_dense_vector_t dual_rhs; - pinned_dense_vector_t complementarity_xz_rhs; - pinned_dense_vector_t complementarity_wv_rhs; - - pinned_dense_vector_t dw_aff; - pinned_dense_vector_t dx_aff; - pinned_dense_vector_t dy_aff; - pinned_dense_vector_t dv_aff; - pinned_dense_vector_t dz_aff; - - pinned_dense_vector_t dw; - pinned_dense_vector_t dx; - pinned_dense_vector_t dy; - pinned_dense_vector_t dv; - pinned_dense_vector_t dz; cusparse_info_t cusparse_info; cusparse_view_t cusparse_view_; detail::cusparse_dn_vec_descr_wrapper_t cusparse_tmp4_; @@ -2039,6 +2001,7 @@ class iteration_data_t { rmm::device_uvector d_augmented_rhs_; rmm::device_uvector d_augmented_soln_; rmm::device_uvector d_c_; + rmm::device_uvector d_b_; rmm::device_uvector d_upper_; rmm::device_uvector d_u_; rmm::device_uvector d_upper_bounds_; @@ -2077,7 +2040,8 @@ class iteration_data_t { rmm::device_uvector d_Q_diag_; rmm::device_uvector d_Qx_; - pinned_dense_vector_t restrict_u_; + dense_vector_t restrict_u_; + rmm::device_uvector d_restrict_u_; transform_reduce_helper_t transform_reduce_helper_; sum_reduce_helper_t sum_reduce_helper_; @@ -2340,20 +2304,22 @@ int barrier_solver_t::initial_point(iteration_data_t& data) } // Verify A*x = b - data.primal_residual = lp.rhs; - data.cusparse_view_.spmv(1.0, data.x, -1.0, data.primal_residual); + dense_vector_t init_primal_residual(lp.num_rows); + init_primal_residual = lp.rhs; + data.cusparse_view_.spmv(1.0, data.x, -1.0, init_primal_residual); data.handle_ptr->get_stream().synchronize(); #ifdef PRINT_INFO - settings.log.printf("||b - A * x||: %.16e\n", vector_norm2(data.primal_residual)); + settings.log.printf("||b - A * x||: %.16e\n", vector_norm2(init_primal_residual)); #endif if (data.n_upper_bounds > 0) { + dense_vector_t init_bound_residual(data.n_upper_bounds); for (i_t k = 0; k < data.n_upper_bounds; k++) { i_t j = data.upper_bounds[k]; - data.bound_residual[k] = lp.upper[j] - data.w[k] - data.x[j]; + init_bound_residual[k] = lp.upper[j] - data.w[k] - data.x[j]; } #ifdef PRINT_INFO - settings.log.printf("|| u - w - x||: %e\n", vector_norm2(data.bound_residual)); + settings.log.printf("|| u - w - x||: %e\n", vector_norm2(init_bound_residual)); #endif } @@ -2455,18 +2421,19 @@ int barrier_solver_t::initial_point(iteration_data_t& data) } // Verify A'*y + z - E*v - Q*x = c - data.z.pairwise_subtract(data.c, data.dual_residual); - if (data.Q.n > 0) { matrix_vector_multiply(data.Q, -1.0, data.x, 1.0, data.dual_residual); } - data.cusparse_view_.transpose_spmv(1.0, data.y, 1.0, data.dual_residual); + dense_vector_t init_dual_residual(lp.num_cols); + data.z.pairwise_subtract(data.c, init_dual_residual); + if (data.Q.n > 0) { matrix_vector_multiply(data.Q, -1.0, data.x, 1.0, init_dual_residual); } + data.cusparse_view_.transpose_spmv(1.0, data.y, 1.0, init_dual_residual); if (data.n_upper_bounds > 0) { for (i_t k = 0; k < data.n_upper_bounds; k++) { i_t j = data.upper_bounds[k]; - data.dual_residual[j] -= data.v[k]; + init_dual_residual[j] -= data.v[k]; } } #ifdef PRINT_INFO settings.log.printf("||A^T y + z - E*v - Q*x - c ||: %e\n", - vector_norm2(data.dual_residual)); + vector_norm2(init_dual_residual)); #endif // Make sure (w, x, v, z) > 0. Skip free variables being handled directly. data.w.ensure_positive(epsilon_adjust); @@ -2500,24 +2467,10 @@ void barrier_solver_t::gpu_compute_residuals(const rmm::device_uvector { raft::common::nvtx::range fun_scope("Barrier: GPU compute_residuals"); - data.d_primal_residual_.resize(data.primal_residual.size(), stream_view_); - raft::copy(data.d_primal_residual_.data(), lp.rhs.data(), lp.rhs.size(), stream_view_); + data.d_primal_residual_.resize(lp.num_rows, stream_view_); + raft::copy(data.d_primal_residual_.data(), data.d_b_.data(), data.d_b_.size(), stream_view_); - data.d_dual_residual_.resize(data.dual_residual.size(), stream_view_); - raft::copy(data.d_dual_residual_.data(), - data.dual_residual.data(), - data.dual_residual.size(), - stream_view_); - data.d_upper_bounds_.resize(data.n_upper_bounds, stream_view_); - raft::copy( - data.d_upper_bounds_.data(), data.upper_bounds.data(), data.n_upper_bounds, stream_view_); - data.d_upper_.resize(lp.upper.size(), stream_view_); - raft::copy(data.d_upper_.data(), lp.upper.data(), lp.upper.size(), stream_view_); - data.d_bound_residual_.resize(data.bound_residual.size(), stream_view_); - raft::copy(data.d_bound_residual_.data(), - data.bound_residual.data(), - data.bound_residual.size(), - stream_view_); + data.d_dual_residual_.resize(lp.num_cols, stream_view_); // Compute primal_residual = b - A*x @@ -2540,19 +2493,16 @@ void barrier_solver_t::gpu_compute_residuals(const rmm::device_uvector } // Compute dual_residual = c - A'*y - z + E*v + Q*x - if (data.Q.n > 0) { - raft::copy(data.d_c_.data(), data.c.data(), data.c.size(), stream_view_); - auto cusparse_d_c = data.cusparse_view_.create_vector(data.d_c_); - data.cusparse_Q_view_.spmv(1.0, cusparse_d_x, 1.0, cusparse_d_c); - } else { - raft::copy(data.d_c_.data(), data.c.data(), data.c.size(), stream_view_); - } cub::DeviceTransform::Transform(cuda::std::make_tuple(data.d_c_.data(), data.d_z_.data()), data.d_dual_residual_.data(), data.d_dual_residual_.size(), cuda::std::minus<>{}, stream_view_.value()); RAFT_CHECK_CUDA(stream_view_); + if (data.Q.n > 0) { + auto descr_dual_residual = data.cusparse_view_.create_vector(data.d_dual_residual_); + data.cusparse_Q_view_.spmv(1.0, cusparse_d_x, 1.0, descr_dual_residual); + } // Compute dual_residual = c - A'*y - z + E*v auto cusparse_d_y = data.cusparse_view_.create_vector(d_y); auto descr_dual_residual = data.cusparse_view_.create_vector(data.d_dual_residual_); @@ -2584,69 +2534,6 @@ void barrier_solver_t::gpu_compute_residuals(const rmm::device_uvector cuda::std::multiplies<>{}, stream_view_.value()); RAFT_CHECK_CUDA(stream_view_); - raft::copy(data.complementarity_wv_residual.data(), - data.d_complementarity_wv_residual_.data(), - data.d_complementarity_wv_residual_.size(), - stream_view_); - raft::copy(data.complementarity_xz_residual.data(), - data.d_complementarity_xz_residual_.data(), - data.d_complementarity_xz_residual_.size(), - stream_view_); - raft::copy(data.dual_residual.data(), - data.d_dual_residual_.data(), - data.d_dual_residual_.size(), - stream_view_); - raft::copy(data.primal_residual.data(), - data.d_primal_residual_.data(), - data.d_primal_residual_.size(), - stream_view_); - raft::copy(data.bound_residual.data(), - data.d_bound_residual_.data(), - data.d_bound_residual_.size(), - stream_view_); - // Sync to make sure host data has been copied - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); -} - -template -template -void barrier_solver_t::compute_residuals(const dense_vector_t& w, - const dense_vector_t& x, - const dense_vector_t& y, - const dense_vector_t& v, - const dense_vector_t& z, - iteration_data_t& data) -{ - raft::common::nvtx::range fun_scope("Barrier: CPU compute_residuals"); - - // Compute primal_residual = b - A*x - data.primal_residual = lp.rhs; - data.cusparse_view_.spmv(-1.0, x, 1.0, data.primal_residual); - - // Compute bound_residual = E'*u - w - E'*x - if (data.n_upper_bounds > 0) { - for (i_t k = 0; k < data.n_upper_bounds; k++) { - i_t j = data.upper_bounds[k]; - data.bound_residual[k] = lp.upper[j] - w[k] - x[j]; - } - } - - // Compute dual_residual = c - A'*y - z + E*v + Q*x - data.c.pairwise_subtract(z, data.dual_residual); - data.cusparse_view_.transpose_spmv(-1.0, y, 1.0, data.dual_residual); - if (data.n_upper_bounds > 0) { - for (i_t k = 0; k < data.n_upper_bounds; k++) { - i_t j = data.upper_bounds[k]; - data.dual_residual[j] += v[k]; - } - } - if (data.Q.n > 0) { matrix_vector_multiply(data.Q, 1.0, x, 1.0, data.dual_residual); } - - // Compute complementarity_xz_residual = x.*z - x.pairwise_product(z, data.complementarity_xz_residual); - - // Compute complementarity_wv_residual = w.*v - w.pairwise_product(v, data.complementarity_wv_residual); } template @@ -2709,11 +2596,6 @@ f_t barrier_solver_t::compute_nonnegative_step_length(iteration_data_t template i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t& data, - pinned_dense_vector_t& dw, - pinned_dense_vector_t& dx, - pinned_dense_vector_t& dy, - pinned_dense_vector_t& dv, - pinned_dense_vector_t& dz, f_t& dual_perturb, f_t& primal_perturb, f_t& max_residual) @@ -2728,11 +2610,21 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t(data, stream_view_); - data.d_dx_.resize(dx.size(), stream_view_); - data.d_dy_.resize(dy.size(), stream_view_); - data.d_dz_.resize(dz.size(), stream_view_); - data.d_dv_.resize(dv.size(), stream_view_); + { + raft::common::nvtx::range fun_scope("Barrier: GPU allocation and copies"); + + // RHS and state are already on device (set by compute_affine_rhs/compute_cc_rhs) + data.d_upper_bounds_.resize(data.upper_bounds.size(), stream_view_); + data.d_dy_.resize(lp.num_rows, stream_view_); + data.d_dx_.resize(lp.num_cols, stream_view_); + data.d_dz_.resize(lp.num_cols, stream_view_); + data.d_dv_.resize(data.n_upper_bounds, stream_view_); + data.d_dw_.resize(data.n_upper_bounds, stream_view_); + data.d_dw_residual_.resize(data.n_upper_bounds, stream_view_); + data.d_wv_residual_.resize(data.d_complementarity_wv_rhs_.size(), stream_view_); + data.d_xz_residual_.resize(data.d_complementarity_xz_rhs_.size(), stream_view_); + data.d_bound_residual_.resize(data.n_upper_bounds, stream_view_); + } // Solves the linear system // @@ -2838,8 +2730,6 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_tsolve(data.d_augmented_rhs_, data.d_augmented_soln_); struct op_t { op_t(iteration_data_t& data) : data_(data) {} @@ -3017,8 +2903,6 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t x_residual = data.primal_rhs; - data.cusparse_view_.spmv(1.0, dx, -1.0, x_residual); - const f_t x_residual_norm = vector_norm_inf(x_residual, stream_view_); - max_residual = std::max(max_residual, x_residual_norm); - if (x_residual_norm > 1e-2) { - settings.log.printf("|| A * dx - rp || = %.2e\n", x_residual_norm); - } - } - { raft::common::nvtx::range fun_scope("Barrier: dz formation GPU"); @@ -3277,7 +3142,6 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t(data.d_dz_.data(), linear_dz_size), raft::device_span(data.d_is_direct_free_linear_.data(), linear_dz_size), stream_view_); - raft::copy(dz.data(), data.d_dz_.data(), data.d_dz_.size(), stream_view_); } if (debug) { @@ -3332,7 +3196,6 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t -void copy_host_iterate_to_device(iteration_data_t& data, rmm::cuda_stream_view stream) -{ - raft::common::nvtx::range fun_scope("Barrier: copy_host_iterate_to_device"); - - data.d_x_.resize(data.x.size(), stream); - data.d_z_.resize(data.z.size(), stream); - raft::copy(data.d_x_.data(), data.x.data(), data.x.size(), stream); - raft::copy(data.d_z_.data(), data.z.data(), data.z.size(), stream); - - data.d_w_.resize(data.w.size(), stream); - data.d_v_.resize(data.v.size(), stream); - raft::copy(data.d_w_.data(), data.w.data(), data.w.size(), stream); - raft::copy(data.d_v_.data(), data.v.data(), data.v.size(), stream); - - data.d_y_.resize(data.y.size(), stream); - raft::copy(data.d_y_.data(), data.y.data(), data.y.size(), stream); - - data.d_upper_bounds_.resize(data.upper_bounds.size(), stream); - raft::copy( - data.d_upper_bounds_.data(), data.upper_bounds.data(), data.upper_bounds.size(), stream); -} - -// One-time static problem data (constant across barrier iterations). -template -void copy_static_problem_data_to_device(const lp_problem_t& lp, - iteration_data_t& data, - rmm::cuda_stream_view stream) -{ - raft::common::nvtx::range fun_scope("Barrier: copy_static_problem_data_to_device"); - - data.d_primal_residual_.resize(lp.rhs.size(), stream); - raft::copy(data.d_primal_residual_.data(), lp.rhs.data(), lp.rhs.size(), stream); - - data.d_upper_.resize(lp.upper.size(), stream); - raft::copy(data.d_upper_.data(), lp.upper.data(), lp.upper.size(), stream); - - data.d_bound_residual_.resize(data.bound_residual.size(), stream); - data.d_dw_residual_.resize(data.n_upper_bounds, stream); -} - -// Per Mehrotra step: affine/corrector RHS only (iterate stays on device from initial sync / -// next_iterate). -template -void copy_step_rhs_to_device(iteration_data_t& data, rmm::cuda_stream_view stream) -{ - raft::common::nvtx::range fun_scope("Barrier: copy_step_rhs_to_device"); - - data.d_bound_rhs_.resize(data.bound_rhs.size(), stream); - raft::copy(data.d_bound_rhs_.data(), data.bound_rhs.data(), data.bound_rhs.size(), stream); - - raft::copy(data.d_h_.data(), data.primal_rhs.data(), data.primal_rhs.size(), stream); - raft::copy(data.d_dual_rhs_.data(), data.dual_rhs.data(), data.dual_rhs.size(), stream); - - data.d_dw_.resize(data.bound_rhs.size(), stream); - raft::copy(data.d_dw_.data(), data.bound_rhs.data(), data.bound_rhs.size(), stream); - - data.d_xz_residual_.resize(data.d_complementarity_xz_rhs_.size(), stream); - data.d_wv_residual_.resize(data.d_complementarity_wv_rhs_.size(), stream); -} - template void fill_linear_complementarity_target(iteration_data_t& data, raft::device_span target, @@ -3548,34 +3348,6 @@ void fill_linear_complementarity_target(iteration_data_t& data, RAFT_CHECK_CUDA(stream); } -template -static f_t host_complementarity_residual_norm(const iteration_data_t& data, - const std::vector& second_order_cone_dims, - rmm::cuda_stream_view stream) -{ - const i_t linear_xz_size = data.linear_xz_size(data.complementarity_xz_residual.size()); - raft::host_span linear_xz_span = - raft::host_span(data.complementarity_xz_residual.data(), linear_xz_size); - f_t complementarity_residual_norm = - std::max(vector_norm_inf(linear_xz_span, stream), - vector_norm_inf(data.complementarity_wv_residual, stream)); - if (data.has_cones()) { - f_t cone_complementarity_norm = f_t(0); - i_t offset = data.cone_start(); - for (i_t q_k : second_order_cone_dims) { - f_t cone_dot = f_t(0); - for (i_t j = 0; j < q_k; ++j) { - cone_dot += data.complementarity_xz_residual[offset + j]; - } - cone_complementarity_norm = std::max(cone_complementarity_norm, cone_dot); - offset += q_k; - } - complementarity_residual_norm = - std::max(complementarity_residual_norm, cone_complementarity_norm); - } - return complementarity_residual_norm; -} - template void fill_affine_cone_complementarity_target(iteration_data_t& data, i_t cone_var_start, @@ -3625,21 +3397,36 @@ void barrier_solver_t::compute_affine_rhs(iteration_data_t& const i_t cone_var_start = data.cone_start(); const i_t m_c = data.cone_entry_count(); - data.primal_rhs = data.primal_residual; - data.bound_rhs = data.bound_residual; - data.dual_rhs = data.dual_residual; + // D2D: RHS = residuals (all on device) data.cone_combined_step_ = false; data.cone_sigma_mu_ = f_t(0); + raft::copy( + data.d_h_.data(), data.d_primal_residual_.data(), data.d_primal_residual_.size(), stream_view_); + raft::copy(data.d_dual_rhs_.data(), + data.d_dual_residual_.data(), + data.d_dual_residual_.size(), + stream_view_); + data.d_bound_rhs_.resize(data.d_bound_residual_.size(), stream_view_); + raft::copy(data.d_bound_rhs_.data(), + data.d_bound_residual_.data(), + data.d_bound_residual_.size(), + stream_view_); + data.d_dw_.resize(data.d_bound_residual_.size(), stream_view_); + raft::copy( + data.d_dw_.data(), data.d_bound_residual_.data(), data.d_bound_residual_.size(), stream_view_); - // xz -> -x .* z; + // xz -> -x .* z for the linear complementarity block. negate_complementarity_rhs( raft::device_span(data.d_complementarity_xz_rhs_.data(), linear_size), raft::device_span(data.d_complementarity_xz_residual_.data(), linear_size), stream_view_); - // w.*v -> -w .* v; - negate_complementarity_rhs(cuopt::make_span(data.d_complementarity_wv_rhs_), - cuopt::make_span(data.d_complementarity_wv_residual_), - stream_view_); + // w.*v -> -w .* v. + negate_complementarity_rhs( + raft::device_span(data.d_complementarity_wv_rhs_.data(), + data.d_complementarity_wv_rhs_.size()), + raft::device_span(data.d_complementarity_wv_residual_.data(), + data.d_complementarity_wv_residual_.size()), + stream_view_); RAFT_CHECK_CUDA(stream_view_); fill_linear_complementarity_target( @@ -3663,16 +3450,6 @@ void barrier_solver_t::compute_target_mu( f_t complementarity_aff_sum = 0.0; // TMP no copy and data should always be on the GPU - data.d_dw_aff_.resize(data.dw_aff.size(), stream_view_); - data.d_dx_aff_.resize(data.dx_aff.size(), stream_view_); - data.d_dv_aff_.resize(data.dv_aff.size(), stream_view_); - data.d_dz_aff_.resize(data.dz_aff.size(), stream_view_); - - raft::copy(data.d_dw_aff_.data(), data.dw_aff.data(), data.dw_aff.size(), stream_view_); - raft::copy(data.d_dx_aff_.data(), data.dx_aff.data(), data.dx_aff.size(), stream_view_); - raft::copy(data.d_dv_aff_.data(), data.dv_aff.data(), data.dv_aff.size(), stream_view_); - raft::copy(data.d_dz_aff_.data(), data.dz_aff.data(), data.dz_aff.size(), stream_view_); - f_t step_primal_aff = std::min(compute_nonnegative_step_length(data, data.d_w_, data.d_dw_aff_), compute_nonnegative_step_length(data, data.d_x_, data.d_dx_aff_)); f_t step_dual_aff = std::min(compute_nonnegative_step_length(data, data.d_v_, data.d_dv_aff_), @@ -3787,10 +3564,13 @@ void barrier_solver_t::compute_cc_rhs(iteration_data_t& data [new_mu] HD(f_t dw_aff, f_t dv_aff) { return -(dw_aff * dv_aff) + new_mu; }, stream_view_.value()); RAFT_CHECK_CUDA(stream_view_); - // TMP should be CPU to 0 if CPU and GPU to 0 if GPU - data.primal_rhs.set_scalar(0.0); - data.bound_rhs.set_scalar(0.0); - data.dual_rhs.set_scalar(0.0); + // Zero the corrector RHS on device + thrust::fill(rmm::exec_policy(stream_view_), data.d_h_.begin(), data.d_h_.end(), f_t(0)); + thrust::fill( + rmm::exec_policy(stream_view_), data.d_dual_rhs_.begin(), data.d_dual_rhs_.end(), f_t(0)); + thrust::fill( + rmm::exec_policy(stream_view_), data.d_bound_rhs_.begin(), data.d_bound_rhs_.end(), f_t(0)); + thrust::fill(rmm::exec_policy(stream_view_), data.d_dw_.begin(), data.d_dw_.end(), f_t(0)); data.cone_combined_step_ = has_soc; data.cone_sigma_mu_ = has_soc ? new_mu : f_t(0); } @@ -3799,18 +3579,6 @@ template void barrier_solver_t::compute_final_direction(iteration_data_t& data) { raft::common::nvtx::range fun_scope("Barrier: compute_final_direction"); - data.d_dy_aff_.resize(data.dy_aff.size(), stream_view_); - raft::copy(data.d_dy_aff_.data(), data.dy_aff.data(), data.dy_aff.size(), stream_view_); - -#ifdef FINITE_CHECK - for (i_t i = 0; i < (int)data.y.size(); i++) { - cuopt_assert(std::isfinite(data.y[i]), "data.d_y_[i] is not finite"); - } - - for (i_t i = 0; i < (int)data.dy_aff.size(); i++) { - cuopt_assert(std::isfinite(data.dy_aff[i]), "data.dy_aff_[i] is not finite"); - } -#endif // dw = dw_aff + dw_cc // dx = dx_aff + dx_cc @@ -3945,14 +3713,6 @@ void barrier_solver_t::compute_next_iterate(iteration_data_t span_x[v] -= eta; }); } - - raft::copy(data.w.data(), data.d_w_.data(), data.d_w_.size(), stream_view_); - raft::copy(data.x.data(), data.d_x_.data(), data.d_x_.size(), stream_view_); - raft::copy(data.y.data(), data.d_y_.data(), data.d_y_.size(), stream_view_); - raft::copy(data.v.data(), data.d_v_.data(), data.d_v_.size(), stream_view_); - raft::copy(data.z.data(), data.d_z_.data(), data.d_z_.size(), stream_view_); - // Sync to make sure all host variable are done copying - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } template @@ -3994,9 +3754,6 @@ void barrier_solver_t::compute_primal_dual_objective(iteration_data_t< f_t& dual_objective) { raft::common::nvtx::range fun_scope("Barrier: compute_primal_dual_objective"); - raft::copy(data.d_c_.data(), data.c.data(), data.c.size(), stream_view_); - auto d_b = device_copy(data.b, stream_view_); - auto d_restrict_u = device_copy(data.restrict_u_, stream_view_); rmm::device_scalar d_cx(stream_view_); rmm::device_scalar d_by(stream_view_); rmm::device_scalar d_uv(stream_view_); @@ -4010,16 +3767,16 @@ void barrier_solver_t::compute_primal_dual_objective(iteration_data_t< d_cx.data(), stream_view_)); RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(lp.handle_ptr->get_cublas_handle(), - d_b.size(), - d_b.data(), + data.d_b_.size(), + data.d_b_.data(), 1, data.d_y_.data(), 1, d_by.data(), stream_view_)); RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(lp.handle_ptr->get_cublas_handle(), - d_restrict_u.size(), - d_restrict_u.data(), + data.d_restrict_u_.size(), + data.d_restrict_u_.data(), 1, data.d_v_.data(), 1, @@ -4156,12 +3913,16 @@ lp_status_t barrier_solver_t::check_for_suboptimal_solution( relative_dual_residual < settings.barrier_relaxed_optimality_tol && relative_complementarity_residual < settings.barrier_relaxed_complementarity_tol && primal_objective == primal_objective) { + raft::copy(data.x.data(), data.d_x_.data(), data.d_x_.size(), stream_view_); + raft::copy(data.y.data(), data.d_y_.data(), data.d_y_.size(), stream_view_); + raft::copy(data.z.data(), data.d_z_.data(), data.d_z_.size(), stream_view_); + raft::copy(data.v.data(), data.d_v_.data(), data.d_v_.size(), stream_view_); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); data.to_solution(lp, iter, primal_objective, compute_user_objective(lp, primal_objective), - vector_norm2(data.primal_residual), - vector_norm2(data.dual_residual), + primal_residual_norm, data.cusparse_view_, solution); settings.log.printf("\n"); @@ -4195,12 +3956,16 @@ lp_status_t barrier_solver_t::check_for_suboptimal_solution( data.relative_dual_residual_save < settings.barrier_relaxed_optimality_tol && data.relative_complementarity_residual_save < settings.barrier_relaxed_complementarity_tol) { settings.log.printf("Restoring previous solution\n"); + raft::copy(data.x.data(), data.d_x_.data(), data.d_x_.size(), stream_view_); + raft::copy(data.y.data(), data.d_y_.data(), data.d_y_.size(), stream_view_); + raft::copy(data.z.data(), data.d_z_.data(), data.d_z_.size(), stream_view_); + raft::copy(data.v.data(), data.d_v_.data(), data.d_v_.size(), stream_view_); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); data.to_solution(lp, iter, primal_objective_save, compute_user_objective(lp, primal_objective_save), data.primal_residual_norm_save, - data.dual_residual_norm_save, data.cusparse_view_, solution); settings.log.printf("\n"); @@ -4307,22 +4072,40 @@ lp_status_t barrier_solver_t::solve(f_t start_time, lp_solution_t(data, stream_view_); - copy_static_problem_data_to_device(lp, data, stream_view_); - - compute_residuals>(data.w, data.x, data.y, data.v, data.z, data); - - f_t primal_residual_norm = - std::max(vector_norm_inf(data.primal_residual, stream_view_), - vector_norm_inf(data.bound_residual, stream_view_)); - f_t dual_residual_norm = vector_norm_inf(data.dual_residual, stream_view_); - f_t complementarity_residual_norm = - host_complementarity_residual_norm(data, lp.second_order_cone_dims, stream_view_); - - f_t mu_denom = data.complementarity_degree(n, num_upper_bounds); - f_t mu = - (data.complementarity_xz_residual.sum() + data.complementarity_wv_residual.sum()) / mu_denom; + // Upload initial point to device and compute initial residuals/norms on GPU + data.d_complementarity_wv_residual_.resize(data.n_upper_bounds, stream_view_); + data.d_complementarity_wv_rhs_.resize(data.n_upper_bounds, stream_view_); + data.d_x_.resize(data.x.size(), stream_view_); + raft::copy(data.d_x_.data(), data.x.data(), data.x.size(), stream_view_); + data.d_y_.resize(data.y.size(), stream_view_); + raft::copy(data.d_y_.data(), data.y.data(), data.y.size(), stream_view_); + data.d_z_.resize(data.z.size(), stream_view_); + raft::copy(data.d_z_.data(), data.z.data(), data.z.size(), stream_view_); + data.d_w_.resize(data.w.size(), stream_view_); + raft::copy(data.d_w_.data(), data.w.data(), data.w.size(), stream_view_); + data.d_v_.resize(data.v.size(), stream_view_); + raft::copy(data.d_v_.data(), data.v.data(), data.v.size(), stream_view_); + data.d_upper_bounds_.resize(data.upper_bounds.size(), stream_view_); + raft::copy(data.d_upper_bounds_.data(), + data.upper_bounds.data(), + data.upper_bounds.size(), + stream_view_); + data.d_upper_.resize(lp.upper.size(), stream_view_); + raft::copy(data.d_upper_.data(), lp.upper.data(), lp.upper.size(), stream_view_); + data.d_bound_residual_.resize(data.n_upper_bounds, stream_view_); + + f_t primal_residual_norm, dual_residual_norm, complementarity_residual_norm; + gpu_compute_residual_norms(data.d_w_, + data.d_x_, + data.d_y_, + data.d_v_, + data.d_z_, + data, + primal_residual_norm, + dual_residual_norm, + complementarity_residual_norm); + f_t mu; + compute_mu(data, mu); f_t norm_b = vector_norm_inf(data.b, stream_view_); f_t norm_c = vector_norm_inf(data.c, stream_view_); @@ -4344,6 +4127,9 @@ lp_status_t barrier_solver_t::solve(f_t start_time, lp_solution_t upper(lp.upper); data.gather_upper_bounds(upper, data.restrict_u_); + data.d_restrict_u_.resize(data.restrict_u_.size(), stream_view_); + raft::copy( + data.d_restrict_u_.data(), data.restrict_u_.data(), data.restrict_u_.size(), stream_view_); f_t dual_objective = data.b.inner_product(data.y) - data.restrict_u_.inner_product(data.v) - quad_objective; @@ -4352,6 +4138,12 @@ lp_status_t barrier_solver_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t(data.primal_residual), - vector_norm2(data.dual_residual), + primal_residual_norm, data.cusparse_view_, solution); return lp_status_t::ITERATION_LIMIT; diff --git a/cpp/src/barrier/barrier.hpp b/cpp/src/barrier/barrier.hpp index f7c93b6de..b17cce397 100644 --- a/cpp/src/barrier/barrier.hpp +++ b/cpp/src/barrier/barrier.hpp @@ -7,7 +7,6 @@ #pragma once #include -#include #include #include @@ -49,13 +48,6 @@ class barrier_solver_t { f_t& dual_residual_norm, f_t& complementarity_residual_norm); - template - void compute_residuals(const dense_vector_t& w, - const dense_vector_t& x, - const dense_vector_t& y, - const dense_vector_t& v, - const dense_vector_t& z, - iteration_data_t& data); void compute_primal_dual_step_length(iteration_data_t& data, f_t step_scale, f_t& step_primal, @@ -101,11 +93,6 @@ class barrier_solver_t { const rmm::device_uvector& x, const rmm::device_uvector& dx); i_t gpu_compute_search_direction(iteration_data_t& data, - pinned_dense_vector_t& dw, - pinned_dense_vector_t& dx, - pinned_dense_vector_t& dy, - pinned_dense_vector_t& dv, - pinned_dense_vector_t& dz, f_t& dual_perturb, f_t& primal_perturb, f_t& max_residual);