diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ccc7ac..7910540 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(cupdlpx LANGUAGES C CXX CUDA) set(CUPDLPX_VERSION_MAJOR 0) set(CUPDLPX_VERSION_MINOR 1) -set(CUPDLPX_VERSION_PATCH 4) +set(CUPDLPX_VERSION_PATCH 5) set(CUPDLPX_VERSION "${CUPDLPX_VERSION_MAJOR}.${CUPDLPX_VERSION_MINOR}.${CUPDLPX_VERSION_PATCH}") add_compile_definitions(CUPDLPX_VERSION="${CUPDLPX_VERSION}") diff --git a/internal/internal_types.h b/internal/internal_types.h index 1f42582..aa69005 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -28,7 +28,9 @@ typedef struct int num_nonzeros; int *row_ptr; int *col_ind; + int *row_ind; double *val; + int *transpose_map; } cu_sparse_matrix_csr_t; typedef struct @@ -46,6 +48,7 @@ typedef struct int num_blocks_primal; int num_blocks_dual; int num_blocks_primal_dual; + int num_blocks_nnz; double objective_vector_norm; double constraint_bound_norm; double *constraint_lower_bound_finite_val; @@ -129,7 +132,6 @@ typedef struct typedef struct { - lp_problem_t *scaled_problem; double *con_rescale; double *var_rescale; double con_bound_rescale; diff --git a/internal/preconditioner.h b/internal/preconditioner.h index 8feb664..71a5c2b 100644 --- a/internal/preconditioner.h +++ b/internal/preconditioner.h @@ -25,7 +25,7 @@ extern "C" rescale_info_t *rescale_problem( const pdhg_parameters_t *params, - const lp_problem_t *original_problem); + pdhg_solver_state_t *state); #ifdef __cplusplus } diff --git a/pyproject.toml b/pyproject.toml index 0596175..4a4626d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "cupdlpx" -version = "0.1.3" +version = "0.1.5" description = "Python bindings for cuPDLPx (GPU-accelerated first-order LP solver)" readme = "README.md" license = { text = "Apache-2.0" } @@ -25,7 +25,7 @@ wheel.packages = ["python/cupdlpx"] sdist.include = ["tests/**", "pyproject.toml", "README.md", "LICENSE"] [tool.scikit-build.cmake.define] -CMAKE_CUDA_ARCHITECTURES = "75;80;86;89;90;90-virtual" +CMAKE_CUDA_ARCHITECTURES = "all" CMAKE_CUDA_STANDARD = "17" CUPDLPX_BUILD_PYTHON = "ON" CUPDLPX_BUILD_STATIC_LIB = "ON" diff --git a/src/cli.c b/src/cli.c index b9588ad..b4253d6 100644 --- a/src/cli.c +++ b/src/cli.c @@ -123,6 +123,7 @@ void save_solver_summary(const cupdlpx_result_t *result, const char *output_dir, } fprintf(outfile, "Termination Reason: %s\n", termination_reason_tToString(result->termination_reason)); + fprintf(outfile, "Precondition Time (sec): %e\n", result->rescaling_time_sec); fprintf(outfile, "Runtime (sec): %e\n", result->cumulative_time_sec); fprintf(outfile, "Iterations Count: %d\n", result->total_count); fprintf(outfile, "Primal Objective Value: %e\n", diff --git a/src/preconditioner.c b/src/preconditioner.c deleted file mode 100644 index 41b7542..0000000 --- a/src/preconditioner.c +++ /dev/null @@ -1,306 +0,0 @@ -/* -Copyright 2025 Haihao Lu - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#include "preconditioner.h" -#include "utils.h" -#include -#include -#include -#include - -#define SCALING_EPSILON 1e-12 - -static lp_problem_t *deepcopy_problem(const lp_problem_t *prob); -static void scale_problem(lp_problem_t *problem, const double *con_rescale, - const double *var_rescale); -static void ruiz_rescaling(lp_problem_t *problem, int num_iters, - double *cum_con_rescale, double *cum_var_rescale); -static void pock_chambolle_rescaling(lp_problem_t *problem, double alpha, - double *cum_con_rescale, - double *cum_var_rescale); - -static lp_problem_t *deepcopy_problem(const lp_problem_t *prob) -{ - lp_problem_t *new_prob = (lp_problem_t *)safe_malloc(sizeof(lp_problem_t)); - - new_prob->num_variables = prob->num_variables; - new_prob->num_constraints = prob->num_constraints; - new_prob->constraint_matrix_num_nonzeros = - prob->constraint_matrix_num_nonzeros; - new_prob->objective_constant = prob->objective_constant; - - size_t var_bytes = prob->num_variables * sizeof(double); - size_t con_bytes = prob->num_constraints * sizeof(double); - size_t nnz_bytes_val = prob->constraint_matrix_num_nonzeros * sizeof(double); - size_t nnz_bytes_col = prob->constraint_matrix_num_nonzeros * sizeof(int); - size_t row_ptr_bytes = (prob->num_constraints + 1) * sizeof(int); - - new_prob->variable_lower_bound = safe_malloc(var_bytes); - new_prob->variable_upper_bound = safe_malloc(var_bytes); - new_prob->objective_vector = safe_malloc(var_bytes); - new_prob->constraint_lower_bound = safe_malloc(con_bytes); - new_prob->constraint_upper_bound = safe_malloc(con_bytes); - new_prob->constraint_matrix_row_pointers = safe_malloc(row_ptr_bytes); - new_prob->constraint_matrix_col_indices = safe_malloc(nnz_bytes_col); - new_prob->constraint_matrix_values = safe_malloc(nnz_bytes_val); - - memcpy(new_prob->variable_lower_bound, prob->variable_lower_bound, var_bytes); - memcpy(new_prob->variable_upper_bound, prob->variable_upper_bound, var_bytes); - memcpy(new_prob->objective_vector, prob->objective_vector, var_bytes); - memcpy(new_prob->constraint_lower_bound, prob->constraint_lower_bound, - con_bytes); - memcpy(new_prob->constraint_upper_bound, prob->constraint_upper_bound, - con_bytes); - memcpy(new_prob->constraint_matrix_row_pointers, - prob->constraint_matrix_row_pointers, row_ptr_bytes); - memcpy(new_prob->constraint_matrix_col_indices, - prob->constraint_matrix_col_indices, nnz_bytes_col); - memcpy(new_prob->constraint_matrix_values, prob->constraint_matrix_values, - nnz_bytes_val); - - if (prob->primal_start) - { - new_prob->primal_start = safe_malloc(var_bytes); - memcpy(new_prob->primal_start, prob->primal_start, var_bytes); - } - else - { - new_prob->primal_start = NULL; - } - if (prob->dual_start) - { - new_prob->dual_start = safe_malloc(con_bytes); - memcpy(new_prob->dual_start, prob->dual_start, con_bytes); - } - else - { - new_prob->dual_start = NULL; - } - - return new_prob; -} - -static void scale_problem(lp_problem_t *problem, - const double *constraint_rescaling, - const double *variable_rescaling) -{ - for (int i = 0; i < problem->num_variables; ++i) - { - problem->objective_vector[i] /= variable_rescaling[i]; - problem->variable_upper_bound[i] *= variable_rescaling[i]; - problem->variable_lower_bound[i] *= variable_rescaling[i]; - } - for (int i = 0; i < problem->num_constraints; ++i) - { - problem->constraint_lower_bound[i] /= constraint_rescaling[i]; - problem->constraint_upper_bound[i] /= constraint_rescaling[i]; - } - - for (int row = 0; row < problem->num_constraints; ++row) - { - for (int nz_idx = problem->constraint_matrix_row_pointers[row]; - nz_idx < problem->constraint_matrix_row_pointers[row + 1]; ++nz_idx) - { - int col = problem->constraint_matrix_col_indices[nz_idx]; - problem->constraint_matrix_values[nz_idx] /= - (constraint_rescaling[row] * variable_rescaling[col]); - } - } -} - -static void ruiz_rescaling(lp_problem_t *problem, int num_iterations, - double *cum_constraint_rescaling, - double *cum_variable_rescaling) -{ - int num_cons = problem->num_constraints; - int num_vars = problem->num_variables; - double *con_rescale = safe_malloc(num_cons * sizeof(double)); - double *var_rescale = safe_malloc(num_vars * sizeof(double)); - - for (int iter = 0; iter < num_iterations; ++iter) - { - for (int i = 0; i < num_vars; ++i) - var_rescale[i] = 0.0; - for (int i = 0; i < num_cons; ++i) - con_rescale[i] = 0.0; - - for (int row = 0; row < num_cons; ++row) - { - for (int nz_idx = problem->constraint_matrix_row_pointers[row]; - nz_idx < problem->constraint_matrix_row_pointers[row + 1]; - ++nz_idx) - { - int col = problem->constraint_matrix_col_indices[nz_idx]; - if (col < 0 || col >= num_vars) - { - fprintf(stderr, - "Error: Invalid column index %d at nz_idx %d for row %d. " - "Must be in [0, %d).\n", - col, nz_idx, row, num_vars); - } - double val = fabs(problem->constraint_matrix_values[nz_idx]); - if (val > var_rescale[col]) - var_rescale[col] = val; - if (val > con_rescale[row]) - con_rescale[row] = val; - } - } - - for (int i = 0; i < num_vars; ++i) - var_rescale[i] = - (var_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(var_rescale[i]); - for (int i = 0; i < num_cons; ++i) - con_rescale[i] = - (con_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(con_rescale[i]); - - scale_problem(problem, con_rescale, var_rescale); - for (int i = 0; i < num_vars; ++i) - cum_variable_rescaling[i] *= var_rescale[i]; - for (int i = 0; i < num_cons; ++i) - cum_constraint_rescaling[i] *= con_rescale[i]; - } - free(con_rescale); - free(var_rescale); -} - -static void pock_chambolle_rescaling(lp_problem_t *problem, double alpha, - double *cum_constraint_rescaling, - double *cum_variable_rescaling) -{ - int num_cons = problem->num_constraints; - int num_vars = problem->num_variables; - double *con_rescale = safe_calloc(num_cons, sizeof(double)); - double *var_rescale = safe_calloc(num_vars, sizeof(double)); - - for (int row = 0; row < num_cons; ++row) - { - for (int nz_idx = problem->constraint_matrix_row_pointers[row]; - nz_idx < problem->constraint_matrix_row_pointers[row + 1]; ++nz_idx) - { - int col = problem->constraint_matrix_col_indices[nz_idx]; - double val = fabs(problem->constraint_matrix_values[nz_idx]); - var_rescale[col] += pow(val, 2.0 - alpha); - con_rescale[row] += pow(val, alpha); - } - } - - for (int i = 0; i < num_vars; ++i) - var_rescale[i] = - (var_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(var_rescale[i]); - for (int i = 0; i < num_cons; ++i) - con_rescale[i] = - (con_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(con_rescale[i]); - - scale_problem(problem, con_rescale, var_rescale); - for (int i = 0; i < num_vars; ++i) - cum_variable_rescaling[i] *= var_rescale[i]; - for (int i = 0; i < num_cons; ++i) - cum_constraint_rescaling[i] *= con_rescale[i]; - - free(con_rescale); - free(var_rescale); -} - -rescale_info_t *rescale_problem(const pdhg_parameters_t *params, - const lp_problem_t *original_problem) -{ - clock_t start_rescaling = clock(); - rescale_info_t *rescale_info = - (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); - rescale_info->scaled_problem = deepcopy_problem(original_problem); - if (rescale_info->scaled_problem == NULL) - { - fprintf(stderr, - "Failed to create a copy of the problem. Aborting rescale.\n"); - return NULL; - } - int num_cons = original_problem->num_constraints; - int num_vars = original_problem->num_variables; - - rescale_info->con_rescale = safe_malloc(num_cons * sizeof(double)); - rescale_info->var_rescale = safe_malloc(num_vars * sizeof(double)); - for (int i = 0; i < num_cons; ++i) - rescale_info->con_rescale[i] = 1.0; - for (int i = 0; i < num_vars; ++i) - rescale_info->var_rescale[i] = 1.0; - if (params->l_inf_ruiz_iterations > 0) - { - ruiz_rescaling(rescale_info->scaled_problem, params->l_inf_ruiz_iterations, - rescale_info->con_rescale, rescale_info->var_rescale); - } - if (params->has_pock_chambolle_alpha) - { - pock_chambolle_rescaling( - rescale_info->scaled_problem, params->pock_chambolle_alpha, - rescale_info->con_rescale, rescale_info->var_rescale); - } - if (params->bound_objective_rescaling) - { - double bound_norm_sq = 0.0; - for (int i = 0; i < num_cons; ++i) - { - if (isfinite(rescale_info->scaled_problem->constraint_lower_bound[i]) && - (rescale_info->scaled_problem->constraint_lower_bound[i] != - rescale_info->scaled_problem->constraint_upper_bound[i])) - { - bound_norm_sq += - rescale_info->scaled_problem->constraint_lower_bound[i] * - rescale_info->scaled_problem->constraint_lower_bound[i]; - } - if (isfinite(rescale_info->scaled_problem->constraint_upper_bound[i])) - { - bound_norm_sq += - rescale_info->scaled_problem->constraint_upper_bound[i] * - rescale_info->scaled_problem->constraint_upper_bound[i]; - } - } - - double obj_norm_sq = 0.0; - for (int i = 0; i < num_vars; ++i) - { - obj_norm_sq += rescale_info->scaled_problem->objective_vector[i] * - rescale_info->scaled_problem->objective_vector[i]; - } - - rescale_info->con_bound_rescale = 1.0 / (sqrt(bound_norm_sq) + 1.0); - rescale_info->obj_vec_rescale = 1.0 / (sqrt(obj_norm_sq) + 1.0); - - for (int i = 0; i < num_cons; ++i) - { - rescale_info->scaled_problem->constraint_lower_bound[i] *= - rescale_info->con_bound_rescale; - rescale_info->scaled_problem->constraint_upper_bound[i] *= - rescale_info->con_bound_rescale; - } - for (int i = 0; i < num_vars; ++i) - { - rescale_info->scaled_problem->variable_lower_bound[i] *= - rescale_info->con_bound_rescale; - rescale_info->scaled_problem->variable_upper_bound[i] *= - rescale_info->con_bound_rescale; - rescale_info->scaled_problem->objective_vector[i] *= - rescale_info->obj_vec_rescale; - } - } - else - { - rescale_info->con_bound_rescale = 1.0; - rescale_info->obj_vec_rescale = 1.0; - } - rescale_info->rescaling_time_sec = - (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; - return rescale_info; -} \ No newline at end of file diff --git a/src/preconditioner.cu b/src/preconditioner.cu new file mode 100644 index 0000000..0151334 --- /dev/null +++ b/src/preconditioner.cu @@ -0,0 +1,519 @@ +/* +Copyright 2025 Haihao Lu + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "preconditioner.h" +#include "utils.h" +#include +#include +#include +#include +#include + +#define SCALING_EPSILON 1e-12 + +__global__ void scale_variables_kernel(double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ initial_primal_solution, + const double *__restrict__ variable_rescaling, + const double *__restrict__ inverse_variable_rescaling, + int num_variables); +__global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ initial_dual_solution, + const double *__restrict__ constraint_rescaling, + const double *__restrict__ inverse_constraint_rescaling, + int num_constraints); +__global__ void scale_csr_nnz_kernel(const int *__restrict__ constraint_row_ind, + const int *__restrict__ constraint_col_ind, + double *__restrict__ constraint_matrix_val, + double *__restrict__ constraint_matrix_transpose_val, + const int *__restrict__ constraint_to_transpose_position, + const double *__restrict__ inverse_variable_rescaling, + const double *__restrict__ inverse_constraint_rescaling, + int nnz); +__global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ matrix_vals, + int num_rows, + double *__restrict__ row_absmax_values); +__global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ matrix_vals, + int num_rows, + double degree, + double *__restrict__ row_powsum_values); +__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors, + double *__restrict__ inverse_scaling_factors, + double *__restrict__ cumulative_rescaling, + int num_variables); +__global__ void reduce_bound_norm_sq_kernel( + const double *__restrict__ constraint_lower_bound, + const double *__restrict__ constraint_upper_bound, + int num_constraints, + double *__restrict__ block_sums); +__global__ void scale_bounds_kernel( + double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ initial_dual_solution, + int num_constraints, + double constraint_scale, + double objective_scale); +__global__ void scale_objective_kernel( + double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ initial_primal_solution, + int num_variables, + double constraint_scale, + double objective_scale); +__global__ void fill_ones_kernel(double *__restrict__ x, int num_variables); +static void scale_problem(pdhg_solver_state_t *state, double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling); +static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, + double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling); +static void pock_chambolle_rescaling(pdhg_solver_state_t *state, const double alpha, rescale_info_t *rescale_info, + double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling); +static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info); + +static void scale_problem( + pdhg_solver_state_t *state, + double *constraint_rescaling, + double *variable_rescaling, + double *inverse_constraint_rescaling, + double *inverse_variable_rescaling) +{ + int num_variables = state->num_variables; + int num_constraints = state->num_constraints; + + scale_variables_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->objective_vector, + state->variable_lower_bound, + state->variable_upper_bound, + state->initial_primal_solution, + variable_rescaling, + inverse_variable_rescaling, + num_variables); + + scale_constraints_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_lower_bound, + state->constraint_upper_bound, + state->initial_dual_solution, + constraint_rescaling, + inverse_constraint_rescaling, + num_constraints); + + scale_csr_nnz_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ind, + state->constraint_matrix->col_ind, + state->constraint_matrix->val, + state->constraint_matrix_t->val, + state->constraint_matrix->transpose_map, + inverse_variable_rescaling, + inverse_constraint_rescaling, + state->constraint_matrix->num_nonzeros); +} + +static void ruiz_rescaling( + pdhg_solver_state_t *state, + int num_iterations, + rescale_info_t *rescale_info, + double *constraint_rescaling, + double *variable_rescaling, + double *inverse_constraint_rescaling, + double *inverse_variable_rescaling) +{ + const int num_constraints = state->num_constraints; + const int num_variables = state->num_variables; + + for (int iter = 0; iter < num_iterations; ++iter) + { + compute_csr_row_absmax_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ptr, + state->constraint_matrix->val, + num_constraints, + constraint_rescaling); + clamp_sqrt_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + constraint_rescaling, + inverse_constraint_rescaling, + rescale_info->con_rescale, + num_constraints); + + compute_csr_row_absmax_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->constraint_matrix_t->row_ptr, + state->constraint_matrix_t->val, + num_variables, + variable_rescaling); + clamp_sqrt_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + variable_rescaling, + inverse_variable_rescaling, + rescale_info->var_rescale, + num_variables); + + scale_problem(state, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); + } +} + +static void pock_chambolle_rescaling( + pdhg_solver_state_t *state, + const double alpha, + rescale_info_t *rescale_info, + double *constraint_rescaling, + double *variable_rescaling, + double *inverse_constraint_rescaling, + double *inverse_variable_rescaling) +{ + const int num_constraints = state->num_constraints; + const int num_variables = state->num_variables; + + compute_csr_row_powsum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ptr, + state->constraint_matrix->val, + num_constraints, + alpha, + constraint_rescaling); + clamp_sqrt_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + constraint_rescaling, + inverse_constraint_rescaling, + rescale_info->con_rescale, + num_constraints); + + compute_csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->constraint_matrix_t->row_ptr, + state->constraint_matrix_t->val, + num_variables, + 2.0 - alpha, + variable_rescaling); + clamp_sqrt_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + variable_rescaling, + inverse_variable_rescaling, + rescale_info->var_rescale, + num_variables); + + scale_problem(state, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); +} + +static void bound_objective_rescaling( + pdhg_solver_state_t *state, + rescale_info_t *rescale_info) +{ + const int num_constraints = state->num_constraints; + const int num_variables = state->num_variables; + + double *bnd_norm_sq_d = NULL; + CUDA_CHECK(cudaMalloc(&bnd_norm_sq_d, sizeof(double))); + CUDA_CHECK(cudaMemset(bnd_norm_sq_d, 0, sizeof(double))); + reduce_bound_norm_sq_kernel<<<1, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(double)>>>( + state->constraint_lower_bound, + state->constraint_upper_bound, + num_constraints, + bnd_norm_sq_d); + + double bnd_norm_sq_h = 0.0; + CUDA_CHECK(cudaMemcpy(&bnd_norm_sq_h, bnd_norm_sq_d, sizeof(double), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaFree(bnd_norm_sq_d)); + double bnd_norm = sqrt(bnd_norm_sq_h); + + double obj_norm = 0.0; + CUBLAS_CHECK(cublasDnrm2(state->blas_handle, + state->num_variables, + state->objective_vector, 1, + &obj_norm)); + + double constraint_scale = 1.0 / (bnd_norm + 1.0); + double objective_scale = 1.0 / (obj_norm + 1.0); + + scale_bounds_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_lower_bound, + state->constraint_upper_bound, + state->initial_dual_solution, + num_constraints, + constraint_scale, + objective_scale); + + scale_objective_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->objective_vector, + state->variable_lower_bound, + state->variable_upper_bound, + state->initial_primal_solution, + num_variables, + constraint_scale, + objective_scale); + + rescale_info->con_bound_rescale = constraint_scale; + rescale_info->obj_vec_rescale = objective_scale; +} + +rescale_info_t *rescale_problem( + const pdhg_parameters_t *params, + pdhg_solver_state_t *state) +{ + printf("[Precondition] Start\n"); + + int num_variables = state->num_variables; + int num_constraints = state->num_constraints; + + clock_t start_rescaling = clock(); + rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); + CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, num_variables * sizeof(double))); + fill_ones_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + rescale_info->con_rescale, num_constraints); + fill_ones_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + rescale_info->var_rescale, num_variables); + + double *constraint_rescaling = NULL, *variable_rescaling = NULL, *inverse_constraint_rescaling = NULL, *inverse_variable_rescaling = NULL; + CUDA_CHECK(cudaMalloc(&constraint_rescaling, num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&variable_rescaling, num_variables * sizeof(double))); + CUDA_CHECK(cudaMalloc(&inverse_constraint_rescaling, num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&inverse_variable_rescaling, num_variables * sizeof(double))); + + if (params->l_inf_ruiz_iterations > 0) + { + printf("[Precondition] Ruiz scaling (%d iterations)\n", params->l_inf_ruiz_iterations); + ruiz_rescaling(state, params->l_inf_ruiz_iterations, rescale_info, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); + } + if (params->has_pock_chambolle_alpha) + { + printf("[Precondition] Pock-Chambolle scaling (alpha=%.4f)\n", params->pock_chambolle_alpha); + pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); + } + + rescale_info->con_bound_rescale = 1.0; + rescale_info->obj_vec_rescale = 1.0; + if (params->bound_objective_rescaling) + { + printf("[Precondition] Bound-objective scaling\n"); + bound_objective_rescaling(state, rescale_info); + } + + rescale_info->rescaling_time_sec = (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; + + CUDA_CHECK(cudaFree(constraint_rescaling)); + CUDA_CHECK(cudaFree(variable_rescaling)); + CUDA_CHECK(cudaFree(inverse_constraint_rescaling)); + CUDA_CHECK(cudaFree(inverse_variable_rescaling)); + + return rescale_info; +} + +__global__ void scale_variables_kernel(double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ initial_primal_solution, + const double *__restrict__ variable_rescaling, + const double *__restrict__ inverse_variable_rescaling, + int num_variables) +{ + int j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= num_variables) + return; + double dj = variable_rescaling[j]; + double inv_dj = inverse_variable_rescaling[j]; + objective_vector[j] *= inv_dj; + variable_lower_bound[j] *= dj; + variable_upper_bound[j] *= dj; + initial_primal_solution[j] *= dj; +} + +__global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ initial_dual_solution, + const double *__restrict__ constraint_rescaling, + const double *__restrict__ inverse_constraint_rescaling, + int num_constraints) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_constraints) + return; + double inv_ei = inverse_constraint_rescaling[i]; + double ei = constraint_rescaling[i]; + constraint_lower_bound[i] *= inv_ei; + constraint_upper_bound[i] *= inv_ei; + initial_dual_solution[i] *= ei; +} + +__global__ void scale_csr_nnz_kernel(const int *__restrict__ constraint_row_ind, + const int *__restrict__ constraint_col_ind, + double *__restrict__ constraint_matrix_val, + double *__restrict__ constraint_matrix_transpose_val, + const int *__restrict__ constraint_to_transpose_position, + const double *__restrict__ inverse_variable_rescaling, + const double *__restrict__ inverse_constraint_rescaling, + int nnz) +{ + for (int k = blockIdx.x * blockDim.x + threadIdx.x; + k < nnz; + k += gridDim.x * blockDim.x) + { + int i = constraint_row_ind[k]; + int j = constraint_col_ind[k]; + double scale = inverse_variable_rescaling[j] * inverse_constraint_rescaling[i]; + constraint_matrix_val[k] *= scale; + constraint_matrix_transpose_val[constraint_to_transpose_position[k]] *= scale; + } +} + +__global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ matrix_vals, + int num_rows, + double *__restrict__ row_absmax_values) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) + return; + int s = row_ptr[i], e = row_ptr[i + 1]; + double m = 0.0; + for (int k = s; k < e; ++k) + { + double v = fabs(matrix_vals[k]); + if (!isfinite(v)) + v = 0.0; + if (v > m) + m = v; + } + row_absmax_values[i] = m; +} + +__device__ __forceinline__ double pow_fast(double v, double p) +{ + if (p == 2.0) + return v * v; + if (p == 1.0) + return v; + if (p == 0.5) + return sqrt(v); + return pow(v, p); +} + +__global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ matrix_vals, + int num_rows, + double degree, + double *__restrict__ row_powsum_values) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) + return; + int s = row_ptr[i], e = row_ptr[i + 1]; + double acc = 0.0; + for (int k = s; k < e; ++k) + { + double v = fabs(matrix_vals[k]); + if (!isfinite(v)) + v = 0.0; + acc += pow_fast(v, degree); + } + row_powsum_values[i] = acc; +} + +__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors, + double *__restrict__ inverse_scaling_factors, + double *__restrict__ cumulative_rescaling, + int num_variables) +{ + for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < num_variables; t += blockDim.x * gridDim.x) + { + double v = scaling_factors[t]; + double s = (v < SCALING_EPSILON) ? 1.0 : sqrt(v); + cumulative_rescaling[t] *= s; + scaling_factors[t] = s; + inverse_scaling_factors[t] = 1.0 / s; + } +} + +__global__ void reduce_bound_norm_sq_kernel( + const double *__restrict__ constraint_lower_bound, + const double *__restrict__ constraint_upper_bound, + int num_constraints, + double *__restrict__ block_sums) +{ + extern __shared__ double sdata[]; + int tid = threadIdx.x; + int global_tid = blockIdx.x * blockDim.x + tid; + int stride = blockDim.x * gridDim.x; + + double acc = 0.0; + for (int i = global_tid; i < num_constraints; i += stride) + { + double Li = constraint_lower_bound[i], Ui = constraint_upper_bound[i]; + bool fL = isfinite(Li), fU = isfinite(Ui); + + if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) + { + acc += Li * Li; + } + if (fU) + { + acc += Ui * Ui; + } + } + + sdata[tid] = acc; + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) + { + if (tid < s) + { + sdata[tid] += sdata[tid + s]; + } + __syncthreads(); + } + + if (tid == 0) + { + block_sums[blockIdx.x] = sdata[0]; + } +} + +__global__ void scale_bounds_kernel( + double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ initial_dual_solution, + int num_constraints, + double constraint_scale, + double objective_scale) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_constraints) + return; + constraint_lower_bound[i] *= constraint_scale; + constraint_upper_bound[i] *= constraint_scale; + initial_dual_solution[i] *= objective_scale; +} + +__global__ void scale_objective_kernel( + double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ initial_primal_solution, + int num_variables, + double constraint_scale, + double objective_scale) +{ + int j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= num_variables) + return; + variable_lower_bound[j] *= constraint_scale; + variable_upper_bound[j] *= constraint_scale; + objective_vector[j] *= objective_scale; + initial_primal_solution[j] *= constraint_scale; +} + +__global__ void fill_ones_kernel(double *__restrict__ x, int num_variables) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < num_variables) + x[i] = 1.0; +} diff --git a/src/solver.cu b/src/solver.cu index a9e5f2b..8a3cdea 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -27,28 +27,63 @@ limitations under the License. #include #include -__global__ void compute_next_pdhg_primal_solution_kernel( - const double *current_primal, double *reflected_primal, - const double *dual_product, const double *objective, const double *var_lb, - const double *var_ub, int n, double step_size); -__global__ void compute_next_pdhg_primal_solution_major_kernel( - const double *current_primal, double *pdhg_primal, double *reflected_primal, - const double *dual_product, const double *objective, const double *var_lb, - const double *var_ub, int n, double step_size, double *dual_slack); -__global__ void compute_next_pdhg_dual_solution_kernel( - const double *current_dual, double *reflected_dual, - const double *primal_product, const double *const_lb, - const double *const_ub, int n, double step_size); -__global__ void compute_next_pdhg_dual_solution_major_kernel( - const double *current_dual, double *pdhg_dual, double *reflected_dual, - const double *primal_product, const double *const_lb, - const double *const_ub, int n, double step_size); -__global__ void -halpern_update_kernel(const double *initial_primal, double *current_primal, - const double *reflected_primal, - const double *initial_dual, double *current_dual, - const double *reflected_dual, int n_vars, int n_cons, - double weight, double reflection_coeff); +__global__ void build_row_ind(const int *__restrict__ row_ptr, + int num_rows, + int *__restrict__ row_ind); +__global__ void build_transpose_map(const int *__restrict__ A_row_ind, + const int *__restrict__ A_col_ind, + const int *__restrict__ At_row_ptr, + const int *__restrict__ At_col_ind, + int nnz, + int *__restrict__ A_to_At); +__global__ void fill_finite_bounds_kernel(const double *__restrict__ lower_bound, + const double *__restrict__ upper_bound, + double *__restrict__ lower_bound_finite_val, + double *__restrict__ upper_bound_finite_val, + int num_elements); +__global__ void compute_next_pdhg_primal_solution_kernel(const double *current_primal, + double *reflected_primal, + const double *dual_product, + const double *objective, + const double *var_lb, + const double *var_ub, + int n, + double step_size); +__global__ void compute_next_pdhg_primal_solution_major_kernel(const double *current_primal, + double *pdhg_primal, + double *reflected_primal, + const double *dual_product, + const double *objective, + const double *var_lb, + const double *var_ub, + int n, + double step_size, + double *dual_slack); +__global__ void compute_next_pdhg_dual_solution_kernel(const double *current_dual, + double *reflected_dual, + const double *primal_product, + const double *const_lb, + const double *const_ub, + int n, + double step_size); +__global__ void compute_next_pdhg_dual_solution_major_kernel(const double *current_dual, + double *pdhg_dual, + double *reflected_dual, + const double *primal_product, + const double *const_lb, + const double *const_ub, + int n, + double step_size); +__global__ void halpern_update_kernel(const double *initial_primal, + double *current_primal, + const double *reflected_primal, + const double *initial_dual, + double *current_dual, + const double *reflected_dual, + int n_vars, + int n_cons, + double weight, + double reflection_coeff); __global__ void rescale_solution_kernel(double *primal_solution, double *dual_solution, const double *variable_rescaling, @@ -56,29 +91,27 @@ __global__ void rescale_solution_kernel(double *primal_solution, const double objective_vector_rescaling, const double constraint_bound_rescaling, int n_vars, int n_cons); -__global__ void compute_delta_solution_kernel( - const double *initial_primal, const double *pdhg_primal, - double *delta_primal, const double *initial_dual, const double *pdhg_dual, - double *delta_dual, int n_vars, int n_cons); +__global__ void compute_delta_solution_kernel(const double *initial_primal, + const double *pdhg_primal, + double *delta_primal, + const double *initial_dual, + const double *pdhg_dual, + double *delta_dual, + int n_vars, + int n_cons); static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state); static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state); -static void halpern_update(pdhg_solver_state_t *state, - double reflection_coefficient); +static void halpern_update(pdhg_solver_state_t *state, double reflection_coefficient); static void rescale_solution(pdhg_solver_state_t *state); static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state); -static void perform_restart(pdhg_solver_state_t *state, - const pdhg_parameters_t *params); -static void -initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, - const pdhg_parameters_t *params); -static pdhg_solver_state_t * -initialize_solver_state(const lp_problem_t *original_problem, - const rescale_info_t *rescale_info); +static void perform_restart(pdhg_solver_state_t *state, const pdhg_parameters_t *params); +static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, const pdhg_parameters_t *params); +static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *original_problem, const pdhg_parameters_t *params); static void compute_fixed_point_error(pdhg_solver_state_t *state); void pdhg_solver_state_free(pdhg_solver_state_t *state); void rescale_info_free(rescale_info_t *info); -static void perform_primal_restart(pdhg_solver_state_t *state); +static void perform_primal_restart(pdhg_solver_state_t *state); static void perform_dual_restart(pdhg_solver_state_t *state); void primal_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state); void dual_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state); @@ -92,16 +125,12 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( static pdhg_solver_state_t *initialize_dual_feas_polish_state( const pdhg_solver_state_t *original_state); - cupdlpx_result_t *optimize(const pdhg_parameters_t *params, const lp_problem_t *original_problem) { print_initial_info(params, original_problem); - rescale_info_t *rescale_info = rescale_problem(params, original_problem); - pdhg_solver_state_t *state = - initialize_solver_state(original_problem, rescale_info); + pdhg_solver_state_t *state = initialize_solver_state(original_problem, params); - rescale_info_free(rescale_info); initialize_step_size_and_primal_weight(state, params); clock_t start_time = clock(); bool do_restart = false; @@ -157,8 +186,8 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, pdhg_final_log(state, params->verbose, state->termination_reason); - if (params->feasibility_polishing && - state->termination_reason != TERMINATION_REASON_DUAL_INFEASIBLE && + if (params->feasibility_polishing && + state->termination_reason != TERMINATION_REASON_DUAL_INFEASIBLE && state->termination_reason != TERMINATION_REASON_PRIMAL_INFEASIBLE) { feasibility_polish(params, state); @@ -169,15 +198,15 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, return results; } -static pdhg_solver_state_t * -initialize_solver_state(const lp_problem_t *original_problem, - const rescale_info_t *rescale_info) +static pdhg_solver_state_t *initialize_solver_state( + const lp_problem_t *original_problem, + const pdhg_parameters_t *params) { - pdhg_solver_state_t *state = - (pdhg_solver_state_t *)safe_calloc(1, sizeof(pdhg_solver_state_t)); + pdhg_solver_state_t *state = (pdhg_solver_state_t *)safe_calloc(1, sizeof(pdhg_solver_state_t)); int n_vars = original_problem->num_variables; int n_cons = original_problem->num_constraints; + int nnz = original_problem->constraint_matrix_num_nonzeros; size_t var_bytes = n_vars * sizeof(double); size_t con_bytes = n_cons * sizeof(double); @@ -185,99 +214,85 @@ initialize_solver_state(const lp_problem_t *original_problem, state->num_constraints = n_cons; state->objective_constant = original_problem->objective_constant; - state->constraint_matrix = - (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); - state->constraint_matrix_t = - (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); + state->constraint_matrix = (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); + state->constraint_matrix_t = (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); state->constraint_matrix->num_rows = n_cons; state->constraint_matrix->num_cols = n_vars; - state->constraint_matrix->num_nonzeros = - original_problem->constraint_matrix_num_nonzeros; + state->constraint_matrix->num_nonzeros = original_problem->constraint_matrix_num_nonzeros; state->constraint_matrix_t->num_rows = n_vars; state->constraint_matrix_t->num_cols = n_cons; - state->constraint_matrix_t->num_nonzeros = - original_problem->constraint_matrix_num_nonzeros; + state->constraint_matrix_t->num_nonzeros = original_problem->constraint_matrix_num_nonzeros; state->termination_reason = TERMINATION_REASON_UNSPECIFIED; - state->rescaling_time_sec = rescale_info->rescaling_time_sec; + state->num_blocks_primal = (state->num_variables + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + state->num_blocks_dual = (state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + state->num_blocks_primal_dual = (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + state->num_blocks_nnz = (state->constraint_matrix->num_nonzeros + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + CUSPARSE_CHECK(cusparseCreate(&state->sparse_handle)); + CUBLAS_CHECK(cublasCreate(&state->blas_handle)); + CUBLAS_CHECK(cublasSetPointerMode(state->blas_handle, CUBLAS_POINTER_MODE_HOST)); #define ALLOC_AND_COPY(dest, src, bytes) \ CUDA_CHECK(cudaMalloc(&dest, bytes)); \ CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyHostToDevice)); - ALLOC_AND_COPY(state->constraint_matrix->row_ptr, - rescale_info->scaled_problem->constraint_matrix_row_pointers, - (n_cons + 1) * sizeof(int)); - ALLOC_AND_COPY(state->constraint_matrix->col_ind, - rescale_info->scaled_problem->constraint_matrix_col_indices, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(int)); - ALLOC_AND_COPY(state->constraint_matrix->val, - rescale_info->scaled_problem->constraint_matrix_values, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(double)); - - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, - (n_vars + 1) * sizeof(int))); - CUDA_CHECK( - cudaMalloc(&state->constraint_matrix_t->col_ind, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(int))); - CUDA_CHECK( - cudaMalloc(&state->constraint_matrix_t->val, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(double))); + ALLOC_AND_COPY(state->constraint_matrix->row_ptr, original_problem->constraint_matrix_row_pointers, (n_cons + 1) * sizeof(int)); + ALLOC_AND_COPY(state->constraint_matrix->col_ind, original_problem->constraint_matrix_col_indices, original_problem->constraint_matrix_num_nonzeros * sizeof(int)); + ALLOC_AND_COPY(state->constraint_matrix->val, original_problem->constraint_matrix_values, original_problem->constraint_matrix_num_nonzeros * sizeof(double)); - CUSPARSE_CHECK(cusparseCreate(&state->sparse_handle)); - CUBLAS_CHECK(cublasCreate(&state->blas_handle)); - CUBLAS_CHECK( - cublasSetPointerMode(state->blas_handle, CUBLAS_POINTER_MODE_HOST)); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix->row_ind, nnz * sizeof(int))); + build_row_ind<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ptr, n_cons, state->constraint_matrix->row_ind); + CUDA_CHECK(cudaGetLastError()); + + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, (n_vars + 1) * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind, original_problem->constraint_matrix_num_nonzeros * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->val, original_problem->constraint_matrix_num_nonzeros * sizeof(double))); size_t buffer_size = 0; void *buffer = nullptr; CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize( - state->sparse_handle, state->constraint_matrix->num_rows, - state->constraint_matrix->num_cols, - state->constraint_matrix->num_nonzeros, state->constraint_matrix->val, - state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, - state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, - state->constraint_matrix_t->col_ind, CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size)); + state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, + state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, + state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, + CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size)); CUDA_CHECK(cudaMalloc(&buffer, buffer_size)); CUSPARSE_CHECK(cusparseCsr2cscEx2( - state->sparse_handle, state->constraint_matrix->num_rows, - state->constraint_matrix->num_cols, - state->constraint_matrix->num_nonzeros, state->constraint_matrix->val, - state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, - state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, - state->constraint_matrix_t->col_ind, CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer)); + state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, + state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, + state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, + CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer)); CUDA_CHECK(cudaFree(buffer)); - ALLOC_AND_COPY(state->variable_lower_bound, - rescale_info->scaled_problem->variable_lower_bound, var_bytes); - ALLOC_AND_COPY(state->variable_upper_bound, - rescale_info->scaled_problem->variable_upper_bound, var_bytes); - ALLOC_AND_COPY(state->objective_vector, - rescale_info->scaled_problem->objective_vector, var_bytes); - ALLOC_AND_COPY(state->constraint_lower_bound, - rescale_info->scaled_problem->constraint_lower_bound, - con_bytes); - ALLOC_AND_COPY(state->constraint_upper_bound, - rescale_info->scaled_problem->constraint_upper_bound, - con_bytes); - ALLOC_AND_COPY(state->constraint_rescaling, rescale_info->con_rescale, - con_bytes); - ALLOC_AND_COPY(state->variable_rescaling, rescale_info->var_rescale, - var_bytes); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ind, nnz * sizeof(int))); + build_row_ind<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->constraint_matrix_t->row_ptr, n_vars, state->constraint_matrix_t->row_ind); + CUDA_CHECK(cudaGetLastError()); - state->constraint_bound_rescaling = rescale_info->con_bound_rescale; - state->objective_vector_rescaling = rescale_info->obj_vec_rescale; + CUDA_CHECK(cudaMalloc(&state->constraint_matrix->transpose_map, nnz * sizeof(int))); + state->constraint_matrix_t->transpose_map = NULL; + build_transpose_map<<num_blocks_nnz, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ind, + state->constraint_matrix->col_ind, + state->constraint_matrix_t->row_ptr, + state->constraint_matrix_t->col_ind, + nnz, + state->constraint_matrix->transpose_map); + CUDA_CHECK(cudaGetLastError()); + + ALLOC_AND_COPY(state->variable_lower_bound, original_problem->variable_lower_bound, var_bytes); + ALLOC_AND_COPY(state->variable_upper_bound, original_problem->variable_upper_bound, var_bytes); + ALLOC_AND_COPY(state->objective_vector, original_problem->objective_vector, var_bytes); + ALLOC_AND_COPY(state->constraint_lower_bound, original_problem->constraint_lower_bound, con_bytes); + ALLOC_AND_COPY(state->constraint_upper_bound, original_problem->constraint_upper_bound, con_bytes); #define ALLOC_ZERO(dest, bytes) \ CUDA_CHECK(cudaMalloc(&dest, bytes)); \ @@ -303,91 +318,77 @@ initialize_solver_state(const lp_problem_t *original_problem, if (original_problem->primal_start) { - double *rescaled = (double *)safe_malloc(var_bytes); - for (int i = 0; i < n_vars; ++i) - rescaled[i] = original_problem->primal_start[i] * - rescale_info->var_rescale[i] * - rescale_info->con_bound_rescale; - CUDA_CHECK(cudaMemcpy(state->initial_primal_solution, rescaled, var_bytes, - cudaMemcpyHostToDevice)); - free(rescaled); + CUDA_CHECK(cudaMemcpy(state->initial_primal_solution, + original_problem->primal_start, + var_bytes, cudaMemcpyHostToDevice)); } if (original_problem->dual_start) { - double *rescaled = (double *)safe_malloc(con_bytes); - for (int i = 0; i < n_cons; ++i) - rescaled[i] = original_problem->dual_start[i] * - rescale_info->con_rescale[i] * - rescale_info->obj_vec_rescale; - CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, rescaled, con_bytes, - cudaMemcpyHostToDevice)); - free(rescaled); + CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, + original_problem->dual_start, + con_bytes, cudaMemcpyHostToDevice)); } - double *temp_host = (double *)safe_malloc(fmax(var_bytes, con_bytes)); - for (int i = 0; i < n_cons; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->constraint_lower_bound[i]) - ? rescale_info->scaled_problem->constraint_lower_bound[i] - : 0.0; - ALLOC_AND_COPY(state->constraint_lower_bound_finite_val, temp_host, - con_bytes); - for (int i = 0; i < n_cons; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->constraint_upper_bound[i]) - ? rescale_info->scaled_problem->constraint_upper_bound[i] - : 0.0; - ALLOC_AND_COPY(state->constraint_upper_bound_finite_val, temp_host, - con_bytes); - for (int i = 0; i < n_vars; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->variable_lower_bound[i]) - ? rescale_info->scaled_problem->variable_lower_bound[i] - : 0.0; - ALLOC_AND_COPY(state->variable_lower_bound_finite_val, temp_host, var_bytes); - for (int i = 0; i < n_vars; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->variable_upper_bound[i]) - ? rescale_info->scaled_problem->variable_upper_bound[i] - : 0.0; - ALLOC_AND_COPY(state->variable_upper_bound_finite_val, temp_host, var_bytes); - free(temp_host); + rescale_info_t *rescale_info = rescale_problem(params, state); - double sum_of_squares = 0.0; + state->constraint_rescaling = rescale_info->con_rescale; + state->variable_rescaling = rescale_info->var_rescale; + state->constraint_bound_rescaling = rescale_info->con_bound_rescale; + state->objective_vector_rescaling = rescale_info->obj_vec_rescale; + state->rescaling_time_sec = rescale_info->rescaling_time_sec; + + rescale_info->con_rescale = NULL; + rescale_info->var_rescale = NULL; + rescale_info_free(rescale_info); + + CUDA_CHECK(cudaMalloc(&state->constraint_lower_bound_finite_val, con_bytes)); + CUDA_CHECK(cudaMalloc(&state->constraint_upper_bound_finite_val, con_bytes)); + CUDA_CHECK(cudaMalloc(&state->variable_lower_bound_finite_val, var_bytes)); + CUDA_CHECK(cudaMalloc(&state->variable_upper_bound_finite_val, var_bytes)); + + fill_finite_bounds_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_lower_bound, + state->constraint_upper_bound, + state->constraint_lower_bound_finite_val, + state->constraint_upper_bound_finite_val, + n_cons); + + fill_finite_bounds_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->variable_lower_bound, + state->variable_upper_bound, + state->variable_lower_bound_finite_val, + state->variable_upper_bound_finite_val, + n_vars); + + CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind)); + state->constraint_matrix->row_ind = NULL; + CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); + state->constraint_matrix_t->row_ind = NULL; + CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_map)); + state->constraint_matrix->transpose_map = NULL; + double sum_of_squares = 0.0; for (int i = 0; i < n_vars; ++i) { - sum_of_squares += original_problem->objective_vector[i] * - original_problem->objective_vector[i]; + sum_of_squares += original_problem->objective_vector[i] * original_problem->objective_vector[i]; } state->objective_vector_norm = sqrt(sum_of_squares); sum_of_squares = 0.0; - for (int i = 0; i < n_cons; ++i) { double lower = original_problem->constraint_lower_bound[i]; double upper = original_problem->constraint_upper_bound[i]; - if (isfinite(lower) && (lower != upper)) { sum_of_squares += lower * lower; } - if (isfinite(upper)) { sum_of_squares += upper * upper; } } - state->constraint_bound_norm = sqrt(sum_of_squares); - state->num_blocks_primal = - (state->num_variables + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - state->num_blocks_dual = - (state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - state->num_blocks_primal_dual = - (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) / - THREADS_PER_BLOCK; state->best_primal_dual_residual_gap = INFINITY; state->last_trial_fixed_point_error = INFINITY; @@ -397,82 +398,116 @@ initialize_solver_state(const lp_problem_t *original_problem, size_t primal_spmv_buffer_size; size_t dual_spmv_buffer_size; - CUSPARSE_CHECK(cusparseCreateCsr( - &state->matA, state->num_constraints, state->num_variables, - state->constraint_matrix->num_nonzeros, state->constraint_matrix->row_ptr, - state->constraint_matrix->col_ind, state->constraint_matrix->val, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F)); - + CUSPARSE_CHECK(cusparseCreateCsr(&state->matA, state->num_constraints, state->num_variables, state->constraint_matrix->num_nonzeros, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, state->constraint_matrix->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); CUDA_CHECK(cudaGetLastError()); - CUSPARSE_CHECK(cusparseCreateCsr( - &state->matAt, state->num_variables, state->num_constraints, - state->constraint_matrix_t->num_nonzeros, - state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, - state->constraint_matrix_t->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateCsr(&state->matAt, state->num_variables, state->num_constraints, state->constraint_matrix_t->num_nonzeros, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, state->constraint_matrix_t->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); CUDA_CHECK(cudaGetLastError()); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_sol, - state->num_variables, - state->pdhg_primal_solution, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_sol, - state->num_constraints, - state->pdhg_dual_solution, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, - state->num_constraints, - state->primal_product, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, - state->num_variables, state->dual_product, - CUDA_R_64F)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_sol, state->num_variables, state->pdhg_primal_solution, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_sol, state->num_constraints, state->pdhg_dual_solution, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size)); + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size)); CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_preprocess( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer)); + CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, + CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer)); CUDA_CHECK(cudaMalloc(&state->dual_spmv_buffer, dual_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_preprocess( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer)); + CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, + CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer)); - CUDA_CHECK( - cudaMalloc(&state->ones_primal_d, state->num_variables * sizeof(double))); - CUDA_CHECK( - cudaMalloc(&state->ones_dual_d, state->num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&state->ones_primal_d, state->num_variables * sizeof(double))); + CUDA_CHECK(cudaMalloc(&state->ones_dual_d, state->num_constraints * sizeof(double))); - double *ones_primal_h = - (double *)safe_malloc(state->num_variables * sizeof(double)); + double *ones_primal_h = (double *)safe_malloc(state->num_variables * sizeof(double)); for (int i = 0; i < state->num_variables; ++i) ones_primal_h[i] = 1.0; - CUDA_CHECK(cudaMemcpy(state->ones_primal_d, ones_primal_h, - state->num_variables * sizeof(double), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(state->ones_primal_d, ones_primal_h, state->num_variables * sizeof(double), cudaMemcpyHostToDevice)); free(ones_primal_h); - double *ones_dual_h = - (double *)safe_malloc(state->num_constraints * sizeof(double)); + double *ones_dual_h = (double *)safe_malloc(state->num_constraints * sizeof(double)); for (int i = 0; i < state->num_constraints; ++i) ones_dual_h[i] = 1.0; - CUDA_CHECK(cudaMemcpy(state->ones_dual_d, ones_dual_h, - state->num_constraints * sizeof(double), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(state->ones_dual_d, ones_dual_h, state->num_constraints * sizeof(double), cudaMemcpyHostToDevice)); free(ones_dual_h); return state; } +__global__ void build_row_ind(const int *__restrict__ row_ptr, + int num_rows, + int *__restrict__ row_ind) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) + return; + + int s = row_ptr[i]; + int e = row_ptr[i + 1]; + + for (int k = s; k < e; ++k) + { + row_ind[k] = i; + } +} + +__global__ void build_transpose_map( + const int *__restrict__ A_row_ind, + const int *__restrict__ A_col_ind, + const int *__restrict__ At_row_ptr, + const int *__restrict__ At_col_ind, + int nnz, + int *__restrict__ A_to_At) +{ + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k >= nnz) + return; + + int i = A_row_ind[k]; + int j = A_col_ind[k]; + + int start = At_row_ptr[j]; + int end = At_row_ptr[j + 1]; + + int pos = -1; + for (int idx = start; idx < end; ++idx) + { + if (At_col_ind[idx] == i) + { + pos = idx; + break; + } + } + + if (pos < 0) + return; + + A_to_At[k] = pos; +} + +__global__ void fill_finite_bounds_kernel( + const double *__restrict__ lb, + const double *__restrict__ ub, + double *__restrict__ lb_finite, + double *__restrict__ ub_finite, + int num_elements) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_elements) + return; + + double Li = lb[i]; + double Ui = ub[i]; + + lb_finite[i] = isfinite(Li) ? Li : 0.0; + ub_finite[i] = isfinite(Ui) ? Ui : 0.0; +} + __global__ void compute_next_pdhg_primal_solution_kernel( const double *current_primal, double *reflected_primal, const double *dual_product, const double *objective, const double *var_lb, @@ -754,9 +789,8 @@ static void perform_restart(pdhg_solver_state_t *state, state->last_trial_fixed_point_error = INFINITY; } -static void -initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, - const pdhg_parameters_t *params) +static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, + const pdhg_parameters_t *params) { double max_sv = estimate_maximum_singular_value( state->sparse_handle, state->blas_handle, state->constraint_matrix, @@ -823,6 +857,27 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) return; } + if (state->matA) + CUSPARSE_CHECK(cusparseDestroySpMat(state->matA)); + if (state->matAt) + CUSPARSE_CHECK(cusparseDestroySpMat(state->matAt)); + if (state->vec_primal_sol) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_sol)); + if (state->vec_dual_sol) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_sol)); + if (state->vec_primal_prod) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_prod)); + if (state->vec_dual_prod) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_prod)); + if (state->primal_spmv_buffer) + CUDA_CHECK(cudaFree(state->primal_spmv_buffer)); + if (state->dual_spmv_buffer) + CUDA_CHECK(cudaFree(state->dual_spmv_buffer)); + if (state->sparse_handle) + CUSPARSE_CHECK(cusparseDestroy(state->sparse_handle)); + if (state->blas_handle) + CUBLAS_CHECK(cublasDestroy(state->blas_handle)); + if (state->variable_lower_bound) CUDA_CHECK(cudaFree(state->variable_lower_bound)); if (state->variable_upper_bound) @@ -833,14 +888,22 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) CUDA_CHECK(cudaFree(state->constraint_matrix->row_ptr)); if (state->constraint_matrix->col_ind) CUDA_CHECK(cudaFree(state->constraint_matrix->col_ind)); + if (state->constraint_matrix->row_ind) + CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind)); if (state->constraint_matrix->val) CUDA_CHECK(cudaFree(state->constraint_matrix->val)); + if (state->constraint_matrix->transpose_map) + CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_map)); if (state->constraint_matrix_t->row_ptr) CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ptr)); if (state->constraint_matrix_t->col_ind) CUDA_CHECK(cudaFree(state->constraint_matrix_t->col_ind)); + if (state->constraint_matrix_t->row_ind) + CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); if (state->constraint_matrix_t->val) CUDA_CHECK(cudaFree(state->constraint_matrix_t->val)); + if (state->constraint_matrix_t->transpose_map) + CUDA_CHECK(cudaFree(state->constraint_matrix_t->transpose_map)); if (state->constraint_lower_bound) CUDA_CHECK(cudaFree(state->constraint_lower_bound)); if (state->constraint_upper_bound) @@ -894,6 +957,11 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) if (state->ones_dual_d) CUDA_CHECK(cudaFree(state->ones_dual_d)); + if (state->constraint_matrix) + free(state->constraint_matrix); + if (state->constraint_matrix_t) + free(state->constraint_matrix_t); + free(state); } @@ -904,9 +972,8 @@ void rescale_info_free(rescale_info_t *info) return; } - lp_problem_free(info->scaled_problem); - free(info->con_rescale); - free(info->var_rescale); + CUDA_CHECK(cudaFree(info->con_rescale)); + CUDA_CHECK(cudaFree(info->var_rescale)); free(info); } @@ -979,14 +1046,14 @@ void set_default_parameters(pdhg_parameters_t *params) params->restart_params.i_smooth = 0.3; } -//Feasibility Polishing +// Feasibility Polishing void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state) { clock_t start_time = clock(); if (state->relative_primal_residual < params->termination_criteria.eps_feas_polish_relative && state->relative_dual_residual < params->termination_criteria.eps_feas_polish_relative) { - + printf("Skipping feasibility polishing as the solution is already sufficiently feasible.\n"); return; } @@ -1000,7 +1067,7 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st original_primal_weight = (state->objective_vector_norm + 1.0) / (state->constraint_bound_norm + 1.0); } - //PRIMAL FEASIBILITY POLISHING + // PRIMAL FEASIBILITY POLISHING pdhg_solver_state_t *primal_state = initialize_primal_feas_polish_state(state); primal_state->primal_weight = original_primal_weight; primal_state->best_primal_weight = original_primal_weight; @@ -1016,8 +1083,8 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st state->primal_objective_value = primal_state->primal_objective_value; } state->feasibility_iteration += primal_state->total_count - 1; - - //DUAL FEASIBILITY POLISHING + + // DUAL FEASIBILITY POLISHING pdhg_solver_state_t *dual_state = initialize_dual_feas_polish_state(state); dual_state->primal_weight = original_primal_weight; dual_state->best_primal_weight = original_primal_weight; @@ -1036,7 +1103,7 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st state->objective_gap = fabs(state->primal_objective_value - state->dual_objective_value); state->relative_objective_gap = state->objective_gap / (1.0 + fabs(state->primal_objective_value) + fabs(state->dual_objective_value)); - + // FINAL LOGGING pdhg_feas_polish_final_log(primal_state, dual_state, params->verbose); primal_feas_polish_state_free(primal_state); @@ -1150,22 +1217,22 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( CUDA_CHECK(cudaMalloc(&dest, bytes)); \ CUDA_CHECK(cudaMemset(dest, 0, bytes)); - //RESET PROBLEM TO FEASIBILITY PROBLEM + // RESET PROBLEM TO FEASIBILITY PROBLEM ALLOC_ZERO(primal_state->objective_vector, num_var * sizeof(double)); primal_state->objective_constant = 0.0; -#define ALLOC_AND_COPY_DEV(dest, src, bytes) \ - CUDA_CHECK(cudaMalloc(&dest, bytes)); \ +#define ALLOC_AND_COPY_DEV(dest, src, bytes) \ + CUDA_CHECK(cudaMalloc(&dest, bytes)); \ CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice)); - //ALLOCATE AND COPY SOLUTION VECTORS + // ALLOCATE AND COPY SOLUTION VECTORS ALLOC_AND_COPY_DEV(primal_state->initial_primal_solution, original_state->initial_primal_solution, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(primal_state->current_primal_solution, original_state->current_primal_solution, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(primal_state->pdhg_primal_solution, original_state->pdhg_primal_solution, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(primal_state->reflected_primal_solution, original_state->reflected_primal_solution, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(primal_state->primal_product, original_state->primal_product, num_cons * sizeof(double)); - //ALLOC ZERO FOR OTHERS + // ALLOC ZERO FOR OTHERS ALLOC_ZERO(primal_state->initial_dual_solution, num_cons * sizeof(double)); ALLOC_ZERO(primal_state->current_dual_solution, num_cons * sizeof(double)); ALLOC_ZERO(primal_state->pdhg_dual_solution, num_cons * sizeof(double)); @@ -1179,7 +1246,7 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( ALLOC_ZERO(primal_state->delta_primal_solution, num_var * sizeof(double)); ALLOC_ZERO(primal_state->delta_dual_solution, num_cons * sizeof(double)); - //RESET SCALAR + // RESET SCALAR primal_state->primal_weight_error_sum = 0.0; primal_state->primal_weight_last_error = 0.0; primal_state->best_primal_weight = 0.0; @@ -1195,7 +1262,7 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( primal_state->cumulative_time_sec = 0.0; primal_state->best_primal_dual_residual_gap = INFINITY; - //IGNORE DUAL RESIDUAL AND OBJECTIVE GAP + // IGNORE DUAL RESIDUAL AND OBJECTIVE GAP primal_state->relative_dual_residual = 0.0; primal_state->absolute_dual_residual = 0.0; primal_state->relative_objective_gap = 0.0; @@ -1206,13 +1273,15 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( void primal_feas_polish_state_free(pdhg_solver_state_t *state) { - #define SAFE_CUDA_FREE(p) \ - if ((p) != NULL) { \ - CUDA_CHECK(cudaFree(p));\ - (p) = NULL; \ - } \ - - if (!state) return; +#define SAFE_CUDA_FREE(p) \ + if ((p) != NULL) \ + { \ + CUDA_CHECK(cudaFree(p)); \ + (p) = NULL; \ + } + + if (!state) + return; SAFE_CUDA_FREE(state->objective_vector); SAFE_CUDA_FREE(state->initial_primal_solution); SAFE_CUDA_FREE(state->current_primal_solution); @@ -1225,7 +1294,7 @@ void primal_feas_polish_state_free(pdhg_solver_state_t *state) SAFE_CUDA_FREE(state->pdhg_dual_solution); SAFE_CUDA_FREE(state->reflected_dual_solution); SAFE_CUDA_FREE(state->primal_product); - + SAFE_CUDA_FREE(state->primal_slack); SAFE_CUDA_FREE(state->dual_slack); SAFE_CUDA_FREE(state->primal_residual); @@ -1245,7 +1314,6 @@ __global__ void zero_finite_value_vectors_kernel( vec[idx] = 0.0; } } - static pdhg_solver_state_t *initialize_dual_feas_polish_state( const pdhg_solver_state_t *original_state) @@ -1255,18 +1323,18 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( int num_var = original_state->num_variables; int num_cons = original_state->num_constraints; - #define ALLOC_AND_COPY_DEV(dest, src, bytes) \ - CUDA_CHECK(cudaMalloc(&dest, bytes)); \ - CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice)); - - //RESET PROBLEM TO DUAL FEASIBILITY PROBLEM - #define SET_FINITE_TO_ZERO(vec, n) \ - { \ - int threads = 256; \ - int blocks = (n + threads - 1) / threads; \ - zero_finite_value_vectors_kernel<<>>(vec, n); \ - CUDA_CHECK(cudaDeviceSynchronize()); \ - } +#define ALLOC_AND_COPY_DEV(dest, src, bytes) \ + CUDA_CHECK(cudaMalloc(&dest, bytes)); \ + CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice)); + +// RESET PROBLEM TO DUAL FEASIBILITY PROBLEM +#define SET_FINITE_TO_ZERO(vec, n) \ + { \ + int threads = 256; \ + int blocks = (n + threads - 1) / threads; \ + zero_finite_value_vectors_kernel<<>>(vec, n); \ + CUDA_CHECK(cudaDeviceSynchronize()); \ + } ALLOC_AND_COPY_DEV(dual_state->constraint_lower_bound, original_state->constraint_lower_bound, num_cons * sizeof(double)); ALLOC_AND_COPY_DEV(dual_state->constraint_upper_bound, original_state->constraint_upper_bound, num_cons * sizeof(double)); @@ -1278,16 +1346,16 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( SET_FINITE_TO_ZERO(dual_state->variable_lower_bound, num_var); SET_FINITE_TO_ZERO(dual_state->variable_upper_bound, num_var); - #define ALLOC_ZERO(dest, bytes) \ - CUDA_CHECK(cudaMalloc(&dest, bytes)); \ - CUDA_CHECK(cudaMemset(dest, 0, bytes)); +#define ALLOC_ZERO(dest, bytes) \ + CUDA_CHECK(cudaMalloc(&dest, bytes)); \ + CUDA_CHECK(cudaMemset(dest, 0, bytes)); ALLOC_ZERO(dual_state->constraint_lower_bound_finite_val, num_cons * sizeof(double)); ALLOC_ZERO(dual_state->constraint_upper_bound_finite_val, num_cons * sizeof(double)); ALLOC_ZERO(dual_state->variable_lower_bound_finite_val, num_var * sizeof(double)); ALLOC_ZERO(dual_state->variable_upper_bound_finite_val, num_var * sizeof(double)); - //ALLOCATE AND COPY SOLUTION VECTORS + // ALLOCATE AND COPY SOLUTION VECTORS ALLOC_AND_COPY_DEV(dual_state->initial_dual_solution, original_state->initial_dual_solution, num_cons * sizeof(double)); ALLOC_AND_COPY_DEV(dual_state->current_dual_solution, original_state->current_dual_solution, num_cons * sizeof(double)); ALLOC_AND_COPY_DEV(dual_state->pdhg_dual_solution, original_state->pdhg_dual_solution, num_cons * sizeof(double)); @@ -1295,7 +1363,7 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( ALLOC_AND_COPY_DEV(dual_state->dual_product, original_state->dual_product, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(dual_state->dual_slack, original_state->dual_slack, num_var * sizeof(double)); - //ALLOC ZERO FOR OTHERS + // ALLOC ZERO FOR OTHERS ALLOC_ZERO(dual_state->initial_primal_solution, num_var * sizeof(double)); ALLOC_ZERO(dual_state->current_primal_solution, num_var * sizeof(double)); ALLOC_ZERO(dual_state->pdhg_primal_solution, num_var * sizeof(double)); @@ -1307,7 +1375,7 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( ALLOC_ZERO(dual_state->delta_primal_solution, num_var * sizeof(double)); ALLOC_ZERO(dual_state->delta_dual_solution, num_cons * sizeof(double)); - //RESET SCALAR + // RESET SCALAR dual_state->primal_weight_error_sum = 0.0; dual_state->primal_weight_last_error = 0.0; dual_state->best_primal_weight = 0.0; @@ -1323,7 +1391,7 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( dual_state->cumulative_time_sec = 0.0; dual_state->best_primal_dual_residual_gap = INFINITY; - //IGNORE PRIMAL RESIDUAL AND OBJECTIVE GAP + // IGNORE PRIMAL RESIDUAL AND OBJECTIVE GAP dual_state->relative_primal_residual = 0.0; dual_state->absolute_primal_residual = 0.0; dual_state->relative_objective_gap = 0.0; @@ -1333,13 +1401,15 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( void dual_feas_polish_state_free(pdhg_solver_state_t *state) { - #define SAFE_CUDA_FREE(p) \ - if ((p) != NULL) { \ - CUDA_CHECK(cudaFree(p));\ - (p) = NULL; \ - } \ - - if (!state) return; +#define SAFE_CUDA_FREE(p) \ + if ((p) != NULL) \ + { \ + CUDA_CHECK(cudaFree(p)); \ + (p) = NULL; \ + } + + if (!state) + return; SAFE_CUDA_FREE(state->constraint_lower_bound); SAFE_CUDA_FREE(state->constraint_upper_bound); SAFE_CUDA_FREE(state->variable_lower_bound); @@ -1360,7 +1430,7 @@ void dual_feas_polish_state_free(pdhg_solver_state_t *state) SAFE_CUDA_FREE(state->pdhg_dual_solution); SAFE_CUDA_FREE(state->reflected_dual_solution); SAFE_CUDA_FREE(state->primal_product); - + SAFE_CUDA_FREE(state->primal_slack); SAFE_CUDA_FREE(state->dual_slack); SAFE_CUDA_FREE(state->primal_residual); @@ -1439,5 +1509,3 @@ static void compute_dual_fixed_point_error(pdhg_solver_state_t *state) &dual_norm)); state->fixed_point_error = dual_norm * dual_norm / state->primal_weight; } - - diff --git a/src/utils.cu b/src/utils.cu index 172e731..5864c56 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -373,13 +373,14 @@ void pdhg_final_log(const pdhg_solver_state_t *state, bool verbose, "--------------------\n"); } printf("Solution Summary\n"); - printf(" Status : %s\n", termination_reason_to_string(reason)); - printf(" Iterations : %d\n", state->total_count - 1); - printf(" Solve time : %.3g sec\n", state->cumulative_time_sec); - printf(" Primal obj : %.10g\n", state->primal_objective_value); - printf(" Dual obj : %.10g\n", state->dual_objective_value); - printf(" Primal infeas : %.3e\n", state->relative_primal_residual); - printf(" Dual infeas : %.3e\n", state->relative_dual_residual); + printf(" Status : %s\n", termination_reason_to_string(reason)); + printf(" Precondition time : %.5g sec\n", state->rescaling_time_sec); + printf(" Iterations : %d\n", state->total_count - 1); + printf(" Solve time : %.3g sec\n", state->cumulative_time_sec); + printf(" Primal obj : %.10g\n", state->primal_objective_value); + printf(" Dual obj : %.10g\n", state->dual_objective_value); + printf(" Primal infeas : %.3e\n", state->relative_primal_residual); + printf(" Dual infeas : %.3e\n", state->relative_dual_residual); } void display_iteration_stats(const pdhg_solver_state_t *state, bool verbose) @@ -1013,7 +1014,7 @@ void print_initial_feas_polish_info(bool is_primal_polish, const pdhg_parameters printf("---------------------------------------------------------------------------------------\n"); if (is_primal_polish) printf("%s %s | %s | %s | %s \n", " iter", " time ", "pr obj", " abs pr res ", " rel pr res "); // else printf("%s %s | %s | %s \n", " iter", " time ", " abs du res ", " rel du res "); - else printf("%s %s | %s | %s | %s \n", " iter", " time ", "du obj", " abs du res ", " rel du res "); + else printf("%s %s | %s | %s | %s \n", " iter", " time ", "du obj", " abs du res ", " rel du res "); printf("---------------------------------------------------------------------------------------\n"); } @@ -1137,7 +1138,7 @@ void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_so CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer)); compute_dual_feas_polish_residual_kerenl<<num_blocks_primal_dual, THREADS_PER_BLOCK>>>( - state->dual_residual, + state->dual_residual, state->pdhg_dual_solution, state->dual_product, state->dual_slack, state->objective_vector,