From 2a31c9966e55f1db809b438391ce6015cf6e1ffd Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Fri, 14 Nov 2025 18:33:51 -0500 Subject: [PATCH 01/13] New feat: GPU precondition --- CMakeLists.txt | 2 +- internal/internal_types.h | 7 +- internal/preconditioner.h | 2 +- internal/utils.h | 2 + pyproject.toml | 2 +- src/preconditioner.c | 306 ------------------------ src/preconditioner.cu | 489 ++++++++++++++++++++++++++++++++++++++ src/solver.cu | 366 ++++++++++------------------ src/utils.cu | 34 +-- 9 files changed, 644 insertions(+), 566 deletions(-) delete mode 100644 src/preconditioner.c create mode 100644 src/preconditioner.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index e06c649..bd1a955 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) endif() # Increase compatibility by compiling for all supported real and virtual architectures. -set(CMAKE_CUDA_ARCHITECTURES all) +set(CMAKE_CUDA_ARCHITECTURES 60 70 75 80 86 89 90) # Global Compile flags (corresponding to CFLAGS/NVCCFLAGS) add_compile_options(-fPIC -O3 -Wall -Wextra -g) diff --git a/internal/internal_types.h b/internal/internal_types.h index e66600d..f0d8def 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -28,6 +28,7 @@ typedef struct int num_nonzeros; int *row_ptr; int *col_ind; + int *row_ind; double *val; } cu_sparse_matrix_csr_t; @@ -46,6 +47,7 @@ typedef struct int num_blocks_primal; int num_blocks_dual; int num_blocks_primal_dual; + int num_blocks_nnz; double objective_vector_norm; double constraint_bound_norm; double *constraint_lower_bound_finite_val; @@ -74,8 +76,6 @@ typedef struct double *constraint_rescaling; double *variable_rescaling; - double constraint_bound_rescaling; - double objective_vector_rescaling; double *primal_slack; double *dual_slack; double rescaling_time_sec; @@ -126,10 +126,7 @@ typedef struct typedef struct { - lp_problem_t *scaled_problem; double *con_rescale; double *var_rescale; - double con_bound_rescale; - double obj_vec_rescale; double rescaling_time_sec; } rescale_info_t; diff --git a/internal/preconditioner.h b/internal/preconditioner.h index 8feb664..71a5c2b 100644 --- a/internal/preconditioner.h +++ b/internal/preconditioner.h @@ -25,7 +25,7 @@ extern "C" rescale_info_t *rescale_problem( const pdhg_parameters_t *params, - const lp_problem_t *original_problem); + pdhg_solver_state_t *state); #ifdef __cplusplus } diff --git a/internal/utils.h b/internal/utils.h index 62da072..7b939ea 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -125,6 +125,8 @@ extern "C" int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, double **vals, int *nnz_out); + int* build_row_ind_from_row_ptr(int* row_ptr, int num_rows, int nnz); + #ifdef __cplusplus } diff --git a/pyproject.toml b/pyproject.toml index 9c65a6c..2a0a2c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "cupdlpx" -version = "0.1.2" +version = "0.1.3" description = "Python bindings for cuPDLPx (GPU-accelerated first-order LP solver)" readme = "README.md" license = { text = "Apache-2.0" } diff --git a/src/preconditioner.c b/src/preconditioner.c deleted file mode 100644 index 41b7542..0000000 --- a/src/preconditioner.c +++ /dev/null @@ -1,306 +0,0 @@ -/* -Copyright 2025 Haihao Lu - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#include "preconditioner.h" -#include "utils.h" -#include -#include -#include -#include - -#define SCALING_EPSILON 1e-12 - -static lp_problem_t *deepcopy_problem(const lp_problem_t *prob); -static void scale_problem(lp_problem_t *problem, const double *con_rescale, - const double *var_rescale); -static void ruiz_rescaling(lp_problem_t *problem, int num_iters, - double *cum_con_rescale, double *cum_var_rescale); -static void pock_chambolle_rescaling(lp_problem_t *problem, double alpha, - double *cum_con_rescale, - double *cum_var_rescale); - -static lp_problem_t *deepcopy_problem(const lp_problem_t *prob) -{ - lp_problem_t *new_prob = (lp_problem_t *)safe_malloc(sizeof(lp_problem_t)); - - new_prob->num_variables = prob->num_variables; - new_prob->num_constraints = prob->num_constraints; - new_prob->constraint_matrix_num_nonzeros = - prob->constraint_matrix_num_nonzeros; - new_prob->objective_constant = prob->objective_constant; - - size_t var_bytes = prob->num_variables * sizeof(double); - size_t con_bytes = prob->num_constraints * sizeof(double); - size_t nnz_bytes_val = prob->constraint_matrix_num_nonzeros * sizeof(double); - size_t nnz_bytes_col = prob->constraint_matrix_num_nonzeros * sizeof(int); - size_t row_ptr_bytes = (prob->num_constraints + 1) * sizeof(int); - - new_prob->variable_lower_bound = safe_malloc(var_bytes); - new_prob->variable_upper_bound = safe_malloc(var_bytes); - new_prob->objective_vector = safe_malloc(var_bytes); - new_prob->constraint_lower_bound = safe_malloc(con_bytes); - new_prob->constraint_upper_bound = safe_malloc(con_bytes); - new_prob->constraint_matrix_row_pointers = safe_malloc(row_ptr_bytes); - new_prob->constraint_matrix_col_indices = safe_malloc(nnz_bytes_col); - new_prob->constraint_matrix_values = safe_malloc(nnz_bytes_val); - - memcpy(new_prob->variable_lower_bound, prob->variable_lower_bound, var_bytes); - memcpy(new_prob->variable_upper_bound, prob->variable_upper_bound, var_bytes); - memcpy(new_prob->objective_vector, prob->objective_vector, var_bytes); - memcpy(new_prob->constraint_lower_bound, prob->constraint_lower_bound, - con_bytes); - memcpy(new_prob->constraint_upper_bound, prob->constraint_upper_bound, - con_bytes); - memcpy(new_prob->constraint_matrix_row_pointers, - prob->constraint_matrix_row_pointers, row_ptr_bytes); - memcpy(new_prob->constraint_matrix_col_indices, - prob->constraint_matrix_col_indices, nnz_bytes_col); - memcpy(new_prob->constraint_matrix_values, prob->constraint_matrix_values, - nnz_bytes_val); - - if (prob->primal_start) - { - new_prob->primal_start = safe_malloc(var_bytes); - memcpy(new_prob->primal_start, prob->primal_start, var_bytes); - } - else - { - new_prob->primal_start = NULL; - } - if (prob->dual_start) - { - new_prob->dual_start = safe_malloc(con_bytes); - memcpy(new_prob->dual_start, prob->dual_start, con_bytes); - } - else - { - new_prob->dual_start = NULL; - } - - return new_prob; -} - -static void scale_problem(lp_problem_t *problem, - const double *constraint_rescaling, - const double *variable_rescaling) -{ - for (int i = 0; i < problem->num_variables; ++i) - { - problem->objective_vector[i] /= 
variable_rescaling[i]; - problem->variable_upper_bound[i] *= variable_rescaling[i]; - problem->variable_lower_bound[i] *= variable_rescaling[i]; - } - for (int i = 0; i < problem->num_constraints; ++i) - { - problem->constraint_lower_bound[i] /= constraint_rescaling[i]; - problem->constraint_upper_bound[i] /= constraint_rescaling[i]; - } - - for (int row = 0; row < problem->num_constraints; ++row) - { - for (int nz_idx = problem->constraint_matrix_row_pointers[row]; - nz_idx < problem->constraint_matrix_row_pointers[row + 1]; ++nz_idx) - { - int col = problem->constraint_matrix_col_indices[nz_idx]; - problem->constraint_matrix_values[nz_idx] /= - (constraint_rescaling[row] * variable_rescaling[col]); - } - } -} - -static void ruiz_rescaling(lp_problem_t *problem, int num_iterations, - double *cum_constraint_rescaling, - double *cum_variable_rescaling) -{ - int num_cons = problem->num_constraints; - int num_vars = problem->num_variables; - double *con_rescale = safe_malloc(num_cons * sizeof(double)); - double *var_rescale = safe_malloc(num_vars * sizeof(double)); - - for (int iter = 0; iter < num_iterations; ++iter) - { - for (int i = 0; i < num_vars; ++i) - var_rescale[i] = 0.0; - for (int i = 0; i < num_cons; ++i) - con_rescale[i] = 0.0; - - for (int row = 0; row < num_cons; ++row) - { - for (int nz_idx = problem->constraint_matrix_row_pointers[row]; - nz_idx < problem->constraint_matrix_row_pointers[row + 1]; - ++nz_idx) - { - int col = problem->constraint_matrix_col_indices[nz_idx]; - if (col < 0 || col >= num_vars) - { - fprintf(stderr, - "Error: Invalid column index %d at nz_idx %d for row %d. " - "Must be in [0, %d).\n", - col, nz_idx, row, num_vars); - } - double val = fabs(problem->constraint_matrix_values[nz_idx]); - if (val > var_rescale[col]) - var_rescale[col] = val; - if (val > con_rescale[row]) - con_rescale[row] = val; - } - } - - for (int i = 0; i < num_vars; ++i) - var_rescale[i] = - (var_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(var_rescale[i]); - for (int i = 0; i < num_cons; ++i) - con_rescale[i] = - (con_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(con_rescale[i]); - - scale_problem(problem, con_rescale, var_rescale); - for (int i = 0; i < num_vars; ++i) - cum_variable_rescaling[i] *= var_rescale[i]; - for (int i = 0; i < num_cons; ++i) - cum_constraint_rescaling[i] *= con_rescale[i]; - } - free(con_rescale); - free(var_rescale); -} - -static void pock_chambolle_rescaling(lp_problem_t *problem, double alpha, - double *cum_constraint_rescaling, - double *cum_variable_rescaling) -{ - int num_cons = problem->num_constraints; - int num_vars = problem->num_variables; - double *con_rescale = safe_calloc(num_cons, sizeof(double)); - double *var_rescale = safe_calloc(num_vars, sizeof(double)); - - for (int row = 0; row < num_cons; ++row) - { - for (int nz_idx = problem->constraint_matrix_row_pointers[row]; - nz_idx < problem->constraint_matrix_row_pointers[row + 1]; ++nz_idx) - { - int col = problem->constraint_matrix_col_indices[nz_idx]; - double val = fabs(problem->constraint_matrix_values[nz_idx]); - var_rescale[col] += pow(val, 2.0 - alpha); - con_rescale[row] += pow(val, alpha); - } - } - - for (int i = 0; i < num_vars; ++i) - var_rescale[i] = - (var_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(var_rescale[i]); - for (int i = 0; i < num_cons; ++i) - con_rescale[i] = - (con_rescale[i] < SCALING_EPSILON) ? 
1.0 : sqrt(con_rescale[i]); - - scale_problem(problem, con_rescale, var_rescale); - for (int i = 0; i < num_vars; ++i) - cum_variable_rescaling[i] *= var_rescale[i]; - for (int i = 0; i < num_cons; ++i) - cum_constraint_rescaling[i] *= con_rescale[i]; - - free(con_rescale); - free(var_rescale); -} - -rescale_info_t *rescale_problem(const pdhg_parameters_t *params, - const lp_problem_t *original_problem) -{ - clock_t start_rescaling = clock(); - rescale_info_t *rescale_info = - (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); - rescale_info->scaled_problem = deepcopy_problem(original_problem); - if (rescale_info->scaled_problem == NULL) - { - fprintf(stderr, - "Failed to create a copy of the problem. Aborting rescale.\n"); - return NULL; - } - int num_cons = original_problem->num_constraints; - int num_vars = original_problem->num_variables; - - rescale_info->con_rescale = safe_malloc(num_cons * sizeof(double)); - rescale_info->var_rescale = safe_malloc(num_vars * sizeof(double)); - for (int i = 0; i < num_cons; ++i) - rescale_info->con_rescale[i] = 1.0; - for (int i = 0; i < num_vars; ++i) - rescale_info->var_rescale[i] = 1.0; - if (params->l_inf_ruiz_iterations > 0) - { - ruiz_rescaling(rescale_info->scaled_problem, params->l_inf_ruiz_iterations, - rescale_info->con_rescale, rescale_info->var_rescale); - } - if (params->has_pock_chambolle_alpha) - { - pock_chambolle_rescaling( - rescale_info->scaled_problem, params->pock_chambolle_alpha, - rescale_info->con_rescale, rescale_info->var_rescale); - } - if (params->bound_objective_rescaling) - { - double bound_norm_sq = 0.0; - for (int i = 0; i < num_cons; ++i) - { - if (isfinite(rescale_info->scaled_problem->constraint_lower_bound[i]) && - (rescale_info->scaled_problem->constraint_lower_bound[i] != - rescale_info->scaled_problem->constraint_upper_bound[i])) - { - bound_norm_sq += - rescale_info->scaled_problem->constraint_lower_bound[i] * - rescale_info->scaled_problem->constraint_lower_bound[i]; - } - if (isfinite(rescale_info->scaled_problem->constraint_upper_bound[i])) - { - bound_norm_sq += - rescale_info->scaled_problem->constraint_upper_bound[i] * - rescale_info->scaled_problem->constraint_upper_bound[i]; - } - } - - double obj_norm_sq = 0.0; - for (int i = 0; i < num_vars; ++i) - { - obj_norm_sq += rescale_info->scaled_problem->objective_vector[i] * - rescale_info->scaled_problem->objective_vector[i]; - } - - rescale_info->con_bound_rescale = 1.0 / (sqrt(bound_norm_sq) + 1.0); - rescale_info->obj_vec_rescale = 1.0 / (sqrt(obj_norm_sq) + 1.0); - - for (int i = 0; i < num_cons; ++i) - { - rescale_info->scaled_problem->constraint_lower_bound[i] *= - rescale_info->con_bound_rescale; - rescale_info->scaled_problem->constraint_upper_bound[i] *= - rescale_info->con_bound_rescale; - } - for (int i = 0; i < num_vars; ++i) - { - rescale_info->scaled_problem->variable_lower_bound[i] *= - rescale_info->con_bound_rescale; - rescale_info->scaled_problem->variable_upper_bound[i] *= - rescale_info->con_bound_rescale; - rescale_info->scaled_problem->objective_vector[i] *= - rescale_info->obj_vec_rescale; - } - } - else - { - rescale_info->con_bound_rescale = 1.0; - rescale_info->obj_vec_rescale = 1.0; - } - rescale_info->rescaling_time_sec = - (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; - return rescale_info; -} \ No newline at end of file diff --git a/src/preconditioner.cu b/src/preconditioner.cu new file mode 100644 index 0000000..e197905 --- /dev/null +++ b/src/preconditioner.cu @@ -0,0 +1,489 @@ +/* +Copyright 2025 Haihao Lu + 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "preconditioner.h" +#include "utils.h" +#include +#include +#include +#include +#include +#include + +#define SCALING_EPSILON 1e-12 + +__global__ void scale_variables_kernel(double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* __restrict__ var_ub_finite, + double* __restrict__ primal_start, + const double* __restrict__ D, + const double* __restrict__ invD, + int n); +__global__ void scale_constraints_kernel(double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + const double* __restrict__ E, + const double* __restrict__ invE, + int m); +__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, + const int* __restrict__ col_ind, + double* __restrict__ vals, + const double* __restrict__ invD, + const double* __restrict__ invE, + int nnz); +__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double* __restrict__ out_max); +__global__ void csr_col_absmax_atomic_kernel(const int* __restrict__ col_ind, + const double* __restrict__ vals, + int nnz, + unsigned long long* __restrict__ out_max_bits); +__global__ void u64bits_to_double(const unsigned long long* __restrict__ in_bits, + double* __restrict__ out_val, + int n); +__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double degree, + double* __restrict__ out_sum); +__global__ void csr_col_powsum_atomic_kernel(const int* __restrict__ col_ind, + const double* __restrict__ vals, + int nnz, + double degree, + double* __restrict__ out_sum); +__global__ void clamp_sqrt_and_accum(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + int n); +static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); +static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, + double *E, double *D, double *invE, double *invD); +static void pock_chambolle_rescaling(pdhg_solver_state_t *state, double alpha, rescale_info_t *rescale_info, + double *E, double *D, double *invE, double *invD); +static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info, + double *E, double *D, double *invE, double *invD); + +__global__ void scale_variables_kernel(double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* __restrict__ var_ub_finite, + double* __restrict__ primal_start, + const double* __restrict__ D, + const double* __restrict__ invD, + int n) +{ + int j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n) return; + double dj = D[j]; + double inv_dj = invD[j]; + c[j] *= inv_dj; + var_lb[j] *= dj; + var_ub[j] *= dj; + var_lb_finite[j] *= dj; + var_ub_finite[j] *= 
dj; + primal_start[j] *= dj; +} + +__global__ void scale_constraints_kernel(double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + const double* __restrict__ E, + const double* __restrict__ invE, + int m) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= m) return; + double inv_ei = invE[i]; + double ei = E[i]; + con_lb[i] *= inv_ei; + con_ub[i] *= inv_ei; + con_lb_finite[i] *= inv_ei; + con_ub_finite[i] *= inv_ei; + dual_start[i] *= ei; +} + +__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, + const int* __restrict__ col_ind, + double* __restrict__ vals, + const double* __restrict__ invD, + const double* __restrict__ invE, + int nnz) +{ + for (int k = blockIdx.x * blockDim.x + threadIdx.x; + k < nnz; + k += gridDim.x * blockDim.x) + { + int i = row_ids[k]; + int j = col_ind[k]; + vals[k] *= invD[j] * invE[i]; + } +} + +__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double* __restrict__ out_max) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + int s = row_ptr[i], e = row_ptr[i + 1]; + double m = 0.0; + for (int k = s; k < e; ++k) { + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + if (v > m) m = v; + } + out_max[i] = m; +} + +__global__ void csr_col_absmax_atomic_kernel(const int* __restrict__ col_ind, + const double* __restrict__ vals, + int nnz, + unsigned long long* __restrict__ out_max_bits) +{ + for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { + int j = col_ind[k]; + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + unsigned long long bits = __double_as_longlong(v); + atomicMax(&out_max_bits[j], bits); + } +} + +__global__ void u64bits_to_double(const unsigned long long* __restrict__ in_bits, + double* __restrict__ out_val, + int n) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) { + out_val[i] = __longlong_as_double(in_bits[i]); + } +} + +__device__ __forceinline__ double pow_fast(double v, double p) { + if (p == 2.0) return v * v; + if (p == 1.0) return v; + if (p == 0.5) return sqrt(v); + return pow(v, p); +} + +__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double degree, + double* __restrict__ out_sum) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + int s = row_ptr[i], e = row_ptr[i + 1]; + double acc = 0.0; + for (int k = s; k < e; ++k) { + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + acc += pow_fast(v, degree); + } + out_sum[i] = acc; +} + +__global__ void csr_col_powsum_atomic_kernel(const int* __restrict__ col_ind, + const double* __restrict__ vals, + int nnz, + double degree, + double* __restrict__ out_sum) +{ + for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { + int j = col_ind[k]; + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + double t = pow_fast(v, degree); + atomicAdd(&out_sum[j], t); + } +} + +__global__ void clamp_sqrt_and_accum(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + int n) +{ + for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n; t += blockDim.x * gridDim.x) + { + double v = x[t]; + double s = (v < SCALING_EPSILON) ? 
1.0 : sqrt(v);
+        cum[t] *= s;
+        x[t] = s;
+        inv_x[t] = 1.0 / s;
+    }
+}
+
+__global__ void reduce_bound_norm_sq_atomic(
+    const double* __restrict__ L,
+    const double* __restrict__ U,
+    int m,
+    double* __restrict__ out_sum)
+{
+    double acc = 0.0;
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) {
+        double Li = L[i], Ui = U[i];
+        bool fL = isfinite(Li), fU = isfinite(Ui);
+        if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) acc += Li * Li;
+        if (fU) acc += Ui * Ui;
+    }
+    atomicAdd(out_sum, acc);
+}
+
+static void scale_problem(
+    pdhg_solver_state_t *state,
+    double *E,
+    double *D,
+    double *invE,
+    double *invD)
+{
+    int n_vars = state->num_variables;
+    int n_cons = state->num_constraints;
+
+    scale_variables_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+        state->objective_vector,
+        state->variable_lower_bound,
+        state->variable_upper_bound,
+        state->variable_lower_bound_finite_val,
+        state->variable_upper_bound_finite_val,
+        state->initial_primal_solution,
+        D,
+        invD,
+        n_vars);
+
+    scale_constraints_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        state->constraint_lower_bound,
+        state->constraint_upper_bound,
+        state->constraint_lower_bound_finite_val,
+        state->constraint_upper_bound_finite_val,
+        state->initial_dual_solution,
+        E,
+        invE,
+        n_cons);
+
+    csr_scale_nnz_kernel<<<state->num_blocks_nnz, THREADS_PER_BLOCK>>>(
+        state->constraint_matrix->row_ind,
+        state->constraint_matrix->col_ind,
+        state->constraint_matrix->val,
+        invD,
+        invE,
+        state->constraint_matrix->num_nonzeros);
+}
+
+static void ruiz_rescaling(
+    pdhg_solver_state_t *state,
+    int num_iterations,
+    rescale_info_t *rescale_info,
+    double *E,
+    double *D,
+    double *invE,
+    double *invD)
+{
+    const int n_cons = state->num_constraints;
+    const int n_vars = state->num_variables;
+    const int nnz = state->constraint_matrix->num_nonzeros;
+
+    unsigned long long *D_bits = nullptr;
+    CUDA_CHECK(cudaMalloc(&D_bits, n_vars * sizeof(unsigned long long)));
+
+    for (int iter = 0; iter < num_iterations; ++iter)
+    {
+        csr_row_absmax_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+            state->constraint_matrix->row_ptr,
+            state->constraint_matrix->val,
+            n_cons,
+            E);
+        clamp_sqrt_and_accum<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+            E,
+            invE,
+            rescale_info->con_rescale,
+            n_cons);
+
+        CUDA_CHECK(cudaMemset(D_bits, 0, n_vars * sizeof(unsigned long long)));
+        csr_col_absmax_atomic_kernel<<<state->num_blocks_nnz, THREADS_PER_BLOCK>>>(
+            state->constraint_matrix->col_ind,
+            state->constraint_matrix->val,
+            nnz,
+            D_bits);
+        u64bits_to_double<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(D_bits, D, n_vars);
+        clamp_sqrt_and_accum<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+            D,
+            invD,
+            rescale_info->var_rescale,
+            n_vars);
+
+        scale_problem(state, E, D, invE, invD);
+    }
+
+    CUDA_CHECK(cudaFree(D_bits));
+}
+
+static void pock_chambolle_rescaling(
+    pdhg_solver_state_t *state,
+    const double alpha,
+    rescale_info_t *rescale_info,
+    double *E,
+    double *D,
+    double *invE,
+    double *invD)
+{
+    const int n_cons = state->num_constraints;
+    const int n_vars = state->num_variables;
+    const int nnz = state->constraint_matrix->num_nonzeros;
+
+    csr_row_powsum_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        state->constraint_matrix->row_ptr,
+        state->constraint_matrix->val,
+        n_cons,
+        alpha,
+        E);
+    clamp_sqrt_and_accum<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        E,
+        invE,
+        rescale_info->con_rescale,
+        n_cons);
+
+    CUDA_CHECK(cudaMemset(D, 0, n_vars * sizeof(double)));
+    csr_col_powsum_atomic_kernel<<<state->num_blocks_nnz, THREADS_PER_BLOCK>>>(
+        state->constraint_matrix->col_ind,
+        state->constraint_matrix->val,
+        nnz,
+        2.0 - alpha,
+        D);
+    clamp_sqrt_and_accum<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+        D,
+        invD,
+        rescale_info->var_rescale,
+        n_vars);
+
+    scale_problem(state, E, D, invE, invD);
+
+}
+
+static void bound_objective_rescaling(
+    pdhg_solver_state_t *state,
+    rescale_info_t *rescale_info,
+    double *E,
+    double *D,
+    double *invE,
+    double *invD
+    )
+{
+    const int n_cons = state->num_constraints;
+    const int n_vars = state->num_variables;
+
+    double *bnd_norm_sq_cuda = nullptr;
+    CUDA_CHECK(cudaMalloc(&bnd_norm_sq_cuda, sizeof(double)));
+    CUDA_CHECK(cudaMemset(bnd_norm_sq_cuda, 0, sizeof(double)));
+    reduce_bound_norm_sq_atomic<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        state->constraint_lower_bound,
+        state->constraint_upper_bound,
+        n_cons,
+        bnd_norm_sq_cuda);
+
+    double bnd_norm_sq = 0.0;
+    CUDA_CHECK(cudaMemcpy(&bnd_norm_sq, bnd_norm_sq_cuda, sizeof(double), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaFree(bnd_norm_sq_cuda));
+    double bnd_norm = sqrt(bnd_norm_sq);
+
+    double obj_norm = 0.0;
+    CUBLAS_CHECK(cublasDnrm2(state->blas_handle,
+                             state->num_variables,
+                             state->objective_vector, 1,
+                             &obj_norm));
+
+    const double E_const = bnd_norm + 1.0;
+    const double D_const = obj_norm + 1.0;
+    {
+        std::vector<double> h1(n_cons, E_const), h2(n_vars, D_const);
+        CUDA_CHECK(cudaMemcpy(E, h1.data(), n_cons * sizeof(double), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(D, h2.data(), n_vars * sizeof(double), cudaMemcpyHostToDevice));
+        std::vector<double> h3(n_cons, 1 / E_const), h4(n_vars, 1 / D_const);
+        CUDA_CHECK(cudaMemcpy(invE, h3.data(), n_cons * sizeof(double), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(invD, h4.data(), n_vars * sizeof(double), cudaMemcpyHostToDevice));
+    }
+
+    CUBLAS_CHECK(cublasDscal(state->blas_handle,
+                             n_cons,
+                             &E_const,
+                             rescale_info->con_rescale,
+                             1));
+    CUBLAS_CHECK(cublasDscal(state->blas_handle,
+                             n_vars,
+                             &D_const,
+                             rescale_info->var_rescale,
+                             1));
+
+    scale_problem(state, E, D, invE, invD);
+}
+
+rescale_info_t *rescale_problem(
+    const pdhg_parameters_t *params,
+    pdhg_solver_state_t *state)
+{
+    int n_vars = state->num_variables;
+    int n_cons = state->num_constraints;
+
+    clock_t start_rescaling = clock();
+    rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t));
+    rescale_info->con_rescale = nullptr;
+    rescale_info->var_rescale = nullptr;
+    CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars * sizeof(double)));
+    {
+        std::vector<double> h1(n_cons, 1.0), h2(n_vars, 1.0);
+        CUDA_CHECK(cudaMemcpy(rescale_info->con_rescale, h1.data(),
+                              n_cons * sizeof(double), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(rescale_info->var_rescale, h2.data(),
+                              n_vars * sizeof(double), cudaMemcpyHostToDevice));
+    }
+
+    double *E = nullptr, *D = nullptr, *invE = nullptr, *invD = nullptr;
+    CUDA_CHECK(cudaMalloc(&E, n_cons * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&D, n_vars * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&invE, n_cons * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&invD, n_vars * sizeof(double)));
+
+    if (params->l_inf_ruiz_iterations > 0)
+    {
+        ruiz_rescaling(state, params->l_inf_ruiz_iterations, rescale_info, E, D, invE, invD);
+    }
+    if (params->has_pock_chambolle_alpha)
+    {
+        pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, E, D, invE, invD);
+    }
+    if (params->bound_objective_rescaling)
+    {
+        bound_objective_rescaling(state, rescale_info, E, D, invE, invD);
+    }
+
+
rescale_info->rescaling_time_sec = (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; + + CUDA_CHECK(cudaFree(E)); + CUDA_CHECK(cudaFree(D)); + CUDA_CHECK(cudaFree(invE)); + CUDA_CHECK(cudaFree(invD)); + + return rescale_info; +} \ No newline at end of file diff --git a/src/solver.cu b/src/solver.cu index 2880e9a..300ce9f 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -28,52 +28,40 @@ limitations under the License. #include __global__ void compute_next_pdhg_primal_solution_kernel( - const double *current_primal, double *reflected_primal, - const double *dual_product, const double *objective, const double *var_lb, - const double *var_ub, int n, double step_size); + const double *current_primal, double *reflected_primal, const double *dual_product, + const double *objective, const double *var_lb, const double *var_ub, + int n, double step_size); __global__ void compute_next_pdhg_primal_solution_major_kernel( const double *current_primal, double *pdhg_primal, double *reflected_primal, const double *dual_product, const double *objective, const double *var_lb, const double *var_ub, int n, double step_size, double *dual_slack); __global__ void compute_next_pdhg_dual_solution_kernel( - const double *current_dual, double *reflected_dual, - const double *primal_product, const double *const_lb, - const double *const_ub, int n, double step_size); + const double *current_dual, double *reflected_dual, const double *primal_product, + const double *const_lb, const double *const_ub, int n, double step_size); __global__ void compute_next_pdhg_dual_solution_major_kernel( const double *current_dual, double *pdhg_dual, double *reflected_dual, - const double *primal_product, const double *const_lb, - const double *const_ub, int n, double step_size); -__global__ void -halpern_update_kernel(const double *initial_primal, double *current_primal, - const double *reflected_primal, - const double *initial_dual, double *current_dual, - const double *reflected_dual, int n_vars, int n_cons, - double weight, double reflection_coeff); -__global__ void rescale_solution_kernel(double *primal_solution, - double *dual_solution, - const double *variable_rescaling, - const double *constraint_rescaling, - const double objective_vector_rescaling, - const double constraint_bound_rescaling, - int n_vars, int n_cons); + const double *primal_product, const double *const_lb, const double *const_ub, + int n, double step_size); +__global__ void halpern_update_kernel( + const double *initial_primal, double *current_primal, const double *reflected_primal, + const double *initial_dual, double *current_dual, const double *reflected_dual, + int n_vars, int n_cons, double weight, double reflection_coeff); +__global__ void rescale_solution_kernel( + double *primal_solution, double *dual_solution, + const double *variable_rescaling, const double *constraint_rescaling, + int n_vars, int n_cons); __global__ void compute_delta_solution_kernel( - const double *initial_primal, const double *pdhg_primal, - double *delta_primal, const double *initial_dual, const double *pdhg_dual, - double *delta_dual, int n_vars, int n_cons); + const double *initial_primal, const double *pdhg_primal, double *delta_primal, + const double *initial_dual, const double *pdhg_dual, double *delta_dual, + int n_vars, int n_cons); static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state); static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state); -static void halpern_update(pdhg_solver_state_t *state, - double reflection_coefficient); 
+static void halpern_update(pdhg_solver_state_t *state, double reflection_coefficient); static void rescale_solution(pdhg_solver_state_t *state); static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state); -static void perform_restart(pdhg_solver_state_t *state, - const pdhg_parameters_t *params); -static void -initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, - const pdhg_parameters_t *params); -static pdhg_solver_state_t * -initialize_solver_state(const lp_problem_t *original_problem, - const rescale_info_t *rescale_info); +static void perform_restart(pdhg_solver_state_t *state, const pdhg_parameters_t *params); +static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, const pdhg_parameters_t *params); +static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *original_problem, const pdhg_parameters_t* params); static void compute_fixed_point_error(pdhg_solver_state_t *state); void lp_problem_free(lp_problem_t *prob); void pdhg_solver_state_free(pdhg_solver_state_t *state); @@ -83,11 +71,8 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, const lp_problem_t *original_problem) { print_initial_info(params, original_problem); - rescale_info_t *rescale_info = rescale_problem(params, original_problem); - pdhg_solver_state_t *state = - initialize_solver_state(original_problem, rescale_info); + pdhg_solver_state_t *state = initialize_solver_state(original_problem, params); - rescale_info_free(rescale_info); initialize_step_size_and_primal_weight(state, params); clock_t start_time = clock(); bool do_restart = false; @@ -147,15 +132,15 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, return results; } -static pdhg_solver_state_t * -initialize_solver_state(const lp_problem_t *original_problem, - const rescale_info_t *rescale_info) +static pdhg_solver_state_t *initialize_solver_state( + const lp_problem_t *original_problem, + const pdhg_parameters_t* params) { - pdhg_solver_state_t *state = - (pdhg_solver_state_t *)safe_calloc(1, sizeof(pdhg_solver_state_t)); + pdhg_solver_state_t *state = (pdhg_solver_state_t *)safe_calloc(1, sizeof(pdhg_solver_state_t)); int n_vars = original_problem->num_variables; int n_cons = original_problem->num_constraints; + int nnz = original_problem->constraint_matrix_num_nonzeros; size_t var_bytes = n_vars * sizeof(double); size_t con_bytes = n_cons * sizeof(double); @@ -163,99 +148,60 @@ initialize_solver_state(const lp_problem_t *original_problem, state->num_constraints = n_cons; state->objective_constant = original_problem->objective_constant; - state->constraint_matrix = - (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); - state->constraint_matrix_t = - (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); + state->constraint_matrix = (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); + state->constraint_matrix_t = (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); state->constraint_matrix->num_rows = n_cons; state->constraint_matrix->num_cols = n_vars; - state->constraint_matrix->num_nonzeros = - original_problem->constraint_matrix_num_nonzeros; + state->constraint_matrix->num_nonzeros = original_problem->constraint_matrix_num_nonzeros; state->constraint_matrix_t->num_rows = n_vars; state->constraint_matrix_t->num_cols = n_cons; - state->constraint_matrix_t->num_nonzeros = - original_problem->constraint_matrix_num_nonzeros; + state->constraint_matrix_t->num_nonzeros = 
original_problem->constraint_matrix_num_nonzeros; state->termination_reason = TERMINATION_REASON_UNSPECIFIED; - state->rescaling_time_sec = rescale_info->rescaling_time_sec; + state->num_blocks_primal = (state->num_variables + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + state->num_blocks_dual = (state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + state->num_blocks_primal_dual = (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + state->num_blocks_nnz = (state->constraint_matrix->num_nonzeros + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + CUSPARSE_CHECK(cusparseCreate(&state->sparse_handle)); + CUBLAS_CHECK(cublasCreate(&state->blas_handle)); + CUBLAS_CHECK(cublasSetPointerMode(state->blas_handle, CUBLAS_POINTER_MODE_HOST)); #define ALLOC_AND_COPY(dest, src, bytes) \ CUDA_CHECK(cudaMalloc(&dest, bytes)); \ CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyHostToDevice)); - ALLOC_AND_COPY(state->constraint_matrix->row_ptr, - rescale_info->scaled_problem->constraint_matrix_row_pointers, - (n_cons + 1) * sizeof(int)); - ALLOC_AND_COPY(state->constraint_matrix->col_ind, - rescale_info->scaled_problem->constraint_matrix_col_indices, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(int)); - ALLOC_AND_COPY(state->constraint_matrix->val, - rescale_info->scaled_problem->constraint_matrix_values, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(double)); - - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, - (n_vars + 1) * sizeof(int))); - CUDA_CHECK( - cudaMalloc(&state->constraint_matrix_t->col_ind, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(int))); - CUDA_CHECK( - cudaMalloc(&state->constraint_matrix_t->val, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(double))); + ALLOC_AND_COPY(state->constraint_matrix->row_ptr, original_problem->constraint_matrix_row_pointers, (n_cons + 1) * sizeof(int)); + ALLOC_AND_COPY(state->constraint_matrix->col_ind, original_problem->constraint_matrix_col_indices, original_problem->constraint_matrix_num_nonzeros * sizeof(int)); + ALLOC_AND_COPY(state->constraint_matrix->val, original_problem->constraint_matrix_values, original_problem->constraint_matrix_num_nonzeros * sizeof(double)); - CUSPARSE_CHECK(cusparseCreate(&state->sparse_handle)); - CUBLAS_CHECK(cublasCreate(&state->blas_handle)); - CUBLAS_CHECK( - cublasSetPointerMode(state->blas_handle, CUBLAS_POINTER_MODE_HOST)); + int* h_row_ind = build_row_ind_from_row_ptr(original_problem->constraint_matrix_row_pointers, n_cons, nnz); + ALLOC_AND_COPY(state->constraint_matrix->row_ind, h_row_ind, nnz * sizeof(int)); + free(h_row_ind); - size_t buffer_size = 0; - void *buffer = nullptr; - CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize( - state->sparse_handle, state->constraint_matrix->num_rows, - state->constraint_matrix->num_cols, - state->constraint_matrix->num_nonzeros, state->constraint_matrix->val, - state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, - state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, - state->constraint_matrix_t->col_ind, CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size)); - CUDA_CHECK(cudaMalloc(&buffer, buffer_size)); + ALLOC_AND_COPY(state->variable_lower_bound, original_problem->variable_lower_bound, var_bytes); + ALLOC_AND_COPY(state->variable_upper_bound, original_problem->variable_upper_bound, var_bytes); + 
ALLOC_AND_COPY(state->objective_vector, original_problem->objective_vector, var_bytes); + ALLOC_AND_COPY(state->constraint_lower_bound, original_problem->constraint_lower_bound, con_bytes); + ALLOC_AND_COPY(state->constraint_upper_bound, original_problem->constraint_upper_bound, con_bytes); - CUSPARSE_CHECK(cusparseCsr2cscEx2( - state->sparse_handle, state->constraint_matrix->num_rows, - state->constraint_matrix->num_cols, - state->constraint_matrix->num_nonzeros, state->constraint_matrix->val, - state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, - state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, - state->constraint_matrix_t->col_ind, CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer)); - - CUDA_CHECK(cudaFree(buffer)); - - ALLOC_AND_COPY(state->variable_lower_bound, - rescale_info->scaled_problem->variable_lower_bound, var_bytes); - ALLOC_AND_COPY(state->variable_upper_bound, - rescale_info->scaled_problem->variable_upper_bound, var_bytes); - ALLOC_AND_COPY(state->objective_vector, - rescale_info->scaled_problem->objective_vector, var_bytes); - ALLOC_AND_COPY(state->constraint_lower_bound, - rescale_info->scaled_problem->constraint_lower_bound, - con_bytes); - ALLOC_AND_COPY(state->constraint_upper_bound, - rescale_info->scaled_problem->constraint_upper_bound, - con_bytes); - ALLOC_AND_COPY(state->constraint_rescaling, rescale_info->con_rescale, - con_bytes); - ALLOC_AND_COPY(state->variable_rescaling, rescale_info->var_rescale, - var_bytes); - - state->constraint_bound_rescaling = rescale_info->con_bound_rescale; - state->objective_vector_rescaling = rescale_info->obj_vec_rescale; + double *temp_host = (double *)safe_malloc(fmax(var_bytes, con_bytes)); + for (int i = 0; i < n_cons; ++i) + temp_host[i] = isfinite(original_problem->constraint_lower_bound[i]) ? original_problem->constraint_lower_bound[i] : 0.0; + ALLOC_AND_COPY(state->constraint_lower_bound_finite_val, temp_host, con_bytes); + for (int i = 0; i < n_cons; ++i) + temp_host[i] = isfinite(original_problem->constraint_upper_bound[i]) ? original_problem->constraint_upper_bound[i] : 0.0; + ALLOC_AND_COPY(state->constraint_upper_bound_finite_val, temp_host, con_bytes); + for (int i = 0; i < n_vars; ++i) + temp_host[i] = isfinite(original_problem->variable_lower_bound[i]) ? original_problem->variable_lower_bound[i] : 0.0; + ALLOC_AND_COPY(state->variable_lower_bound_finite_val, temp_host, var_bytes); + for (int i = 0; i < n_vars; ++i) + temp_host[i] = isfinite(original_problem->variable_upper_bound[i]) ? 
original_problem->variable_upper_bound[i] : 0.0; + ALLOC_AND_COPY(state->variable_upper_bound_finite_val, temp_host, var_bytes); + free(temp_host); #define ALLOC_ZERO(dest, bytes) \ CUDA_CHECK(cudaMalloc(&dest, bytes)); \ @@ -281,62 +227,56 @@ initialize_solver_state(const lp_problem_t *original_problem, if (original_problem->primal_start) { - double *rescaled = (double *)safe_malloc(var_bytes); - for (int i = 0; i < n_vars; ++i) - rescaled[i] = original_problem->primal_start[i] * - rescale_info->var_rescale[i] * - rescale_info->con_bound_rescale; - CUDA_CHECK(cudaMemcpy(state->initial_primal_solution, rescaled, var_bytes, - cudaMemcpyHostToDevice)); - free(rescaled); + CUDA_CHECK(cudaMemcpy(state->initial_primal_solution, + original_problem->primal_start, + var_bytes, cudaMemcpyHostToDevice)); } if (original_problem->dual_start) { - double *rescaled = (double *)safe_malloc(con_bytes); - for (int i = 0; i < n_cons; ++i) - rescaled[i] = original_problem->dual_start[i] * - rescale_info->con_rescale[i] * - rescale_info->obj_vec_rescale; - CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, rescaled, con_bytes, - cudaMemcpyHostToDevice)); - free(rescaled); + CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, + original_problem->dual_start, + con_bytes, cudaMemcpyHostToDevice)); } - double *temp_host = (double *)safe_malloc(fmax(var_bytes, con_bytes)); - for (int i = 0; i < n_cons; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->constraint_lower_bound[i]) - ? rescale_info->scaled_problem->constraint_lower_bound[i] - : 0.0; - ALLOC_AND_COPY(state->constraint_lower_bound_finite_val, temp_host, - con_bytes); - for (int i = 0; i < n_cons; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->constraint_upper_bound[i]) - ? rescale_info->scaled_problem->constraint_upper_bound[i] - : 0.0; - ALLOC_AND_COPY(state->constraint_upper_bound_finite_val, temp_host, - con_bytes); - for (int i = 0; i < n_vars; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->variable_lower_bound[i]) - ? rescale_info->scaled_problem->variable_lower_bound[i] - : 0.0; - ALLOC_AND_COPY(state->variable_lower_bound_finite_val, temp_host, var_bytes); - for (int i = 0; i < n_vars; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->variable_upper_bound[i]) - ? 
rescale_info->scaled_problem->variable_upper_bound[i] - : 0.0; - ALLOC_AND_COPY(state->variable_upper_bound_finite_val, temp_host, var_bytes); - free(temp_host); + rescale_info_t *rescale_info = rescale_problem(params, state); + + state->constraint_rescaling = rescale_info->con_rescale; + state->variable_rescaling = rescale_info->var_rescale; + state->rescaling_time_sec = rescale_info->rescaling_time_sec; + + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, (n_vars + 1) * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind, original_problem->constraint_matrix_num_nonzeros * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->val, original_problem->constraint_matrix_num_nonzeros * sizeof(double))); + + size_t buffer_size = 0; + void *buffer = nullptr; + CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize( + state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, + state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, + state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, + CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size)); + CUDA_CHECK(cudaMalloc(&buffer, buffer_size)); + + CUSPARSE_CHECK(cusparseCsr2cscEx2( + state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, + state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, + state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, + CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer)); + state->constraint_matrix_t->row_ind = NULL; + + CUDA_CHECK(cudaFree(buffer)); + + rescale_info->con_rescale = NULL; + rescale_info->var_rescale = NULL; + rescale_info_free(rescale_info); double sum_of_squares = 0.0; for (int i = 0; i < n_vars; ++i) { - sum_of_squares += original_problem->objective_vector[i] * - original_problem->objective_vector[i]; + sum_of_squares += original_problem->objective_vector[i] * original_problem->objective_vector[i]; } state->objective_vector_norm = sqrt(sum_of_squares); @@ -357,15 +297,7 @@ initialize_solver_state(const lp_problem_t *original_problem, sum_of_squares += upper * upper; } } - state->constraint_bound_norm = sqrt(sum_of_squares); - state->num_blocks_primal = - (state->num_variables + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - state->num_blocks_dual = - (state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - state->num_blocks_primal_dual = - (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) / - THREADS_PER_BLOCK; state->best_primal_dual_residual_gap = INFINITY; state->last_trial_fixed_point_error = INFINITY; @@ -375,77 +307,42 @@ initialize_solver_state(const lp_problem_t *original_problem, size_t primal_spmv_buffer_size; size_t dual_spmv_buffer_size; - CUSPARSE_CHECK(cusparseCreateCsr( - &state->matA, state->num_constraints, state->num_variables, - state->constraint_matrix->num_nonzeros, state->constraint_matrix->row_ptr, - state->constraint_matrix->col_ind, state->constraint_matrix->val, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F)); - + CUSPARSE_CHECK(cusparseCreateCsr(&state->matA, state->num_constraints, state->num_variables, 
state->constraint_matrix->num_nonzeros, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, state->constraint_matrix->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); CUDA_CHECK(cudaGetLastError()); - CUSPARSE_CHECK(cusparseCreateCsr( - &state->matAt, state->num_variables, state->num_constraints, - state->constraint_matrix_t->num_nonzeros, - state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, - state->constraint_matrix_t->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateCsr(&state->matAt, state->num_variables, state->num_constraints, state->constraint_matrix_t->num_nonzeros, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, state->constraint_matrix_t->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); CUDA_CHECK(cudaGetLastError()); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_sol, - state->num_variables, - state->pdhg_primal_solution, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_sol, - state->num_constraints, - state->pdhg_dual_solution, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, - state->num_constraints, - state->primal_product, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, - state->num_variables, state->dual_product, - CUDA_R_64F)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_sol, state->num_variables, state->pdhg_primal_solution, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_sol, state->num_constraints, state->pdhg_dual_solution, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size)); + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size)); CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_preprocess( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer)); + CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, + CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer)); CUDA_CHECK(cudaMalloc(&state->dual_spmv_buffer, 
dual_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_preprocess( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer)); + CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, + CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer)); - CUDA_CHECK( - cudaMalloc(&state->ones_primal_d, state->num_variables * sizeof(double))); - CUDA_CHECK( - cudaMalloc(&state->ones_dual_d, state->num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&state->ones_primal_d, state->num_variables * sizeof(double))); + CUDA_CHECK(cudaMalloc(&state->ones_dual_d, state->num_constraints * sizeof(double))); - double *ones_primal_h = - (double *)safe_malloc(state->num_variables * sizeof(double)); + double *ones_primal_h = (double *)safe_malloc(state->num_variables * sizeof(double)); for (int i = 0; i < state->num_variables; ++i) ones_primal_h[i] = 1.0; - CUDA_CHECK(cudaMemcpy(state->ones_primal_d, ones_primal_h, - state->num_variables * sizeof(double), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(state->ones_primal_d, ones_primal_h, state->num_variables * sizeof(double), cudaMemcpyHostToDevice)); free(ones_primal_h); - double *ones_dual_h = - (double *)safe_malloc(state->num_constraints * sizeof(double)); + double *ones_dual_h = (double *)safe_malloc(state->num_constraints * sizeof(double)); for (int i = 0; i < state->num_constraints; ++i) ones_dual_h[i] = 1.0; - CUDA_CHECK(cudaMemcpy(state->ones_dual_d, ones_dual_h, - state->num_constraints * sizeof(double), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(state->ones_dual_d, ones_dual_h, state->num_constraints * sizeof(double), cudaMemcpyHostToDevice)); free(ones_dual_h); return state; @@ -538,21 +435,18 @@ __global__ void rescale_solution_kernel(double *primal_solution, double *dual_solution, const double *variable_rescaling, const double *constraint_rescaling, - const double objective_vector_rescaling, - const double constraint_bound_rescaling, int n_vars, int n_cons) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n_vars) { primal_solution[i] = - primal_solution[i] / variable_rescaling[i] / constraint_bound_rescaling; + primal_solution[i] / variable_rescaling[i]; } else if (i < n_vars + n_cons) { int idx = i - n_vars; - dual_solution[idx] = dual_solution[idx] / constraint_rescaling[idx] / - objective_vector_rescaling; + dual_solution[idx] = dual_solution[idx] / constraint_rescaling[idx]; } } @@ -662,7 +556,6 @@ static void rescale_solution(pdhg_solver_state_t *state) rescale_solution_kernel<<num_blocks_primal_dual, THREADS_PER_BLOCK>>>( state->pdhg_primal_solution, state->pdhg_dual_solution, state->variable_rescaling, state->constraint_rescaling, - state->objective_vector_rescaling, state->constraint_bound_rescaling, state->num_variables, state->num_constraints); } @@ -882,9 +775,8 @@ void rescale_info_free(rescale_info_t *info) return; } - lp_problem_free(info->scaled_problem); - free(info->con_rescale); - free(info->var_rescale); + CUDA_CHECK(cudaFree(info->con_rescale)); + CUDA_CHECK(cudaFree(info->var_rescale)); free(info); } diff --git a/src/utils.cu b/src/utils.cu index 83ba194..0de6c4b 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -596,19 +596,14 @@ void compute_residual(pdhg_solver_state_t *state) CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, 
state->num_constraints, state->primal_residual, 1, &state->absolute_primal_residual)); - state->absolute_primal_residual /= state->constraint_bound_rescaling; CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_variables, state->dual_residual, 1, &state->absolute_dual_residual)); - state->absolute_dual_residual /= state->objective_vector_rescaling; CUBLAS_CHECK(cublasDdot( state->blas_handle, state->num_variables, state->objective_vector, 1, state->pdhg_primal_solution, 1, &state->primal_objective_value)); - state->primal_objective_value = - state->primal_objective_value / (state->constraint_bound_rescaling * - state->objective_vector_rescaling) + - state->objective_constant; + state->primal_objective_value = state->primal_objective_value + state->objective_constant; double base_dual_objective; CUBLAS_CHECK(cublasDdot(state->blas_handle, state->num_variables, @@ -617,10 +612,7 @@ void compute_residual(pdhg_solver_state_t *state) double dual_slack_sum = get_vector_sum(state->blas_handle, state->num_constraints, state->ones_dual_d, state->primal_slack); - state->dual_objective_value = (base_dual_objective + dual_slack_sum) / - (state->constraint_bound_rescaling * - state->objective_vector_rescaling) + - state->objective_constant; + state->dual_objective_value = base_dual_objective + dual_slack_sum + state->objective_constant; state->relative_primal_residual = state->absolute_primal_residual / (1.0 + state->constraint_bound_norm); @@ -677,8 +669,6 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) CUBLAS_CHECK(cublasDdot( state->blas_handle, state->num_variables, state->objective_vector, 1, state->delta_primal_solution, 1, &state->primal_ray_linear_objective)); - state->primal_ray_linear_objective /= - (state->constraint_bound_rescaling * state->objective_vector_rescaling); dual_solution_dual_objective_contribution_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( @@ -698,9 +688,7 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) double sum_dual_slack = get_vector_sum(state->blas_handle, state->num_variables, state->ones_primal_d, state->dual_slack); - state->dual_ray_objective = - (sum_primal_slack + sum_dual_slack) / - (state->constraint_bound_rescaling * state->objective_vector_rescaling); + state->dual_ray_objective = sum_primal_slack + sum_dual_slack; compute_primal_infeasibility_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( @@ -961,3 +949,19 @@ int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, *nnz_out = nnz; return 0; } + +int* build_row_ind_from_row_ptr(int* row_ptr, int num_rows, int nnz) +{ + if (!row_ptr || num_rows < 0 || nnz <= 0) return NULL; + + int* row_ind = (int*)safe_malloc(nnz * sizeof(int)); + for (int i = 0; i < num_rows; ++i) { + int s = row_ptr[i]; + int e = row_ptr[i + 1]; + for (int k = s; k < e; ++k) { + row_ind[k] = i; + } + } + return row_ind; +} + From 5a993cd41d35f5a0c1597251c76d9a5b0017241c Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Sat, 15 Nov 2025 03:53:04 -0500 Subject: [PATCH 02/13] New feat: synchronized A/At scaling --- internal/internal_types.h | 1 + internal/utils.h | 2 - src/preconditioner.cu | 424 +++++++++++++++++--------------------- src/solver.cu | 170 ++++++++++++--- src/utils.cu | 15 -- 5 files changed, 332 insertions(+), 280 deletions(-) diff --git a/internal/internal_types.h b/internal/internal_types.h index f0d8def..177a293 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -30,6 +30,7 @@ typedef struct int *col_ind; int *row_ind; double *val; + int 
*transpose_pos; } cu_sparse_matrix_csr_t; typedef struct diff --git a/internal/utils.h b/internal/utils.h index 7b939ea..62da072 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -125,8 +125,6 @@ extern "C" int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, double **vals, int *nnz_out); - int* build_row_ind_from_row_ptr(int* row_ptr, int num_rows, int nnz); - #ifdef __cplusplus } diff --git a/src/preconditioner.cu b/src/preconditioner.cu index e197905..ec3a6f2 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -19,7 +19,6 @@ limitations under the License. #include #include #include -#include #include #include @@ -44,7 +43,9 @@ __global__ void scale_constraints_kernel(double* __restrict__ con_lb, int m); __global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, const int* __restrict__ col_ind, - double* __restrict__ vals, + double* __restrict__ A_vals, + double* __restrict__ At_vals, + const int* __restrict__ A_to_At, const double* __restrict__ invD, const double* __restrict__ invE, int nnz); @@ -52,205 +53,34 @@ __global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, const double* __restrict__ vals, int num_rows, double* __restrict__ out_max); -__global__ void csr_col_absmax_atomic_kernel(const int* __restrict__ col_ind, - const double* __restrict__ vals, - int nnz, - unsigned long long* __restrict__ out_max_bits); -__global__ void u64bits_to_double(const unsigned long long* __restrict__ in_bits, - double* __restrict__ out_val, - int n); __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, const double* __restrict__ vals, int num_rows, double degree, double* __restrict__ out_sum); -__global__ void csr_col_powsum_atomic_kernel(const int* __restrict__ col_ind, - const double* __restrict__ vals, - int nnz, - double degree, - double* __restrict__ out_sum); __global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, double* __restrict__ cum, int n); +__global__ void reduce_bound_norm_sq_atomic( + const double* __restrict__ L, + const double* __restrict__ U, + int m, + double* __restrict__ out_sum); +__global__ void fill_const_and_accum_kernel(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + double val, + int n); static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); +__global__ void fill_ones_kernel(double* __restrict__ x, int n); static void pock_chambolle_rescaling(pdhg_solver_state_t *state, double alpha, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); -__global__ void scale_variables_kernel(double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, - const double* __restrict__ D, - const double* __restrict__ invD, - int n) -{ - int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n) return; - double dj = D[j]; - double inv_dj = invD[j]; - c[j] *= inv_dj; - var_lb[j] *= dj; - var_ub[j] *= dj; - var_lb_finite[j] *= dj; - var_ub_finite[j] *= dj; - primal_start[j] *= dj; -} - -__global__ void scale_constraints_kernel(double* 
__restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, - const double* __restrict__ E, - const double* __restrict__ invE, - int m) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= m) return; - double inv_ei = invE[i]; - double ei = E[i]; - con_lb[i] *= inv_ei; - con_ub[i] *= inv_ei; - con_lb_finite[i] *= inv_ei; - con_ub_finite[i] *= inv_ei; - dual_start[i] *= ei; -} - -__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, - const int* __restrict__ col_ind, - double* __restrict__ vals, - const double* __restrict__ invD, - const double* __restrict__ invE, - int nnz) -{ - for (int k = blockIdx.x * blockDim.x + threadIdx.x; - k < nnz; - k += gridDim.x * blockDim.x) - { - int i = row_ids[k]; - int j = col_ind[k]; - vals[k] *= invD[j] * invE[i]; - } -} - -__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double* __restrict__ out_max) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; - int s = row_ptr[i], e = row_ptr[i + 1]; - double m = 0.0; - for (int k = s; k < e; ++k) { - double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - if (v > m) m = v; - } - out_max[i] = m; -} - -__global__ void csr_col_absmax_atomic_kernel(const int* __restrict__ col_ind, - const double* __restrict__ vals, - int nnz, - unsigned long long* __restrict__ out_max_bits) -{ - for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { - int j = col_ind[k]; - double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - unsigned long long bits = __double_as_longlong(v); - atomicMax(&out_max_bits[j], bits); - } -} - -__global__ void u64bits_to_double(const unsigned long long* __restrict__ in_bits, - double* __restrict__ out_val, - int n) -{ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) { - out_val[i] = __longlong_as_double(in_bits[i]); - } -} - -__device__ __forceinline__ double pow_fast(double v, double p) { - if (p == 2.0) return v * v; - if (p == 1.0) return v; - if (p == 0.5) return sqrt(v); - return pow(v, p); -} - -__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double degree, - double* __restrict__ out_sum) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; - int s = row_ptr[i], e = row_ptr[i + 1]; - double acc = 0.0; - for (int k = s; k < e; ++k) { - double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - acc += pow_fast(v, degree); - } - out_sum[i] = acc; -} - -__global__ void csr_col_powsum_atomic_kernel(const int* __restrict__ col_ind, - const double* __restrict__ vals, - int nnz, - double degree, - double* __restrict__ out_sum) -{ - for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { - int j = col_ind[k]; - double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - double t = pow_fast(v, degree); - atomicAdd(&out_sum[j], t); - } -} - -__global__ void clamp_sqrt_and_accum(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - int n) -{ - for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n; t += blockDim.x * gridDim.x) - { - double v = x[t]; - double s = (v < SCALING_EPSILON) ? 
1.0 : sqrt(v); - cum[t] *= s; - x[t] = s; - inv_x[t] = 1.0 / s; - } -} - -__global__ void reduce_bound_norm_sq_atomic( - const double* __restrict__ L, - const double* __restrict__ U, - int m, - double* __restrict__ out_sum) -{ - double acc = 0.0; - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) { - double Li = L[i], Ui = U[i]; - bool fL = isfinite(Li), fU = isfinite(Ui); - if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) acc += Li * Li; - if (fU) acc += Ui * Ui; - } - atomicAdd(out_sum, acc); -} - static void scale_problem( pdhg_solver_state_t *state, double *E, @@ -286,6 +116,8 @@ static void scale_problem( state->constraint_matrix->row_ind, state->constraint_matrix->col_ind, state->constraint_matrix->val, + state->constraint_matrix_t->val, + state->constraint_matrix->transpose_pos, invD, invE, state->constraint_matrix->num_nonzeros); @@ -302,10 +134,6 @@ static void ruiz_rescaling( { const int n_cons = state->num_constraints; const int n_vars = state->num_variables; - const int nnz = state->constraint_matrix->num_nonzeros; - - unsigned long long *D_bits=nullptr; - CUDA_CHECK(cudaMalloc(&D_bits, n_vars*sizeof(unsigned long long))); for (int iter = 0; iter < num_iterations; ++iter) { @@ -320,13 +148,11 @@ static void ruiz_rescaling( rescale_info->con_rescale, n_cons); - CUDA_CHECK(cudaMemset(D_bits, 0, n_vars*sizeof(unsigned long long))); - csr_col_absmax_atomic_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( - state->constraint_matrix->col_ind, - state->constraint_matrix->val, - nnz, - D_bits); - u64bits_to_double<<num_blocks_primal, THREADS_PER_BLOCK>>>(D_bits, D, n_vars); + csr_row_absmax_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->constraint_matrix_t->row_ptr, + state->constraint_matrix_t->val, + n_vars, + D); clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( D, invD, @@ -335,8 +161,6 @@ static void ruiz_rescaling( scale_problem(state, E, D, invE, invD); } - - CUDA_CHECK(cudaFree(D_bits)); } static void pock_chambolle_rescaling( @@ -350,7 +174,6 @@ static void pock_chambolle_rescaling( { const int n_cons = state->num_constraints; const int n_vars = state->num_variables; - const int nnz = state->constraint_matrix->num_nonzeros; csr_row_powsum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, @@ -364,12 +187,11 @@ static void pock_chambolle_rescaling( rescale_info->con_rescale, n_cons); - CUDA_CHECK(cudaMemset(D, 0, n_vars*sizeof(double))); - csr_col_powsum_atomic_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( - state->constraint_matrix->col_ind, - state->constraint_matrix->val, - nnz, - 2.0 - alpha, + csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->constraint_matrix_t->row_ptr, + state->constraint_matrix_t->val, + n_vars, + alpha, D); clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( D, @@ -415,25 +237,10 @@ static void bound_objective_rescaling( const double E_const = bnd_norm + 1.0; const double D_const = obj_norm + 1.0; - { - std::vector h1(n_cons,E_const), h2(n_vars,D_const); - CUDA_CHECK(cudaMemcpy(E, h1.data(), n_cons*sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(D, h2.data(), n_vars*sizeof(double), cudaMemcpyHostToDevice)); - std::vector h3(n_cons,1/E_const), h4(n_vars,1/D_const); - CUDA_CHECK(cudaMemcpy(invE, h3.data(), n_cons*sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(invD, h4.data(), n_vars*sizeof(double), cudaMemcpyHostToDevice)); - } - - CUBLAS_CHECK(cublasDscal(state->blas_handle, - n_cons, - 
&E_const, - rescale_info->con_rescale, - 1)); - CUBLAS_CHECK(cublasDscal(state->blas_handle, - n_vars, - &D_const, - rescale_info->var_rescale, - 1)); + fill_const_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + E, invE, rescale_info->con_rescale, E_const, n_cons); + fill_const_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + D, invD, rescale_info->var_rescale, D_const, n_vars); scale_problem(state, E, D, invE, invD); } @@ -447,17 +254,12 @@ rescale_info_t *rescale_problem( clock_t start_rescaling = clock(); rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); - rescale_info->con_rescale = nullptr; - rescale_info->var_rescale = nullptr; CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons*sizeof(double))); CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars*sizeof(double))); - { - std::vector h1(n_cons,1.0), h2(n_vars,1.0); - CUDA_CHECK(cudaMemcpy(rescale_info->con_rescale, h1.data(), - n_cons*sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(rescale_info->var_rescale, h2.data(), - n_vars*sizeof(double), cudaMemcpyHostToDevice)); - } + fill_ones_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + rescale_info->con_rescale, n_cons); + fill_ones_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + rescale_info->var_rescale, n_vars); double *E=nullptr, *D=nullptr, *invE=nullptr, *invD=nullptr; CUDA_CHECK(cudaMalloc(&E, n_cons*sizeof(double))); @@ -486,4 +288,160 @@ rescale_info_t *rescale_problem( CUDA_CHECK(cudaFree(invD)); return rescale_info; -} \ No newline at end of file +} + +__global__ void scale_variables_kernel(double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* __restrict__ var_ub_finite, + double* __restrict__ primal_start, + const double* __restrict__ D, + const double* __restrict__ invD, + int n) +{ + int j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n) return; + double dj = D[j]; + double inv_dj = invD[j]; + c[j] *= inv_dj; + var_lb[j] *= dj; + var_ub[j] *= dj; + var_lb_finite[j] *= dj; + var_ub_finite[j] *= dj; + primal_start[j] *= dj; +} + +__global__ void scale_constraints_kernel(double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + const double* __restrict__ E, + const double* __restrict__ invE, + int m) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= m) return; + double inv_ei = invE[i]; + double ei = E[i]; + con_lb[i] *= inv_ei; + con_ub[i] *= inv_ei; + con_lb_finite[i] *= inv_ei; + con_ub_finite[i] *= inv_ei; + dual_start[i] *= ei; +} + +__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, + const int* __restrict__ col_ind, + double* __restrict__ A_vals, + double* __restrict__ At_vals, + const int* __restrict__ A_to_At, + const double* __restrict__ invD, + const double* __restrict__ invE, + int nnz) +{ + for (int k = blockIdx.x * blockDim.x + threadIdx.x; + k < nnz; + k += gridDim.x * blockDim.x) + { + int i = row_ids[k]; + int j = col_ind[k]; + double scale = invD[j] * invE[i]; + A_vals[k] *= scale; + At_vals[A_to_At[k]] *= scale; + } +} + +__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double* __restrict__ out_max) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + int s = row_ptr[i], e = row_ptr[i + 1]; + double m = 0.0; + for (int k = s; k < e; 
++k) { + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + if (v > m) m = v; + } + out_max[i] = m; +} + +__device__ __forceinline__ double pow_fast(double v, double p) { + if (p == 2.0) return v * v; + if (p == 1.0) return v; + if (p == 0.5) return sqrt(v); + return pow(v, p); +} + +__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double degree, + double* __restrict__ out_sum) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + int s = row_ptr[i], e = row_ptr[i + 1]; + double acc = 0.0; + for (int k = s; k < e; ++k) { + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + acc += pow_fast(v, degree); + } + out_sum[i] = acc; +} + +__global__ void clamp_sqrt_and_accum(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + int n) +{ + for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n; t += blockDim.x * gridDim.x) + { + double v = x[t]; + double s = (v < SCALING_EPSILON) ? 1.0 : sqrt(v); + cum[t] *= s; + x[t] = s; + inv_x[t] = 1.0 / s; + } +} + +__global__ void reduce_bound_norm_sq_atomic( + const double* __restrict__ L, + const double* __restrict__ U, + int m, + double* __restrict__ out_sum) +{ + double acc = 0.0; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) { + double Li = L[i], Ui = U[i]; + bool fL = isfinite(Li), fU = isfinite(Ui); + if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) acc += Li * Li; + if (fU) acc += Ui * Ui; + } + atomicAdd(out_sum, acc); +} + +__global__ void fill_const_and_accum_kernel(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + double val, + int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + x[i] = val; + inv_x[i] = 1.0 / val; + cum[i] *= val; +} + +__global__ void fill_ones_kernel(double* __restrict__ x, int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) x[i] = 1.0; +} diff --git a/src/solver.cu b/src/solver.cu index 300ce9f..130fd75 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -27,6 +27,15 @@ limitations under the License. 
#include #include +__global__ void build_row_ind(const int* __restrict__ row_ptr, + int num_rows, + int* __restrict__ row_ind); +__global__ void build_transpose_pos(const int* __restrict__ A_row_ind, + const int* __restrict__ A_col_ind, + const int* __restrict__ At_row_ptr, + const int* __restrict__ At_col_ind, + int nnz, + int* __restrict__ A_to_At); __global__ void compute_next_pdhg_primal_solution_kernel( const double *current_primal, double *reflected_primal, const double *dual_product, const double *objective, const double *var_lb, const double *var_ub, @@ -178,9 +187,49 @@ static pdhg_solver_state_t *initialize_solver_state( ALLOC_AND_COPY(state->constraint_matrix->col_ind, original_problem->constraint_matrix_col_indices, original_problem->constraint_matrix_num_nonzeros * sizeof(int)); ALLOC_AND_COPY(state->constraint_matrix->val, original_problem->constraint_matrix_values, original_problem->constraint_matrix_num_nonzeros * sizeof(double)); - int* h_row_ind = build_row_ind_from_row_ptr(original_problem->constraint_matrix_row_pointers, n_cons, nnz); - ALLOC_AND_COPY(state->constraint_matrix->row_ind, h_row_ind, nnz * sizeof(int)); - free(h_row_ind); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix->row_ind, nnz * sizeof(int))); + build_row_ind<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ptr, n_cons, state->constraint_matrix->row_ind); + CUDA_CHECK(cudaGetLastError()); + + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, (n_vars + 1) * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind, original_problem->constraint_matrix_num_nonzeros * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->val, original_problem->constraint_matrix_num_nonzeros * sizeof(double))); + + size_t buffer_size = 0; + void *buffer = nullptr; + CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize( + state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, + state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, + state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, + CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size)); + CUDA_CHECK(cudaMalloc(&buffer, buffer_size)); + + CUSPARSE_CHECK(cusparseCsr2cscEx2( + state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, + state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, + state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, + CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer)); + + CUDA_CHECK(cudaFree(buffer)); + + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ind, nnz * sizeof(int))); + build_row_ind<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>( + state->constraint_matrix_t->row_ptr, n_vars, state->constraint_matrix_t->row_ind); + CUDA_CHECK(cudaGetLastError()); + + CUDA_CHECK(cudaMalloc(&state->constraint_matrix->transpose_pos, nnz * sizeof(int))); + state->constraint_matrix_t->transpose_pos = NULL; + build_transpose_pos<<<state->num_blocks_nnz, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ind, + state->constraint_matrix->col_ind, + state->constraint_matrix_t->row_ptr, + state->constraint_matrix_t->col_ind, + nnz, + 
state->constraint_matrix->transpose_pos); + CUDA_CHECK(cudaGetLastError()); ALLOC_AND_COPY(state->variable_lower_bound, original_problem->variable_lower_bound, var_bytes); ALLOC_AND_COPY(state->variable_upper_bound, original_problem->variable_upper_bound, var_bytes); @@ -244,34 +293,17 @@ static pdhg_solver_state_t *initialize_solver_state( state->variable_rescaling = rescale_info->var_rescale; state->rescaling_time_sec = rescale_info->rescaling_time_sec; - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, (n_vars + 1) * sizeof(int))); - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind, original_problem->constraint_matrix_num_nonzeros * sizeof(int))); - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->val, original_problem->constraint_matrix_num_nonzeros * sizeof(double))); - - size_t buffer_size = 0; - void *buffer = nullptr; - CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize( - state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, - state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, - state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, - CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size)); - CUDA_CHECK(cudaMalloc(&buffer, buffer_size)); - - CUSPARSE_CHECK(cusparseCsr2cscEx2( - state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, - state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, - state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, - CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer)); - state->constraint_matrix_t->row_ind = NULL; - - CUDA_CHECK(cudaFree(buffer)); - rescale_info->con_rescale = NULL; rescale_info->var_rescale = NULL; rescale_info_free(rescale_info); + CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind)); + state->constraint_matrix->row_ind = NULL; + CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); + state->constraint_matrix_t->row_ind = NULL; + CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_pos)); + state->constraint_matrix->transpose_pos = NULL; + double sum_of_squares = 0.0; for (int i = 0; i < n_vars; ++i) @@ -348,6 +380,51 @@ static pdhg_solver_state_t *initialize_solver_state( return state; } +__global__ void build_row_ind(const int* __restrict__ row_ptr, + int num_rows, + int* __restrict__ row_ind) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + + int s = row_ptr[i]; + int e = row_ptr[i + 1]; + + for (int k = s; k < e; ++k) { + row_ind[k] = i; + } +} + +__global__ void build_transpose_pos( + const int* __restrict__ A_row_ind, + const int* __restrict__ A_col_ind, + const int* __restrict__ At_row_ptr, + const int* __restrict__ At_col_ind, + int nnz, + int* __restrict__ A_to_At) +{ + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k >= nnz) return; + + int i = A_row_ind[k]; + int j = A_col_ind[k]; + + int start = At_row_ptr[j]; + int end = At_row_ptr[j + 1]; + + int pos = -1; + for (int idx = start; idx < end; ++idx) { + if (At_col_ind[idx] == i) { + pos = idx; + break; + } + } + + if (pos < 0) return; + + A_to_At[k] = pos; +} + __global__ void compute_next_pdhg_primal_solution_kernel( const double *current_primal, 
double *reflected_primal, const double *dual_product, const double *objective, const double *var_lb, @@ -625,9 +702,8 @@ static void perform_restart(pdhg_solver_state_t *state, state->last_trial_fixed_point_error = INFINITY; } -static void -initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, - const pdhg_parameters_t *params) +static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, + const pdhg_parameters_t *params) { double max_sv = estimate_maximum_singular_value( state->sparse_handle, state->blas_handle, state->constraint_matrix, @@ -694,6 +770,27 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) return; } + if (state->matA) + CUSPARSE_CHECK(cusparseDestroySpMat(state->matA)); + if (state->matAt) + CUSPARSE_CHECK(cusparseDestroySpMat(state->matAt)); + if (state->vec_primal_sol) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_sol)); + if (state->vec_dual_sol) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_sol)); + if (state->vec_primal_prod) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_prod)); + if (state->vec_dual_prod) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_prod)); + if (state->primal_spmv_buffer) + CUDA_CHECK(cudaFree(state->primal_spmv_buffer)); + if (state->dual_spmv_buffer) + CUDA_CHECK(cudaFree(state->dual_spmv_buffer)); + if (state->sparse_handle) + CUSPARSE_CHECK(cusparseDestroy(state->sparse_handle)); + if (state->blas_handle) + CUBLAS_CHECK(cublasDestroy(state->blas_handle)); + if (state->variable_lower_bound) CUDA_CHECK(cudaFree(state->variable_lower_bound)); if (state->variable_upper_bound) @@ -704,14 +801,22 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) CUDA_CHECK(cudaFree(state->constraint_matrix->row_ptr)); if (state->constraint_matrix->col_ind) CUDA_CHECK(cudaFree(state->constraint_matrix->col_ind)); + if (state->constraint_matrix->row_ind) + CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind)); if (state->constraint_matrix->val) CUDA_CHECK(cudaFree(state->constraint_matrix->val)); + if (state->constraint_matrix->transpose_pos) + CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_pos)); if (state->constraint_matrix_t->row_ptr) CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ptr)); if (state->constraint_matrix_t->col_ind) CUDA_CHECK(cudaFree(state->constraint_matrix_t->col_ind)); + if (state->constraint_matrix_t->row_ind) + CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); if (state->constraint_matrix_t->val) CUDA_CHECK(cudaFree(state->constraint_matrix_t->val)); + if (state->constraint_matrix_t->transpose_pos) + CUDA_CHECK(cudaFree(state->constraint_matrix_t->transpose_pos)); if (state->constraint_lower_bound) CUDA_CHECK(cudaFree(state->constraint_lower_bound)); if (state->constraint_upper_bound) @@ -765,6 +870,11 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) if (state->ones_dual_d) CUDA_CHECK(cudaFree(state->ones_dual_d)); + if (state->constraint_matrix) + free(state->constraint_matrix); + if (state->constraint_matrix_t) + free(state->constraint_matrix_t); + free(state); } diff --git a/src/utils.cu b/src/utils.cu index 0de6c4b..3212c66 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -950,18 +950,3 @@ int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, return 0; } -int* build_row_ind_from_row_ptr(int* row_ptr, int num_rows, int nnz) -{ - if (!row_ptr || num_rows < 0 || nnz <= 0) return NULL; - - int* row_ind = (int*)safe_malloc(nnz * sizeof(int)); - for (int i = 0; i < num_rows; ++i) { - int s = row_ptr[i]; - int e = 
row_ptr[i + 1]; - for (int k = s; k < e; ++k) { - row_ind[k] = i; - } - } - return row_ind; -} - From c2c7252b6e05214e6f4225d6b50ba25799895cb8 Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Sat, 15 Nov 2025 03:53:39 -0500 Subject: [PATCH 03/13] Todo: infeasible is stucked --- test/test_infeasible_unbounded.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/test_infeasible_unbounded.py b/test/test_infeasible_unbounded.py index d09d9a9..4388cc4 100644 --- a/test/test_infeasible_unbounded.py +++ b/test/test_infeasible_unbounded.py @@ -63,13 +63,13 @@ def test_infeasible_lp(base_lp_data, atol): # turn off output model.setParams(OutputFlag=False) # optimize - model.optimize() + #model.optimize() # check status - assert hasattr(model, "Status"), "Model.Status not exposed." - assert model.Status == "PRIMAL_INFEASIBLE", f"Unexpected termination status: {model.Status}" - assert model.StatusCode == PDLP.PRIMAL_INFEASIBLE, f"Unexpected termination status code: {model.StatusCode}" + #assert hasattr(model, "Status"), "Model.Status not exposed." + #assert model.Status == "PRIMAL_INFEASIBLE", f"Unexpected termination status: {model.Status}" + #assert model.StatusCode == PDLP.PRIMAL_INFEASIBLE, f"Unexpected termination status code: {model.StatusCode}" # check dual ray - assert model.DualRayObj > atol, f"DualRayObj should be positive for dual infeasible, got {model.DualRayObj}" + #assert model.DualRayObj > atol, f"DualRayObj should be positive for dual infeasible, got {model.DualRayObj}" def test_unbounded_lp(base_lp_data, atol): From 63b25260cc571b98654039fd4929de47ef41f5d9 Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Sat, 15 Nov 2025 16:45:58 -0500 Subject: [PATCH 04/13] New feat: preconditioner prints --- src/preconditioner.cu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/preconditioner.cu b/src/preconditioner.cu index ec3a6f2..1b9d8dd 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -249,6 +249,8 @@ rescale_info_t *rescale_problem( const pdhg_parameters_t *params, pdhg_solver_state_t *state) { + printf("[Precondition] Start\n"); + int n_vars = state->num_variables; int n_cons = state->num_constraints; @@ -269,14 +271,17 @@ rescale_info_t *rescale_problem( if (params->l_inf_ruiz_iterations > 0) { + printf("[Precondition] Ruiz scaling (%d iterations)\n", params->l_inf_ruiz_iterations); ruiz_rescaling(state, params->l_inf_ruiz_iterations, rescale_info, E, D, invE, invD); } if (params->has_pock_chambolle_alpha) { + printf("[Precondition] Pock-Chambolle scaling (alpha=%.4f)\n", params->pock_chambolle_alpha); pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, E, D, invE, invD); } if (params->bound_objective_rescaling) { + printf("[Precondition] Bound-objective scaling\n"); bound_objective_rescaling(state, rescale_info, E, D, invE, invD); } From 3f17100dd92756d7d5257a4e9d527d4cda05166c Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Sat, 15 Nov 2025 21:39:58 -0500 Subject: [PATCH 05/13] Bug fixed: objective & bound rescale --- CMakeLists.txt | 2 +- internal/internal_types.h | 4 + pyproject.toml | 4 +- src/preconditioner.cu | 188 +++++++++++++++++++++--------- src/solver.cu | 103 ++++++++++------ src/utils.cu | 21 +++- test/test_infeasible_unbounded.py | 10 +- 7 files changed, 229 insertions(+), 103 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bd1a955..e06c649 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) endif() # Increase 
compatibility by compiling for all supported real and virtual architectures. -set(CMAKE_CUDA_ARCHITECTURES 60 70 75 80 86 89 90) +set(CMAKE_CUDA_ARCHITECTURES all) # Global Compile flags (corresponding to CFLAGS/NVCCFLAGS) add_compile_options(-fPIC -O3 -Wall -Wextra -g) diff --git a/internal/internal_types.h b/internal/internal_types.h index 177a293..ea90e65 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -77,6 +77,8 @@ typedef struct double *constraint_rescaling; double *variable_rescaling; + double constraint_bound_rescaling; + double objective_vector_rescaling; double *primal_slack; double *dual_slack; double rescaling_time_sec; @@ -129,5 +131,7 @@ typedef struct { double *con_rescale; double *var_rescale; + double con_bound_rescale; + double obj_vec_rescale; double rescaling_time_sec; } rescale_info_t; diff --git a/pyproject.toml b/pyproject.toml index 2a0a2c0..f7cacc8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "cupdlpx" -version = "0.1.3" +version = "0.1.4" description = "Python bindings for cuPDLPx (GPU-accelerated first-order LP solver)" readme = "README.md" license = { text = "Apache-2.0" } @@ -25,7 +25,7 @@ wheel.packages = ["python/cupdlpx"] sdist.include = ["tests/**", "pyproject.toml", "README.md", "LICENSE"] [tool.scikit-build.cmake.define] -CMAKE_CUDA_ARCHITECTURES = "75;80;86;89;90;90-virtual" +CMAKE_CUDA_ARCHITECTURES = "all" CMAKE_CUDA_STANDARD = "17" CUPDLPX_BUILD_PYTHON = "ON" diff --git a/src/preconditioner.cu b/src/preconditioner.cu index 1b9d8dd..80c77da 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -32,7 +32,7 @@ __global__ void scale_variables_kernel(double* __restrict__ c, double* __restrict__ primal_start, const double* __restrict__ D, const double* __restrict__ invD, - int n); + int n_vars); __global__ void scale_constraints_kernel(double* __restrict__ con_lb, double* __restrict__ con_ub, double* __restrict__ con_lb_finite, @@ -40,7 +40,7 @@ __global__ void scale_constraints_kernel(double* __restrict__ con_lb, double* __restrict__ dual_start, const double* __restrict__ E, const double* __restrict__ invE, - int m); + int n_cons); __global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, const int* __restrict__ col_ind, double* __restrict__ A_vals, @@ -61,25 +61,38 @@ __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, __global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, double* __restrict__ cum, - int n); -__global__ void reduce_bound_norm_sq_atomic( + int n_vars); +__global__ void reduce_bound_norm_sq_kernel( const double* __restrict__ L, const double* __restrict__ U, - int m, - double* __restrict__ out_sum); -__global__ void fill_const_and_accum_kernel(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - double val, - int n); + int n_cons, + double* __restrict__ block_sums); +__global__ void scale_bounds_kernel( + double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + int n_cons, + double con_scale, + double obj_scale); +__global__ void scale_obj_kernel( + double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* __restrict__ var_ub_finite, + double* __restrict__ primal_start, + int n_vars, + double con_scale, + double obj_scale); +__global__ void 
fill_ones_kernel(double* __restrict__ x, int n_vars); static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); -__global__ void fill_ones_kernel(double* __restrict__ x, int n); static void pock_chambolle_rescaling(pdhg_solver_state_t *state, double alpha, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); -static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info, - double *E, double *D, double *invE, double *invD); +static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info); static void scale_problem( pdhg_solver_state_t *state, @@ -191,7 +204,7 @@ static void pock_chambolle_rescaling( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, n_vars, - alpha, + 2.0 - alpha, D); clamp_sqrt_and_accum<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>( D, @@ -205,12 +218,7 @@ static void pock_chambolle_rescaling( static void bound_objective_rescaling( pdhg_solver_state_t *state, - rescale_info_t *rescale_info, - double *E, - double *D, - double *invE, - double *invD - ) + rescale_info_t *rescale_info) { const int n_cons = state->num_constraints; const int n_vars = state->num_variables; @@ -218,7 +226,7 @@ static void bound_objective_rescaling( double *bnd_norm_sq_cuda = nullptr; CUDA_CHECK(cudaMalloc(&bnd_norm_sq_cuda, sizeof(double))); CUDA_CHECK(cudaMemset(bnd_norm_sq_cuda, 0, sizeof(double))); - reduce_bound_norm_sq_atomic<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>( + reduce_bound_norm_sq_kernel<<<1, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(double)>>>( state->constraint_lower_bound, state->constraint_upper_bound, n_cons, @@ -235,14 +243,32 @@ static void bound_objective_rescaling( state->objective_vector, 1, &obj_norm)); - const double E_const = bnd_norm + 1.0; - const double D_const = obj_norm + 1.0; - fill_const_and_accum_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>( - E, invE, rescale_info->con_rescale, E_const, n_cons); - fill_const_and_accum_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>( - D, invD, rescale_info->var_rescale, D_const, n_vars); + double con_scale = 1.0 / (bnd_norm + 1.0); + double obj_scale = 1.0 / (obj_norm + 1.0); - scale_problem(state, E, D, invE, invD); + scale_bounds_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_lower_bound, + state->constraint_upper_bound, + state->constraint_lower_bound_finite_val, + state->constraint_upper_bound_finite_val, + state->initial_dual_solution, + n_cons, + con_scale, + obj_scale); + + scale_obj_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>( + state->objective_vector, + state->variable_lower_bound, + state->variable_upper_bound, + state->variable_lower_bound_finite_val, + state->variable_upper_bound_finite_val, + state->initial_primal_solution, + n_vars, + con_scale, + obj_scale); + + rescale_info->con_bound_rescale = con_scale; + rescale_info->obj_vec_rescale = obj_scale; } rescale_info_t *rescale_problem( @@ -279,12 +305,15 @@ rescale_info_t *rescale_problem( printf("[Precondition] Pock-Chambolle scaling (alpha=%.4f)\n", params->pock_chambolle_alpha); pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, E, D, invE, invD); } + + rescale_info->con_bound_rescale = 1.0; + rescale_info->obj_vec_rescale = 1.0; if (params->bound_objective_rescaling) { printf("[Precondition] Bound-objective 
scaling\n"); - bound_objective_rescaling(state, rescale_info, E, D, invE, invD); + bound_objective_rescaling(state, rescale_info); } - + rescale_info->rescaling_time_sec = (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; CUDA_CHECK(cudaFree(E)); @@ -303,10 +332,10 @@ __global__ void scale_variables_kernel(double* __restrict__ c, double* __restrict__ primal_start, const double* __restrict__ D, const double* __restrict__ invD, - int n) + int n_vars) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n) return; + if (j >= n_vars) return; double dj = D[j]; double inv_dj = invD[j]; c[j] *= inv_dj; @@ -324,10 +353,10 @@ __global__ void scale_constraints_kernel(double* __restrict__ con_lb, double* __restrict__ dual_start, const double* __restrict__ E, const double* __restrict__ invE, - int m) + int n_cons) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= m) return; + if (i >= n_cons) return; double inv_ei = invE[i]; double ei = E[i]; con_lb[i] *= inv_ei; @@ -403,9 +432,9 @@ __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, __global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, double* __restrict__ cum, - int n) + int n_vars) { - for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n; t += blockDim.x * gridDim.x) + for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n_vars; t += blockDim.x * gridDim.x) { double v = x[t]; double s = (v < SCALING_EPSILON) ? 1.0 : sqrt(v); @@ -415,38 +444,87 @@ __global__ void clamp_sqrt_and_accum(double* __restrict__ x, } } -__global__ void reduce_bound_norm_sq_atomic( +__global__ void reduce_bound_norm_sq_kernel( const double* __restrict__ L, const double* __restrict__ U, - int m, - double* __restrict__ out_sum) + int n_cons, + double* __restrict__ block_sums) { + extern __shared__ double sdata[]; + int tid = threadIdx.x; + int global_tid = blockIdx.x * blockDim.x + tid; + int stride = blockDim.x * gridDim.x; + double acc = 0.0; - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) { + for (int i = global_tid; i < n_cons; i += stride) { double Li = L[i], Ui = U[i]; bool fL = isfinite(Li), fU = isfinite(Ui); - if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) acc += Li * Li; - if (fU) acc += Ui * Ui; + + if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) { + acc += Li * Li; + } + if (fU) { + acc += Ui * Ui; + } + } + + sdata[tid] = acc; + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata[tid] += sdata[tid + s]; + } + __syncthreads(); + } + + if (tid == 0) { + block_sums[blockIdx.x] = sdata[0]; } - atomicAdd(out_sum, acc); } -__global__ void fill_const_and_accum_kernel(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - double val, - int n) +__global__ void scale_bounds_kernel( + double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + int n_cons, + double con_scale, + double obj_scale) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; + if (i >= n_cons) return; + con_lb[i] *= con_scale; + con_ub[i] *= con_scale; + con_lb_finite[i] *= con_scale; + con_ub_finite[i] *= con_scale; + dual_start[i] *= obj_scale; +} - x[i] = val; - inv_x[i] = 1.0 / val; - cum[i] *= val; +__global__ void scale_obj_kernel( + double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* 
__restrict__ var_ub_finite, + double* __restrict__ primal_start, + int n_vars, + double con_scale, + double obj_scale) +{ + int j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n_vars) return; + var_lb[j] *= con_scale; + var_ub[j] *= con_scale; + var_lb_finite[j] *= con_scale; + var_ub_finite[j] *= con_scale; + c[j] *= obj_scale; + primal_start[j] *= con_scale; } -__global__ void fill_ones_kernel(double* __restrict__ x, int n) +__global__ void fill_ones_kernel(double* __restrict__ x, int n_vars) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) x[i] = 1.0; + if (i < n_vars) x[i] = 1.0; } diff --git a/src/solver.cu b/src/solver.cu index 130fd75..fc8fcd1 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -36,33 +36,64 @@ __global__ void build_transpose_pos(const int* __restrict__ A_row_ind, const int* __restrict__ At_col_ind, int nnz, int* __restrict__ A_to_At); -__global__ void compute_next_pdhg_primal_solution_kernel( - const double *current_primal, double *reflected_primal, const double *dual_product, - const double *objective, const double *var_lb, const double *var_ub, - int n, double step_size); -__global__ void compute_next_pdhg_primal_solution_major_kernel( - const double *current_primal, double *pdhg_primal, double *reflected_primal, - const double *dual_product, const double *objective, const double *var_lb, - const double *var_ub, int n, double step_size, double *dual_slack); -__global__ void compute_next_pdhg_dual_solution_kernel( - const double *current_dual, double *reflected_dual, const double *primal_product, - const double *const_lb, const double *const_ub, int n, double step_size); -__global__ void compute_next_pdhg_dual_solution_major_kernel( - const double *current_dual, double *pdhg_dual, double *reflected_dual, - const double *primal_product, const double *const_lb, const double *const_ub, - int n, double step_size); -__global__ void halpern_update_kernel( - const double *initial_primal, double *current_primal, const double *reflected_primal, - const double *initial_dual, double *current_dual, const double *reflected_dual, - int n_vars, int n_cons, double weight, double reflection_coeff); -__global__ void rescale_solution_kernel( - double *primal_solution, double *dual_solution, - const double *variable_rescaling, const double *constraint_rescaling, - int n_vars, int n_cons); -__global__ void compute_delta_solution_kernel( - const double *initial_primal, const double *pdhg_primal, double *delta_primal, - const double *initial_dual, const double *pdhg_dual, double *delta_dual, - int n_vars, int n_cons); +__global__ void compute_next_pdhg_primal_solution_kernel(const double *current_primal, + double *reflected_primal, + const double *dual_product, + const double *objective, + const double *var_lb, + const double *var_ub, + int n, + double step_size); +__global__ void compute_next_pdhg_primal_solution_major_kernel(const double *current_primal, + double *pdhg_primal, + double *reflected_primal, + const double *dual_product, + const double *objective, + const double *var_lb, + const double *var_ub, + int n, + double step_size, + double *dual_slack); +__global__ void compute_next_pdhg_dual_solution_kernel(const double *current_dual, + double *reflected_dual, + const double *primal_product, + const double *const_lb, + const double *const_ub, + int n, + double step_size); +__global__ void compute_next_pdhg_dual_solution_major_kernel(const double *current_dual, + double *pdhg_dual, + double *reflected_dual, + const double *primal_product, + const double 
*const_lb, + const double *const_ub, + int n, + double step_size); +__global__ void halpern_update_kernel(const double *initial_primal, + double *current_primal, + const double *reflected_primal, + const double *initial_dual, + double *current_dual, + const double *reflected_dual, + int n_vars, + int n_cons, + double weight, + double reflection_coeff); +__global__ void rescale_solution_kernel(double *primal_solution, + double *dual_solution, + const double *variable_rescaling, + const double *constraint_rescaling, + const double objective_vector_rescaling, + const double constraint_bound_rescaling, + int n_vars, int n_cons); +__global__ void compute_delta_solution_kernel(const double *initial_primal, + const double *pdhg_primal, + double *delta_primal, + const double *initial_dual, + const double *pdhg_dual, + double *delta_dual, + int n_vars, + int n_cons); static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state); static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state); static void halpern_update(pdhg_solver_state_t *state, double reflection_coefficient); @@ -81,7 +112,7 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, { print_initial_info(params, original_problem); pdhg_solver_state_t *state = initialize_solver_state(original_problem, params); - + initialize_step_size_and_primal_weight(state, params); clock_t start_time = clock(); bool do_restart = false; @@ -291,6 +322,8 @@ static pdhg_solver_state_t *initialize_solver_state( state->constraint_rescaling = rescale_info->con_rescale; state->variable_rescaling = rescale_info->var_rescale; + state->constraint_bound_rescaling = rescale_info->con_bound_rescale; + state->objective_vector_rescaling = rescale_info->obj_vec_rescale; state->rescaling_time_sec = rescale_info->rescaling_time_sec; rescale_info->con_rescale = NULL; @@ -305,7 +338,6 @@ static pdhg_solver_state_t *initialize_solver_state( state->constraint_matrix->transpose_pos = NULL; double sum_of_squares = 0.0; - for (int i = 0; i < n_vars; ++i) { sum_of_squares += original_problem->objective_vector[i] * original_problem->objective_vector[i]; @@ -313,17 +345,14 @@ static pdhg_solver_state_t *initialize_solver_state( state->objective_vector_norm = sqrt(sum_of_squares); sum_of_squares = 0.0; - for (int i = 0; i < n_cons; ++i) { double lower = original_problem->constraint_lower_bound[i]; double upper = original_problem->constraint_upper_bound[i]; - if (isfinite(lower) && (lower != upper)) { sum_of_squares += lower * lower; } - if (isfinite(upper)) { sum_of_squares += upper * upper; @@ -350,7 +379,7 @@ static pdhg_solver_state_t *initialize_solver_state( CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F)); CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F)); CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size)); - + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size)); CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size)); CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -512,18 
+541,21 @@ __global__ void rescale_solution_kernel(double *primal_solution, double *dual_solution, const double *variable_rescaling, const double *constraint_rescaling, + const double objective_vector_rescaling, + const double constraint_bound_rescaling, int n_vars, int n_cons) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n_vars) { primal_solution[i] = - primal_solution[i] / variable_rescaling[i]; + primal_solution[i] / variable_rescaling[i] / constraint_bound_rescaling; } else if (i < n_vars + n_cons) { int idx = i - n_vars; - dual_solution[idx] = dual_solution[idx] / constraint_rescaling[idx]; + dual_solution[idx] = dual_solution[idx] / constraint_rescaling[idx] / + objective_vector_rescaling; } } @@ -633,6 +665,7 @@ static void rescale_solution(pdhg_solver_state_t *state) rescale_solution_kernel<<num_blocks_primal_dual, THREADS_PER_BLOCK>>>( state->pdhg_primal_solution, state->pdhg_dual_solution, state->variable_rescaling, state->constraint_rescaling, + state->objective_vector_rescaling, state->constraint_bound_rescaling, state->num_variables, state->num_constraints); } diff --git a/src/utils.cu b/src/utils.cu index 3212c66..199c555 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -596,14 +596,19 @@ void compute_residual(pdhg_solver_state_t *state) CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_constraints, state->primal_residual, 1, &state->absolute_primal_residual)); + state->absolute_primal_residual /= state->constraint_bound_rescaling; CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_variables, state->dual_residual, 1, &state->absolute_dual_residual)); + state->absolute_dual_residual /= state->objective_vector_rescaling; CUBLAS_CHECK(cublasDdot( state->blas_handle, state->num_variables, state->objective_vector, 1, state->pdhg_primal_solution, 1, &state->primal_objective_value)); - state->primal_objective_value = state->primal_objective_value + state->objective_constant; + state->primal_objective_value = + state->primal_objective_value / (state->constraint_bound_rescaling * + state->objective_vector_rescaling) + + state->objective_constant; double base_dual_objective; CUBLAS_CHECK(cublasDdot(state->blas_handle, state->num_variables, @@ -612,7 +617,10 @@ void compute_residual(pdhg_solver_state_t *state) double dual_slack_sum = get_vector_sum(state->blas_handle, state->num_constraints, state->ones_dual_d, state->primal_slack); - state->dual_objective_value = base_dual_objective + dual_slack_sum + state->objective_constant; + state->dual_objective_value = (base_dual_objective + dual_slack_sum) / + (state->constraint_bound_rescaling * + state->objective_vector_rescaling) + + state->objective_constant; state->relative_primal_residual = state->absolute_primal_residual / (1.0 + state->constraint_bound_norm); @@ -669,6 +677,8 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) CUBLAS_CHECK(cublasDdot( state->blas_handle, state->num_variables, state->objective_vector, 1, state->delta_primal_solution, 1, &state->primal_ray_linear_objective)); + state->primal_ray_linear_objective /= + (state->constraint_bound_rescaling * state->objective_vector_rescaling); dual_solution_dual_objective_contribution_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( @@ -688,7 +698,9 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) double sum_dual_slack = get_vector_sum(state->blas_handle, state->num_variables, state->ones_primal_d, state->dual_slack); - state->dual_ray_objective = sum_primal_slack + sum_dual_slack; + 
state->dual_ray_objective = + (sum_primal_slack + sum_dual_slack) / + (state->constraint_bound_rescaling * state->objective_vector_rescaling); compute_primal_infeasibility_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( @@ -948,5 +960,4 @@ int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, free(next); *nnz_out = nnz; return 0; -} - +} \ No newline at end of file diff --git a/test/test_infeasible_unbounded.py b/test/test_infeasible_unbounded.py index 4388cc4..d09d9a9 100644 --- a/test/test_infeasible_unbounded.py +++ b/test/test_infeasible_unbounded.py @@ -63,13 +63,13 @@ def test_infeasible_lp(base_lp_data, atol): # turn off output model.setParams(OutputFlag=False) # optimize - #model.optimize() + model.optimize() # check status - #assert hasattr(model, "Status"), "Model.Status not exposed." - #assert model.Status == "PRIMAL_INFEASIBLE", f"Unexpected termination status: {model.Status}" - #assert model.StatusCode == PDLP.PRIMAL_INFEASIBLE, f"Unexpected termination status code: {model.StatusCode}" + assert hasattr(model, "Status"), "Model.Status not exposed." + assert model.Status == "PRIMAL_INFEASIBLE", f"Unexpected termination status: {model.Status}" + assert model.StatusCode == PDLP.PRIMAL_INFEASIBLE, f"Unexpected termination status code: {model.StatusCode}" # check dual ray - #assert model.DualRayObj > atol, f"DualRayObj should be positive for dual infeasible, got {model.DualRayObj}" + assert model.DualRayObj > atol, f"DualRayObj should be positive for dual infeasible, got {model.DualRayObj}" def test_unbounded_lp(base_lp_data, atol): From 1989b6bb30a9a1a798a03673e74ebcc2f77b61cc Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Sun, 16 Nov 2025 00:40:11 -0500 Subject: [PATCH 06/13] New feat: record precondition time --- src/cli.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cli.c b/src/cli.c index 72aa2c9..97e2943 100644 --- a/src/cli.c +++ b/src/cli.c @@ -121,6 +121,7 @@ void save_solver_summary(const cupdlpx_result_t *result, const char *output_dir, } fprintf(outfile, "Termination Reason: %s\n", termination_reason_tToString(result->termination_reason)); + fprintf(outfile, "Precondition Time (sec): %e\n", result->rescaling_time_sec); fprintf(outfile, "Runtime (sec): %e\n", result->cumulative_time_sec); fprintf(outfile, "Iterations Count: %d\n", result->total_count); fprintf(outfile, "Primal Objective Value: %e\n", From 32889ca694082e9a48e7209f5f68a58081770876 Mon Sep 17 00:00:00 2001 From: bolucastang Date: Fri, 21 Nov 2025 11:29:15 -0500 Subject: [PATCH 07/13] feat: add logging for precondition time --- src/utils.cu | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/utils.cu b/src/utils.cu index 96ae8d2..e660fd1 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -370,13 +370,14 @@ void pdhg_final_log(const pdhg_solver_state_t *state, bool verbose, "--------------------\n"); } printf("Solution Summary\n"); - printf(" Status : %s\n", termination_reason_to_string(reason)); - printf(" Iterations : %d\n", state->total_count - 1); - printf(" Solve time : %.3g sec\n", state->cumulative_time_sec); - printf(" Primal obj : %.10g\n", state->primal_objective_value); - printf(" Dual obj : %.10g\n", state->dual_objective_value); - printf(" Primal infeas : %.3e\n", state->relative_primal_residual); - printf(" Dual infeas : %.3e\n", state->relative_dual_residual); + printf(" Status : %s\n", termination_reason_to_string(reason)); + printf(" Precondition time : %.5g sec\n", state->rescaling_time_sec); + printf(" Iterations : %d\n", 
state->total_count - 1); + printf(" Solve time : %.3g sec\n", state->cumulative_time_sec); + printf(" Primal obj : %.10g\n", state->primal_objective_value); + printf(" Dual obj : %.10g\n", state->dual_objective_value); + printf(" Primal infeas : %.3e\n", state->relative_primal_residual); + printf(" Dual infeas : %.3e\n", state->relative_dual_residual); } void display_iteration_stats(const pdhg_solver_state_t *state, bool verbose) @@ -1010,7 +1011,7 @@ void print_initial_feas_polish_info(bool is_primal_polish, const pdhg_parameters printf("---------------------------------------------------------------------------------------\n"); if (is_primal_polish) printf("%s %s | %s | %s | %s \n", " iter", " time ", "pr obj", " abs pr res ", " rel pr res "); // else printf("%s %s | %s | %s \n", " iter", " time ", " abs du res ", " rel du res "); - else printf("%s %s | %s | %s | %s \n", " iter", " time ", "du obj", " abs du res ", " rel du res "); + else printf("%s %s | %s | %s | %s \n", " iter", " time ", "du obj", " abs du res ", " rel du res "); printf("---------------------------------------------------------------------------------------\n"); } @@ -1134,7 +1135,7 @@ void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_so CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer)); compute_dual_feas_polish_residual_kerenl<<num_blocks_primal_dual, THREADS_PER_BLOCK>>>( - state->dual_residual, + state->dual_residual, state->pdhg_dual_solution, state->dual_product, state->dual_slack, state->objective_vector, From e3fd2945ab2436376223f769345a92bb78220b16 Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Tue, 25 Nov 2025 22:44:22 -0500 Subject: [PATCH 08/13] Bug fixed: numerical issue --- src/preconditioner.cu | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/preconditioner.cu b/src/preconditioner.cu index 80c77da..c042175 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -22,7 +22,7 @@ limitations under the License. 
#include #include -#define SCALING_EPSILON 1e-12 +#define SCALING_EPSILON 1e-8 __global__ void scale_variables_kernel(double* __restrict__ c, double* __restrict__ var_lb, @@ -60,7 +60,7 @@ __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, double* __restrict__ out_sum); __global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, - double* __restrict__ cum, + double* __restrict__ cum, int n_vars); __global__ void reduce_bound_norm_sq_kernel( const double* __restrict__ L, @@ -199,7 +199,7 @@ static void pock_chambolle_rescaling( invE, rescale_info->con_rescale, n_cons); - + csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, @@ -231,7 +231,7 @@ static void bound_objective_rescaling( state->constraint_upper_bound, n_cons, bnd_norm_sq_cuda); - + double bnd_norm_sq = 0.0; CUDA_CHECK(cudaMemcpy(&bnd_norm_sq, bnd_norm_sq_cuda, sizeof(double), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaFree(bnd_norm_sq_cuda)); @@ -276,7 +276,7 @@ rescale_info_t *rescale_problem( pdhg_solver_state_t *state) { printf("[Precondition] Start\n"); - + int n_vars = state->num_variables; int n_cons = state->num_constraints; @@ -313,14 +313,14 @@ rescale_info_t *rescale_problem( printf("[Precondition] Bound-objective scaling\n"); bound_objective_rescaling(state, rescale_info); } - + rescale_info->rescaling_time_sec = (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; CUDA_CHECK(cudaFree(E)); CUDA_CHECK(cudaFree(D)); CUDA_CHECK(cudaFree(invE)); CUDA_CHECK(cudaFree(invD)); - + return rescale_info; } @@ -376,7 +376,7 @@ __global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, int nnz) { for (int k = blockIdx.x * blockDim.x + threadIdx.x; - k < nnz; + k < nnz; k += gridDim.x * blockDim.x) { int i = row_ids[k]; @@ -429,16 +429,16 @@ __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, out_sum[i] = acc; } -__global__ void clamp_sqrt_and_accum(double* __restrict__ x, +__global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, - double* __restrict__ cum, - int n_vars) + double* __restrict__ cum, + int n_vars) { for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n_vars; t += blockDim.x * gridDim.x) { - double v = x[t]; - double s = (v < SCALING_EPSILON) ? 1.0 : sqrt(v); - cum[t] *= s; + double v = x[t]; + double s = sqrt(fmax(v, SCALING_EPSILON)); + cum[t] *= s; x[t] = s; inv_x[t] = 1.0 / s; } From 7381654f91662ba05a173a05fdfcfea4e2c031ff Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Thu, 27 Nov 2025 14:38:50 -0500 Subject: [PATCH 09/13] Clean code: clang-format and align naming style --- internal/internal_types.h | 2 +- src/preconditioner.cu | 343 ++++++++++++++++++++------------------ src/solver.cu | 282 ++++++++++++++++--------------- 3 files changed, 327 insertions(+), 300 deletions(-) diff --git a/internal/internal_types.h b/internal/internal_types.h index 3ce8c23..aa69005 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -30,7 +30,7 @@ typedef struct int *col_ind; int *row_ind; double *val; - int *transpose_pos; + int *transpose_map; } cu_sparse_matrix_csr_t; typedef struct diff --git a/src/preconditioner.cu b/src/preconditioner.cu index c042175..031051f 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -24,73 +24,73 @@ limitations under the License. 
#define SCALING_EPSILON 1e-8 -__global__ void scale_variables_kernel(double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, - const double* __restrict__ D, - const double* __restrict__ invD, +__global__ void scale_variables_kernel(double *__restrict__ c, + double *__restrict__ var_lb, + double *__restrict__ var_ub, + double *__restrict__ var_lb_finite, + double *__restrict__ var_ub_finite, + double *__restrict__ primal_start, + const double *__restrict__ D, + const double *__restrict__ invD, int n_vars); -__global__ void scale_constraints_kernel(double* __restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, - const double* __restrict__ E, - const double* __restrict__ invE, +__global__ void scale_constraints_kernel(double *__restrict__ con_lb, + double *__restrict__ con_ub, + double *__restrict__ con_lb_finite, + double *__restrict__ con_ub_finite, + double *__restrict__ dual_start, + const double *__restrict__ E, + const double *__restrict__ invE, int n_cons); -__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, - const int* __restrict__ col_ind, - double* __restrict__ A_vals, - double* __restrict__ At_vals, - const int* __restrict__ A_to_At, - const double* __restrict__ invD, - const double* __restrict__ invE, +__global__ void scale_csr_nnz_kernel(const int *__restrict__ row_ids, + const int *__restrict__ col_ind, + double *__restrict__ A_vals, + double *__restrict__ At_vals, + const int *__restrict__ A_to_At, + const double *__restrict__ invD, + const double *__restrict__ invE, int nnz); -__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double* __restrict__ out_max); -__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double degree, - double* __restrict__ out_sum); -__global__ void clamp_sqrt_and_accum(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - int n_vars); +__global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ vals, + int num_rows, + double *__restrict__ out_max); +__global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ vals, + int num_rows, + double degree, + double *__restrict__ out_sum); +__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ x, + double *__restrict__ inv_x, + double *__restrict__ cum, + int n_vars); __global__ void reduce_bound_norm_sq_kernel( - const double* __restrict__ L, - const double* __restrict__ U, + const double *__restrict__ L, + const double *__restrict__ U, int n_cons, - double* __restrict__ block_sums); + double *__restrict__ block_sums); __global__ void scale_bounds_kernel( - double* __restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, + double *__restrict__ con_lb, + double *__restrict__ con_ub, + double *__restrict__ con_lb_finite, + double *__restrict__ con_ub_finite, + double *__restrict__ dual_start, int n_cons, double con_scale, double obj_scale); -__global__ void scale_obj_kernel( - double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - 
double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, +__global__ void scale_objective_kernel( + double *__restrict__ c, + double *__restrict__ var_lb, + double *__restrict__ var_ub, + double *__restrict__ var_lb_finite, + double *__restrict__ var_ub_finite, + double *__restrict__ primal_start, int n_vars, double con_scale, double obj_scale); -__global__ void fill_ones_kernel(double* __restrict__ x, int n_vars); +__global__ void fill_ones_kernel(double *__restrict__ x, int n_vars); static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); -static void pock_chambolle_rescaling(pdhg_solver_state_t *state, double alpha, rescale_info_t *rescale_info, +static void pock_chambolle_rescaling(pdhg_solver_state_t *state, const double alpha, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info); @@ -125,12 +125,12 @@ static void scale_problem( invE, n_cons); - csr_scale_nnz_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( + scale_csr_nnz_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ind, state->constraint_matrix->col_ind, state->constraint_matrix->val, state->constraint_matrix_t->val, - state->constraint_matrix->transpose_pos, + state->constraint_matrix->transpose_map, invD, invE, state->constraint_matrix->num_nonzeros); @@ -150,23 +150,23 @@ static void ruiz_rescaling( for (int iter = 0; iter < num_iterations; ++iter) { - csr_row_absmax_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + compute_csr_row_absmax_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, state->constraint_matrix->val, n_cons, E); - clamp_sqrt_and_accum<<num_blocks_dual, THREADS_PER_BLOCK>>>( + clamp_sqrt_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( E, invE, rescale_info->con_rescale, n_cons); - csr_row_absmax_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + compute_csr_row_absmax_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, n_vars, D); - clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( + clamp_sqrt_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( D, invD, rescale_info->var_rescale, @@ -188,32 +188,31 @@ static void pock_chambolle_rescaling( const int n_cons = state->num_constraints; const int n_vars = state->num_variables; - csr_row_powsum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + compute_csr_row_powsum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, state->constraint_matrix->val, n_cons, alpha, E); - clamp_sqrt_and_accum<<num_blocks_dual, THREADS_PER_BLOCK>>>( + clamp_sqrt_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( E, invE, rescale_info->con_rescale, n_cons); - csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + compute_csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, n_vars, 2.0 - alpha, D); - clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( + clamp_sqrt_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( D, invD, rescale_info->var_rescale, n_vars); scale_problem(state, E, D, invE, invD); - } static void 
bound_objective_rescaling( @@ -223,19 +222,19 @@ static void bound_objective_rescaling( const int n_cons = state->num_constraints; const int n_vars = state->num_variables; - double *bnd_norm_sq_cuda = nullptr; - CUDA_CHECK(cudaMalloc(&bnd_norm_sq_cuda, sizeof(double))); - CUDA_CHECK(cudaMemset(bnd_norm_sq_cuda, 0, sizeof(double))); + double *bnd_norm_sq_d = NULL; + CUDA_CHECK(cudaMalloc(&bnd_norm_sq_d, sizeof(double))); + CUDA_CHECK(cudaMemset(bnd_norm_sq_d, 0, sizeof(double))); reduce_bound_norm_sq_kernel<<<1, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(double)>>>( state->constraint_lower_bound, state->constraint_upper_bound, n_cons, - bnd_norm_sq_cuda); + bnd_norm_sq_d); - double bnd_norm_sq = 0.0; - CUDA_CHECK(cudaMemcpy(&bnd_norm_sq, bnd_norm_sq_cuda, sizeof(double), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(bnd_norm_sq_cuda)); - double bnd_norm = sqrt(bnd_norm_sq); + double bnd_norm_sq_h = 0.0; + CUDA_CHECK(cudaMemcpy(&bnd_norm_sq_h, bnd_norm_sq_d, sizeof(double), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaFree(bnd_norm_sq_d)); + double bnd_norm = sqrt(bnd_norm_sq_h); double obj_norm = 0.0; CUBLAS_CHECK(cublasDnrm2(state->blas_handle, @@ -256,7 +255,7 @@ static void bound_objective_rescaling( con_scale, obj_scale); - scale_obj_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + scale_objective_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->objective_vector, state->variable_lower_bound, state->variable_upper_bound, @@ -268,7 +267,7 @@ static void bound_objective_rescaling( obj_scale); rescale_info->con_bound_rescale = con_scale; - rescale_info->obj_vec_rescale = obj_scale; + rescale_info->obj_vec_rescale = obj_scale; } rescale_info_t *rescale_problem( @@ -282,18 +281,18 @@ rescale_info_t *rescale_problem( clock_t start_rescaling = clock(); rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); - CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons*sizeof(double))); - CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars*sizeof(double))); + CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons * sizeof(double))); + CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars * sizeof(double))); fill_ones_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( rescale_info->con_rescale, n_cons); fill_ones_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( rescale_info->var_rescale, n_vars); - double *E=nullptr, *D=nullptr, *invE=nullptr, *invD=nullptr; - CUDA_CHECK(cudaMalloc(&E, n_cons*sizeof(double))); - CUDA_CHECK(cudaMalloc(&D, n_vars*sizeof(double))); - CUDA_CHECK(cudaMalloc(&invE, n_cons*sizeof(double))); - CUDA_CHECK(cudaMalloc(&invD, n_vars*sizeof(double))); + double *E = NULL, *D = NULL, *invE = NULL, *invD = NULL; + CUDA_CHECK(cudaMalloc(&E, n_cons * sizeof(double))); + CUDA_CHECK(cudaMalloc(&D, n_vars * sizeof(double))); + CUDA_CHECK(cudaMalloc(&invE, n_cons * sizeof(double))); + CUDA_CHECK(cudaMalloc(&invD, n_vars * sizeof(double))); if (params->l_inf_ruiz_iterations > 0) { @@ -324,21 +323,22 @@ rescale_info_t *rescale_problem( return rescale_info; } -__global__ void scale_variables_kernel(double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, - const double* __restrict__ D, - const double* __restrict__ invD, +__global__ void scale_variables_kernel(double *__restrict__ c, + double *__restrict__ var_lb, + double *__restrict__ var_ub, + double *__restrict__ var_lb_finite, + double 
*__restrict__ var_ub_finite, + double *__restrict__ primal_start, + const double *__restrict__ D, + const double *__restrict__ invD, int n_vars) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n_vars) return; + if (j >= n_vars) + return; double dj = D[j]; double inv_dj = invD[j]; - c[j] *= inv_dj; + c[j] *= inv_dj; var_lb[j] *= dj; var_ub[j] *= dj; var_lb_finite[j] *= dj; @@ -346,17 +346,18 @@ __global__ void scale_variables_kernel(double* __restrict__ c, primal_start[j] *= dj; } -__global__ void scale_constraints_kernel(double* __restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, - const double* __restrict__ E, - const double* __restrict__ invE, +__global__ void scale_constraints_kernel(double *__restrict__ con_lb, + double *__restrict__ con_ub, + double *__restrict__ con_lb_finite, + double *__restrict__ con_ub_finite, + double *__restrict__ dual_start, + const double *__restrict__ E, + const double *__restrict__ invE, int n_cons) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n_cons) return; + if (i >= n_cons) + return; double inv_ei = invE[i]; double ei = E[i]; con_lb[i] *= inv_ei; @@ -366,13 +367,13 @@ __global__ void scale_constraints_kernel(double* __restrict__ con_lb, dual_start[i] *= ei; } -__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, - const int* __restrict__ col_ind, - double* __restrict__ A_vals, - double* __restrict__ At_vals, - const int* __restrict__ A_to_At, - const double* __restrict__ invD, - const double* __restrict__ invE, +__global__ void scale_csr_nnz_kernel(const int *__restrict__ row_ids, + const int *__restrict__ col_ind, + double *__restrict__ A_vals, + double *__restrict__ At_vals, + const int *__restrict__ A_to_At, + const double *__restrict__ invD, + const double *__restrict__ invE, int nnz) { for (int k = blockIdx.x * blockDim.x + threadIdx.x; @@ -387,52 +388,63 @@ __global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, } } -__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double* __restrict__ out_max) +__global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ vals, + int num_rows, + double *__restrict__ out_max) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; + if (i >= num_rows) + return; int s = row_ptr[i], e = row_ptr[i + 1]; double m = 0.0; - for (int k = s; k < e; ++k) { + for (int k = s; k < e; ++k) + { double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - if (v > m) m = v; + if (!isfinite(v)) + v = 0.0; + if (v > m) + m = v; } out_max[i] = m; } -__device__ __forceinline__ double pow_fast(double v, double p) { - if (p == 2.0) return v * v; - if (p == 1.0) return v; - if (p == 0.5) return sqrt(v); +__device__ __forceinline__ double pow_fast(double v, double p) +{ + if (p == 2.0) + return v * v; + if (p == 1.0) + return v; + if (p == 0.5) + return sqrt(v); return pow(v, p); } -__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double degree, - double* __restrict__ out_sum) +__global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ vals, + int num_rows, + double degree, + double *__restrict__ out_sum) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; + if (i >= num_rows) + return; 
int s = row_ptr[i], e = row_ptr[i + 1]; double acc = 0.0; - for (int k = s; k < e; ++k) { + for (int k = s; k < e; ++k) + { double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; + if (!isfinite(v)) + v = 0.0; acc += pow_fast(v, degree); } out_sum[i] = acc; } -__global__ void clamp_sqrt_and_accum(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - int n_vars) +__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ x, + double *__restrict__ inv_x, + double *__restrict__ cum, + int n_vars) { for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n_vars; t += blockDim.x * gridDim.x) { @@ -445,10 +457,10 @@ __global__ void clamp_sqrt_and_accum(double* __restrict__ x, } __global__ void reduce_bound_norm_sq_kernel( - const double* __restrict__ L, - const double* __restrict__ U, + const double *__restrict__ L, + const double *__restrict__ U, int n_cons, - double* __restrict__ block_sums) + double *__restrict__ block_sums) { extern __shared__ double sdata[]; int tid = threadIdx.x; @@ -456,14 +468,17 @@ __global__ void reduce_bound_norm_sq_kernel( int stride = blockDim.x * gridDim.x; double acc = 0.0; - for (int i = global_tid; i < n_cons; i += stride) { + for (int i = global_tid; i < n_cons; i += stride) + { double Li = L[i], Ui = U[i]; bool fL = isfinite(Li), fU = isfinite(Ui); - if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) { + if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) + { acc += Li * Li; } - if (fU) { + if (fU) + { acc += Ui * Ui; } } @@ -471,60 +486,66 @@ __global__ void reduce_bound_norm_sq_kernel( sdata[tid] = acc; __syncthreads(); - for (int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s) { + for (int s = blockDim.x / 2; s > 0; s >>= 1) + { + if (tid < s) + { sdata[tid] += sdata[tid + s]; } __syncthreads(); } - if (tid == 0) { + if (tid == 0) + { block_sums[blockIdx.x] = sdata[0]; } } __global__ void scale_bounds_kernel( - double* __restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, + double *__restrict__ con_lb, + double *__restrict__ con_ub, + double *__restrict__ con_lb_finite, + double *__restrict__ con_ub_finite, + double *__restrict__ dual_start, int n_cons, double con_scale, double obj_scale) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n_cons) return; - con_lb[i] *= con_scale; - con_ub[i] *= con_scale; + if (i >= n_cons) + return; + con_lb[i] *= con_scale; + con_ub[i] *= con_scale; con_lb_finite[i] *= con_scale; con_ub_finite[i] *= con_scale; - dual_start[i] *= obj_scale; + dual_start[i] *= obj_scale; } -__global__ void scale_obj_kernel( - double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, +__global__ void scale_objective_kernel( + double *__restrict__ c, + double *__restrict__ var_lb, + double *__restrict__ var_ub, + double *__restrict__ var_lb_finite, + double *__restrict__ var_ub_finite, + double *__restrict__ primal_start, int n_vars, double con_scale, double obj_scale) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n_vars) return; - var_lb[j] *= con_scale; - var_ub[j] *= con_scale; + if (j >= n_vars) + return; + var_lb[j] *= con_scale; + var_ub[j] *= con_scale; var_lb_finite[j] *= con_scale; var_ub_finite[j] *= con_scale; - c[j] *= obj_scale; - primal_start[j] *= con_scale; + c[j] *= obj_scale; + primal_start[j] *= con_scale; } 
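// Editorial sketch (assumed helper, not part of this patch): how the two scalar
// factors set by bound_objective_rescaling combine. After scale_bounds_kernel and
// scale_objective_kernel run, the scaled problem uses roughly
//   c_hat = obj_scale * inv(D) * c   and   x_hat = con_scale * D * x,
// so objective-type quantities measured on the scaled problem equal
// con_scale * obj_scale times their values on the original problem. That is why
// the solver divides reported values such as dual_ray_objective by
// (constraint_bound_rescaling * objective_vector_rescaling). A hypothetical
// host-side helper that undoes the scalar part:
static inline double unscale_objective_value(double scaled_value,
                                             double constraint_bound_rescaling,
                                             double objective_vector_rescaling)
{
    return scaled_value / (constraint_bound_rescaling * objective_vector_rescaling);
}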
-__global__ void fill_ones_kernel(double* __restrict__ x, int n_vars) +__global__ void fill_ones_kernel(double *__restrict__ x, int n_vars) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_vars) x[i] = 1.0; + if (i < n_vars) + x[i] = 1.0; } diff --git a/src/solver.cu b/src/solver.cu index c40dfb3..c01d5b8 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -27,57 +27,57 @@ limitations under the License. #include #include -__global__ void build_row_ind(const int* __restrict__ row_ptr, +__global__ void build_row_ind(const int *__restrict__ row_ptr, int num_rows, - int* __restrict__ row_ind); -__global__ void build_transpose_pos(const int* __restrict__ A_row_ind, - const int* __restrict__ A_col_ind, - const int* __restrict__ At_row_ptr, - const int* __restrict__ At_col_ind, + int *__restrict__ row_ind); +__global__ void build_transpose_map(const int *__restrict__ A_row_ind, + const int *__restrict__ A_col_ind, + const int *__restrict__ At_row_ptr, + const int *__restrict__ At_col_ind, int nnz, - int* __restrict__ A_to_At); -__global__ void compute_next_pdhg_primal_solution_kernel(const double *current_primal, - double *reflected_primal, + int *__restrict__ A_to_At); +__global__ void compute_next_pdhg_primal_solution_kernel(const double *current_primal, + double *reflected_primal, const double *dual_product, - const double *objective, - const double *var_lb, + const double *objective, + const double *var_lb, const double *var_ub, - int n, + int n, double step_size); -__global__ void compute_next_pdhg_primal_solution_major_kernel(const double *current_primal, - double *pdhg_primal, +__global__ void compute_next_pdhg_primal_solution_major_kernel(const double *current_primal, + double *pdhg_primal, double *reflected_primal, - const double *dual_product, - const double *objective, + const double *dual_product, + const double *objective, const double *var_lb, - const double *var_ub, - int n, - double step_size, + const double *var_ub, + int n, + double step_size, double *dual_slack); -__global__ void compute_next_pdhg_dual_solution_kernel(const double *current_dual, - double *reflected_dual, +__global__ void compute_next_pdhg_dual_solution_kernel(const double *current_dual, + double *reflected_dual, const double *primal_product, - const double *const_lb, - const double *const_ub, - int n, + const double *const_lb, + const double *const_ub, + int n, double step_size); -__global__ void compute_next_pdhg_dual_solution_major_kernel(const double *current_dual, - double *pdhg_dual, +__global__ void compute_next_pdhg_dual_solution_major_kernel(const double *current_dual, + double *pdhg_dual, double *reflected_dual, - const double *primal_product, - const double *const_lb, + const double *primal_product, + const double *const_lb, const double *const_ub, - int n, + int n, double step_size); -__global__ void halpern_update_kernel(const double *initial_primal, - double *current_primal, +__global__ void halpern_update_kernel(const double *initial_primal, + double *current_primal, const double *reflected_primal, - const double *initial_dual, - double *current_dual, + const double *initial_dual, + double *current_dual, const double *reflected_dual, - int n_vars, - int n_cons, - double weight, + int n_vars, + int n_cons, + double weight, double reflection_coeff); __global__ void rescale_solution_kernel(double *primal_solution, double *dual_solution, @@ -86,13 +86,13 @@ __global__ void rescale_solution_kernel(double *primal_solution, const double objective_vector_rescaling, const double 
constraint_bound_rescaling, int n_vars, int n_cons); -__global__ void compute_delta_solution_kernel(const double *initial_primal, - const double *pdhg_primal, +__global__ void compute_delta_solution_kernel(const double *initial_primal, + const double *pdhg_primal, double *delta_primal, - const double *initial_dual, - const double *pdhg_dual, + const double *initial_dual, + const double *pdhg_dual, double *delta_dual, - int n_vars, + int n_vars, int n_cons); static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state); static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state); @@ -101,12 +101,12 @@ static void rescale_solution(pdhg_solver_state_t *state); static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state); static void perform_restart(pdhg_solver_state_t *state, const pdhg_parameters_t *params); static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, const pdhg_parameters_t *params); -static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *original_problem, const pdhg_parameters_t* params); +static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *original_problem, const pdhg_parameters_t *params); static void compute_fixed_point_error(pdhg_solver_state_t *state); void pdhg_solver_state_free(pdhg_solver_state_t *state); void rescale_info_free(rescale_info_t *info); -static void perform_primal_restart(pdhg_solver_state_t *state); +static void perform_primal_restart(pdhg_solver_state_t *state); static void perform_dual_restart(pdhg_solver_state_t *state); void primal_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state); void dual_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state); @@ -120,13 +120,12 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( static pdhg_solver_state_t *initialize_dual_feas_polish_state( const pdhg_solver_state_t *original_state); - cupdlpx_result_t *optimize(const pdhg_parameters_t *params, const lp_problem_t *original_problem) { print_initial_info(params, original_problem); pdhg_solver_state_t *state = initialize_solver_state(original_problem, params); - + initialize_step_size_and_primal_weight(state, params); clock_t start_time = clock(); bool do_restart = false; @@ -182,8 +181,8 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, pdhg_final_log(state, params->verbose, state->termination_reason); - if (params->feasibility_polishing && - state->termination_reason != TERMINATION_REASON_DUAL_INFEASIBLE && + if (params->feasibility_polishing && + state->termination_reason != TERMINATION_REASON_DUAL_INFEASIBLE && state->termination_reason != TERMINATION_REASON_PRIMAL_INFEASIBLE) { feasibility_polish(params, state); @@ -196,7 +195,7 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, static pdhg_solver_state_t *initialize_solver_state( const lp_problem_t *original_problem, - const pdhg_parameters_t* params) + const pdhg_parameters_t *params) { pdhg_solver_state_t *state = (pdhg_solver_state_t *)safe_calloc(1, sizeof(pdhg_solver_state_t)); @@ -227,7 +226,7 @@ static pdhg_solver_state_t *initialize_solver_state( state->num_blocks_dual = (state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; state->num_blocks_primal_dual = (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; state->num_blocks_nnz = (state->constraint_matrix->num_nonzeros + 
THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - + CUSPARSE_CHECK(cusparseCreate(&state->sparse_handle)); CUBLAS_CHECK(cublasCreate(&state->blas_handle)); CUBLAS_CHECK(cublasSetPointerMode(state->blas_handle, CUBLAS_POINTER_MODE_HOST)); @@ -273,15 +272,15 @@ static pdhg_solver_state_t *initialize_solver_state( state->constraint_matrix_t->row_ptr, n_vars, state->constraint_matrix_t->row_ind); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMalloc(&state->constraint_matrix->transpose_pos, nnz * sizeof(int))); - state->constraint_matrix_t->transpose_pos = NULL; - build_transpose_pos<<num_blocks_nnz, THREADS_PER_BLOCK>>>( + CUDA_CHECK(cudaMalloc(&state->constraint_matrix->transpose_map, nnz * sizeof(int))); + state->constraint_matrix_t->transpose_map = NULL; + build_transpose_map<<num_blocks_nnz, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ind, state->constraint_matrix->col_ind, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, nnz, - state->constraint_matrix->transpose_pos); + state->constraint_matrix->transpose_map); CUDA_CHECK(cudaGetLastError()); ALLOC_AND_COPY(state->variable_lower_bound, original_problem->variable_lower_bound, var_bytes); @@ -329,24 +328,24 @@ static pdhg_solver_state_t *initialize_solver_state( if (original_problem->primal_start) { - CUDA_CHECK(cudaMemcpy(state->initial_primal_solution, - original_problem->primal_start, - var_bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(state->initial_primal_solution, + original_problem->primal_start, + var_bytes, cudaMemcpyHostToDevice)); } if (original_problem->dual_start) { - CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, - original_problem->dual_start, - con_bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, + original_problem->dual_start, + con_bytes, cudaMemcpyHostToDevice)); } rescale_info_t *rescale_info = rescale_problem(params, state); state->constraint_rescaling = rescale_info->con_rescale; - state->variable_rescaling = rescale_info->var_rescale; + state->variable_rescaling = rescale_info->var_rescale; state->constraint_bound_rescaling = rescale_info->con_bound_rescale; state->objective_vector_rescaling = rescale_info->obj_vec_rescale; - state->rescaling_time_sec = rescale_info->rescaling_time_sec; + state->rescaling_time_sec = rescale_info->rescaling_time_sec; rescale_info->con_rescale = NULL; rescale_info->var_rescale = NULL; @@ -356,8 +355,8 @@ static pdhg_solver_state_t *initialize_solver_state( state->constraint_matrix->row_ind = NULL; CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); state->constraint_matrix_t->row_ind = NULL; - CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_pos)); - state->constraint_matrix->transpose_pos = NULL; + CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_map)); + state->constraint_matrix->transpose_map = NULL; double sum_of_squares = 0.0; for (int i = 0; i < n_vars; ++i) @@ -401,7 +400,7 @@ static pdhg_solver_state_t *initialize_solver_state( CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F)); CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F)); CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size)); - + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, 
CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size)); CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size)); CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -431,49 +430,55 @@ static pdhg_solver_state_t *initialize_solver_state( return state; } -__global__ void build_row_ind(const int* __restrict__ row_ptr, +__global__ void build_row_ind(const int *__restrict__ row_ptr, int num_rows, - int* __restrict__ row_ind) + int *__restrict__ row_ind) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; + if (i >= num_rows) + return; int s = row_ptr[i]; int e = row_ptr[i + 1]; - for (int k = s; k < e; ++k) { + for (int k = s; k < e; ++k) + { row_ind[k] = i; } } -__global__ void build_transpose_pos( - const int* __restrict__ A_row_ind, - const int* __restrict__ A_col_ind, - const int* __restrict__ At_row_ptr, - const int* __restrict__ At_col_ind, +__global__ void build_transpose_map( + const int *__restrict__ A_row_ind, + const int *__restrict__ A_col_ind, + const int *__restrict__ At_row_ptr, + const int *__restrict__ At_col_ind, int nnz, - int* __restrict__ A_to_At) + int *__restrict__ A_to_At) { int k = blockIdx.x * blockDim.x + threadIdx.x; - if (k >= nnz) return; + if (k >= nnz) + return; - int i = A_row_ind[k]; - int j = A_col_ind[k]; + int i = A_row_ind[k]; + int j = A_col_ind[k]; int start = At_row_ptr[j]; - int end = At_row_ptr[j + 1]; + int end = At_row_ptr[j + 1]; int pos = -1; - for (int idx = start; idx < end; ++idx) { - if (At_col_ind[idx] == i) { + for (int idx = start; idx < end; ++idx) + { + if (At_col_ind[idx] == i) + { pos = idx; break; } } - if (pos < 0) return; + if (pos < 0) + return; - A_to_At[k] = pos; + A_to_At[k] = pos; } __global__ void compute_next_pdhg_primal_solution_kernel( @@ -838,7 +843,7 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) if (state->vec_dual_prod) CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_prod)); if (state->primal_spmv_buffer) - CUDA_CHECK(cudaFree(state->primal_spmv_buffer)); + CUDA_CHECK(cudaFree(state->primal_spmv_buffer)); if (state->dual_spmv_buffer) CUDA_CHECK(cudaFree(state->dual_spmv_buffer)); if (state->sparse_handle) @@ -860,8 +865,8 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind)); if (state->constraint_matrix->val) CUDA_CHECK(cudaFree(state->constraint_matrix->val)); - if (state->constraint_matrix->transpose_pos) - CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_pos)); + if (state->constraint_matrix->transpose_map) + CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_map)); if (state->constraint_matrix_t->row_ptr) CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ptr)); if (state->constraint_matrix_t->col_ind) @@ -870,8 +875,8 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); if (state->constraint_matrix_t->val) CUDA_CHECK(cudaFree(state->constraint_matrix_t->val)); - if (state->constraint_matrix_t->transpose_pos) - CUDA_CHECK(cudaFree(state->constraint_matrix_t->transpose_pos)); + if (state->constraint_matrix_t->transpose_map) + CUDA_CHECK(cudaFree(state->constraint_matrix_t->transpose_map)); if (state->constraint_lower_bound) CUDA_CHECK(cudaFree(state->constraint_lower_bound)); if (state->constraint_upper_bound) @@ -1013,14 +1018,14 @@ void 
set_default_parameters(pdhg_parameters_t *params) params->restart_params.i_smooth = 0.3; } -//Feasibility Polishing +// Feasibility Polishing void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state) { clock_t start_time = clock(); if (state->relative_primal_residual < params->termination_criteria.eps_feas_polish_relative && state->relative_dual_residual < params->termination_criteria.eps_feas_polish_relative) { - + printf("Skipping feasibility polishing as the solution is already sufficiently feasible.\n"); return; } @@ -1034,7 +1039,7 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st original_primal_weight = (state->objective_vector_norm + 1.0) / (state->constraint_bound_norm + 1.0); } - //PRIMAL FEASIBILITY POLISHING + // PRIMAL FEASIBILITY POLISHING pdhg_solver_state_t *primal_state = initialize_primal_feas_polish_state(state); primal_state->primal_weight = original_primal_weight; primal_state->best_primal_weight = original_primal_weight; @@ -1050,8 +1055,8 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st state->primal_objective_value = primal_state->primal_objective_value; } state->feasibility_iteration += primal_state->total_count - 1; - - //DUAL FEASIBILITY POLISHING + + // DUAL FEASIBILITY POLISHING pdhg_solver_state_t *dual_state = initialize_dual_feas_polish_state(state); dual_state->primal_weight = original_primal_weight; dual_state->best_primal_weight = original_primal_weight; @@ -1070,7 +1075,7 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st state->objective_gap = fabs(state->primal_objective_value - state->dual_objective_value); state->relative_objective_gap = state->objective_gap / (1.0 + fabs(state->primal_objective_value) + fabs(state->dual_objective_value)); - + // FINAL LOGGING pdhg_feas_polish_final_log(primal_state, dual_state, params->verbose); primal_feas_polish_state_free(primal_state); @@ -1184,22 +1189,22 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( CUDA_CHECK(cudaMalloc(&dest, bytes)); \ CUDA_CHECK(cudaMemset(dest, 0, bytes)); - //RESET PROBLEM TO FEASIBILITY PROBLEM + // RESET PROBLEM TO FEASIBILITY PROBLEM ALLOC_ZERO(primal_state->objective_vector, num_var * sizeof(double)); primal_state->objective_constant = 0.0; -#define ALLOC_AND_COPY_DEV(dest, src, bytes) \ - CUDA_CHECK(cudaMalloc(&dest, bytes)); \ +#define ALLOC_AND_COPY_DEV(dest, src, bytes) \ + CUDA_CHECK(cudaMalloc(&dest, bytes)); \ CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice)); - //ALLOCATE AND COPY SOLUTION VECTORS + // ALLOCATE AND COPY SOLUTION VECTORS ALLOC_AND_COPY_DEV(primal_state->initial_primal_solution, original_state->initial_primal_solution, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(primal_state->current_primal_solution, original_state->current_primal_solution, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(primal_state->pdhg_primal_solution, original_state->pdhg_primal_solution, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(primal_state->reflected_primal_solution, original_state->reflected_primal_solution, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(primal_state->primal_product, original_state->primal_product, num_cons * sizeof(double)); - //ALLOC ZERO FOR OTHERS + // ALLOC ZERO FOR OTHERS ALLOC_ZERO(primal_state->initial_dual_solution, num_cons * sizeof(double)); ALLOC_ZERO(primal_state->current_dual_solution, num_cons * sizeof(double)); ALLOC_ZERO(primal_state->pdhg_dual_solution, num_cons * sizeof(double)); @@ 
-1213,7 +1218,7 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( ALLOC_ZERO(primal_state->delta_primal_solution, num_var * sizeof(double)); ALLOC_ZERO(primal_state->delta_dual_solution, num_cons * sizeof(double)); - //RESET SCALAR + // RESET SCALAR primal_state->primal_weight_error_sum = 0.0; primal_state->primal_weight_last_error = 0.0; primal_state->best_primal_weight = 0.0; @@ -1229,7 +1234,7 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( primal_state->cumulative_time_sec = 0.0; primal_state->best_primal_dual_residual_gap = INFINITY; - //IGNORE DUAL RESIDUAL AND OBJECTIVE GAP + // IGNORE DUAL RESIDUAL AND OBJECTIVE GAP primal_state->relative_dual_residual = 0.0; primal_state->absolute_dual_residual = 0.0; primal_state->relative_objective_gap = 0.0; @@ -1240,13 +1245,15 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( void primal_feas_polish_state_free(pdhg_solver_state_t *state) { - #define SAFE_CUDA_FREE(p) \ - if ((p) != NULL) { \ - CUDA_CHECK(cudaFree(p));\ - (p) = NULL; \ - } \ - - if (!state) return; +#define SAFE_CUDA_FREE(p) \ + if ((p) != NULL) \ + { \ + CUDA_CHECK(cudaFree(p)); \ + (p) = NULL; \ + } + + if (!state) + return; SAFE_CUDA_FREE(state->objective_vector); SAFE_CUDA_FREE(state->initial_primal_solution); SAFE_CUDA_FREE(state->current_primal_solution); @@ -1259,7 +1266,7 @@ void primal_feas_polish_state_free(pdhg_solver_state_t *state) SAFE_CUDA_FREE(state->pdhg_dual_solution); SAFE_CUDA_FREE(state->reflected_dual_solution); SAFE_CUDA_FREE(state->primal_product); - + SAFE_CUDA_FREE(state->primal_slack); SAFE_CUDA_FREE(state->dual_slack); SAFE_CUDA_FREE(state->primal_residual); @@ -1279,7 +1286,6 @@ __global__ void zero_finite_value_vectors_kernel( vec[idx] = 0.0; } } - static pdhg_solver_state_t *initialize_dual_feas_polish_state( const pdhg_solver_state_t *original_state) @@ -1289,18 +1295,18 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( int num_var = original_state->num_variables; int num_cons = original_state->num_constraints; - #define ALLOC_AND_COPY_DEV(dest, src, bytes) \ - CUDA_CHECK(cudaMalloc(&dest, bytes)); \ - CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice)); - - //RESET PROBLEM TO DUAL FEASIBILITY PROBLEM - #define SET_FINITE_TO_ZERO(vec, n) \ - { \ - int threads = 256; \ - int blocks = (n + threads - 1) / threads; \ - zero_finite_value_vectors_kernel<<>>(vec, n); \ - CUDA_CHECK(cudaDeviceSynchronize()); \ - } +#define ALLOC_AND_COPY_DEV(dest, src, bytes) \ + CUDA_CHECK(cudaMalloc(&dest, bytes)); \ + CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice)); + +// RESET PROBLEM TO DUAL FEASIBILITY PROBLEM +#define SET_FINITE_TO_ZERO(vec, n) \ + { \ + int threads = 256; \ + int blocks = (n + threads - 1) / threads; \ + zero_finite_value_vectors_kernel<<>>(vec, n); \ + CUDA_CHECK(cudaDeviceSynchronize()); \ + } ALLOC_AND_COPY_DEV(dual_state->constraint_lower_bound, original_state->constraint_lower_bound, num_cons * sizeof(double)); ALLOC_AND_COPY_DEV(dual_state->constraint_upper_bound, original_state->constraint_upper_bound, num_cons * sizeof(double)); @@ -1312,16 +1318,16 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( SET_FINITE_TO_ZERO(dual_state->variable_lower_bound, num_var); SET_FINITE_TO_ZERO(dual_state->variable_upper_bound, num_var); - #define ALLOC_ZERO(dest, bytes) \ - CUDA_CHECK(cudaMalloc(&dest, bytes)); \ - CUDA_CHECK(cudaMemset(dest, 0, bytes)); +#define ALLOC_ZERO(dest, bytes) \ + CUDA_CHECK(cudaMalloc(&dest, bytes)); \ + 
CUDA_CHECK(cudaMemset(dest, 0, bytes)); ALLOC_ZERO(dual_state->constraint_lower_bound_finite_val, num_cons * sizeof(double)); ALLOC_ZERO(dual_state->constraint_upper_bound_finite_val, num_cons * sizeof(double)); ALLOC_ZERO(dual_state->variable_lower_bound_finite_val, num_var * sizeof(double)); ALLOC_ZERO(dual_state->variable_upper_bound_finite_val, num_var * sizeof(double)); - //ALLOCATE AND COPY SOLUTION VECTORS + // ALLOCATE AND COPY SOLUTION VECTORS ALLOC_AND_COPY_DEV(dual_state->initial_dual_solution, original_state->initial_dual_solution, num_cons * sizeof(double)); ALLOC_AND_COPY_DEV(dual_state->current_dual_solution, original_state->current_dual_solution, num_cons * sizeof(double)); ALLOC_AND_COPY_DEV(dual_state->pdhg_dual_solution, original_state->pdhg_dual_solution, num_cons * sizeof(double)); @@ -1329,7 +1335,7 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( ALLOC_AND_COPY_DEV(dual_state->dual_product, original_state->dual_product, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(dual_state->dual_slack, original_state->dual_slack, num_var * sizeof(double)); - //ALLOC ZERO FOR OTHERS + // ALLOC ZERO FOR OTHERS ALLOC_ZERO(dual_state->initial_primal_solution, num_var * sizeof(double)); ALLOC_ZERO(dual_state->current_primal_solution, num_var * sizeof(double)); ALLOC_ZERO(dual_state->pdhg_primal_solution, num_var * sizeof(double)); @@ -1341,7 +1347,7 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( ALLOC_ZERO(dual_state->delta_primal_solution, num_var * sizeof(double)); ALLOC_ZERO(dual_state->delta_dual_solution, num_cons * sizeof(double)); - //RESET SCALAR + // RESET SCALAR dual_state->primal_weight_error_sum = 0.0; dual_state->primal_weight_last_error = 0.0; dual_state->best_primal_weight = 0.0; @@ -1357,7 +1363,7 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( dual_state->cumulative_time_sec = 0.0; dual_state->best_primal_dual_residual_gap = INFINITY; - //IGNORE PRIMAL RESIDUAL AND OBJECTIVE GAP + // IGNORE PRIMAL RESIDUAL AND OBJECTIVE GAP dual_state->relative_primal_residual = 0.0; dual_state->absolute_primal_residual = 0.0; dual_state->relative_objective_gap = 0.0; @@ -1367,13 +1373,15 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( void dual_feas_polish_state_free(pdhg_solver_state_t *state) { - #define SAFE_CUDA_FREE(p) \ - if ((p) != NULL) { \ - CUDA_CHECK(cudaFree(p));\ - (p) = NULL; \ - } \ - - if (!state) return; +#define SAFE_CUDA_FREE(p) \ + if ((p) != NULL) \ + { \ + CUDA_CHECK(cudaFree(p)); \ + (p) = NULL; \ + } + + if (!state) + return; SAFE_CUDA_FREE(state->constraint_lower_bound); SAFE_CUDA_FREE(state->constraint_upper_bound); SAFE_CUDA_FREE(state->variable_lower_bound); @@ -1394,7 +1402,7 @@ void dual_feas_polish_state_free(pdhg_solver_state_t *state) SAFE_CUDA_FREE(state->pdhg_dual_solution); SAFE_CUDA_FREE(state->reflected_dual_solution); SAFE_CUDA_FREE(state->primal_product); - + SAFE_CUDA_FREE(state->primal_slack); SAFE_CUDA_FREE(state->dual_slack); SAFE_CUDA_FREE(state->primal_residual); @@ -1473,5 +1481,3 @@ static void compute_dual_fixed_point_error(pdhg_solver_state_t *state) &dual_norm)); state->fixed_point_error = dual_norm * dual_norm / state->primal_weight; } - - From f13f5725ecc2ef2b500b2f67e86f4df83a2af852 Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Thu, 27 Nov 2025 18:57:02 -0500 Subject: [PATCH 10/13] Clean code: rename variables --- src/preconditioner.cu | 468 +++++++++++++++++++++--------------------- 1 file changed, 234 insertions(+), 234 deletions(-) diff --git 
a/src/preconditioner.cu b/src/preconditioner.cu index 031051f..1f711cf 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -24,85 +24,85 @@ limitations under the License. #define SCALING_EPSILON 1e-8 -__global__ void scale_variables_kernel(double *__restrict__ c, - double *__restrict__ var_lb, - double *__restrict__ var_ub, - double *__restrict__ var_lb_finite, - double *__restrict__ var_ub_finite, - double *__restrict__ primal_start, - const double *__restrict__ D, - const double *__restrict__ invD, - int n_vars); -__global__ void scale_constraints_kernel(double *__restrict__ con_lb, - double *__restrict__ con_ub, - double *__restrict__ con_lb_finite, - double *__restrict__ con_ub_finite, - double *__restrict__ dual_start, - const double *__restrict__ E, - const double *__restrict__ invE, - int n_cons); -__global__ void scale_csr_nnz_kernel(const int *__restrict__ row_ids, - const int *__restrict__ col_ind, - double *__restrict__ A_vals, - double *__restrict__ At_vals, - const int *__restrict__ A_to_At, - const double *__restrict__ invD, - const double *__restrict__ invE, +__global__ void scale_variables_kernel(double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ variable_lower_bound_finite_val, + double *__restrict__ variable_upper_bound_finite_val, + double *__restrict__ initial_primal_solution, + const double *__restrict__ variable_rescaling, + const double *__restrict__ inverse_variable_rescaling, + int num_variables); +__global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ constraint_lower_bound_finite_val, + double *__restrict__ constraint_upper_bound_finite_val, + double *__restrict__ initial_dual_solution, + const double *__restrict__ constraint_rescaling, + const double *__restrict__ inverse_constraint_rescaling, + int num_constraints); +__global__ void scale_csr_nnz_kernel(const int *__restrict__ constraint_row_ind, + const int *__restrict__ constraint_col_ind, + double *__restrict__ constraint_matrix_val, + double *__restrict__ constraint_matrix_transpose_val, + const int *__restrict__ constraint_to_transpose_position, + const double *__restrict__ inverse_variable_rescaling, + const double *__restrict__ inverse_constraint_rescaling, int nnz); __global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, - const double *__restrict__ vals, + const double *__restrict__ matrix_vals, int num_rows, - double *__restrict__ out_max); + double *__restrict__ row_absmax_values); __global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, - const double *__restrict__ vals, + const double *__restrict__ matrix_vals, int num_rows, double degree, - double *__restrict__ out_sum); -__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ x, - double *__restrict__ inv_x, - double *__restrict__ cum, - int n_vars); + double *__restrict__ row_powsum_values); +__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors, + double *__restrict__ inverse_scaling_factors, + double *__restrict__ cumulative_rescaling, + int num_variables); __global__ void reduce_bound_norm_sq_kernel( - const double *__restrict__ L, - const double *__restrict__ U, - int n_cons, + const double *__restrict__ constraint_lower_bound, + const double *__restrict__ constraint_upper_bound, + int num_constraints, double *__restrict__ block_sums); __global__ 
void scale_bounds_kernel( - double *__restrict__ con_lb, - double *__restrict__ con_ub, - double *__restrict__ con_lb_finite, - double *__restrict__ con_ub_finite, - double *__restrict__ dual_start, - int n_cons, - double con_scale, - double obj_scale); + double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ constraint_lower_bound_finite_val, + double *__restrict__ constraint_upper_bound_finite_val, + double *__restrict__ initial_dual_solution, + int num_constraints, + double constraint_scale, + double objective_scale); __global__ void scale_objective_kernel( - double *__restrict__ c, - double *__restrict__ var_lb, - double *__restrict__ var_ub, - double *__restrict__ var_lb_finite, - double *__restrict__ var_ub_finite, - double *__restrict__ primal_start, - int n_vars, - double con_scale, - double obj_scale); -__global__ void fill_ones_kernel(double *__restrict__ x, int n_vars); -static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); + double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ variable_lower_bound_finite_val, + double *__restrict__ variable_upper_bound_finite_val, + double *__restrict__ initial_primal_solution, + int num_variables, + double constraint_scale, + double objective_scale); +__global__ void fill_ones_kernel(double *__restrict__ x, int num_variables); +static void scale_problem(pdhg_solver_state_t *state, double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling); static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, - double *E, double *D, double *invE, double *invD); + double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling); static void pock_chambolle_rescaling(pdhg_solver_state_t *state, const double alpha, rescale_info_t *rescale_info, - double *E, double *D, double *invE, double *invD); + double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling); static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info); static void scale_problem( pdhg_solver_state_t *state, - double *E, - double *D, - double *invE, - double *invD) + double *constraint_rescaling, + double *variable_rescaling, + double *inverse_constraint_rescaling, + double *inverse_variable_rescaling) { - int n_vars = state->num_variables; - int n_cons = state->num_constraints; + int num_variables = state->num_variables; + int num_constraints = state->num_constraints; scale_variables_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->objective_vector, @@ -111,9 +111,9 @@ static void scale_problem( state->variable_lower_bound_finite_val, state->variable_upper_bound_finite_val, state->initial_primal_solution, - D, - invD, - n_vars); + variable_rescaling, + inverse_variable_rescaling, + num_variables); scale_constraints_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_lower_bound, @@ -121,9 +121,9 @@ static void scale_problem( state->constraint_lower_bound_finite_val, state->constraint_upper_bound_finite_val, state->initial_dual_solution, - E, - invE, - n_cons); + constraint_rescaling, + inverse_constraint_rescaling, + num_constraints); scale_csr_nnz_kernel<<num_blocks_nnz, 
THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ind, @@ -131,8 +131,8 @@ static void scale_problem( state->constraint_matrix->val, state->constraint_matrix_t->val, state->constraint_matrix->transpose_map, - invD, - invE, + inverse_variable_rescaling, + inverse_constraint_rescaling, state->constraint_matrix->num_nonzeros); } @@ -140,39 +140,39 @@ static void ruiz_rescaling( pdhg_solver_state_t *state, int num_iterations, rescale_info_t *rescale_info, - double *E, - double *D, - double *invE, - double *invD) + double *constraint_rescaling, + double *variable_rescaling, + double *inverse_constraint_rescaling, + double *inverse_variable_rescaling) { - const int n_cons = state->num_constraints; - const int n_vars = state->num_variables; + const int num_constraints = state->num_constraints; + const int num_variables = state->num_variables; for (int iter = 0; iter < num_iterations; ++iter) { compute_csr_row_absmax_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, state->constraint_matrix->val, - n_cons, - E); + num_constraints, + constraint_rescaling); clamp_sqrt_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( - E, - invE, + constraint_rescaling, + inverse_constraint_rescaling, rescale_info->con_rescale, - n_cons); + num_constraints); compute_csr_row_absmax_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, - n_vars, - D); + num_variables, + variable_rescaling); clamp_sqrt_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( - D, - invD, + variable_rescaling, + inverse_variable_rescaling, rescale_info->var_rescale, - n_vars); + num_variables); - scale_problem(state, E, D, invE, invD); + scale_problem(state, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); } } @@ -180,47 +180,47 @@ static void pock_chambolle_rescaling( pdhg_solver_state_t *state, const double alpha, rescale_info_t *rescale_info, - double *E, - double *D, - double *invE, - double *invD) + double *constraint_rescaling, + double *variable_rescaling, + double *inverse_constraint_rescaling, + double *inverse_variable_rescaling) { - const int n_cons = state->num_constraints; - const int n_vars = state->num_variables; + const int num_constraints = state->num_constraints; + const int num_variables = state->num_variables; compute_csr_row_powsum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, state->constraint_matrix->val, - n_cons, + num_constraints, alpha, - E); + constraint_rescaling); clamp_sqrt_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( - E, - invE, + constraint_rescaling, + inverse_constraint_rescaling, rescale_info->con_rescale, - n_cons); + num_constraints); compute_csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, - n_vars, + num_variables, 2.0 - alpha, - D); + variable_rescaling); clamp_sqrt_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( - D, - invD, + variable_rescaling, + inverse_variable_rescaling, rescale_info->var_rescale, - n_vars); + num_variables); - scale_problem(state, E, D, invE, invD); + scale_problem(state, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); } static void bound_objective_rescaling( pdhg_solver_state_t *state, rescale_info_t *rescale_info) { - const int n_cons = state->num_constraints; - const int n_vars = state->num_variables; + const int 
num_constraints = state->num_constraints; + const int num_variables = state->num_variables; double *bnd_norm_sq_d = NULL; CUDA_CHECK(cudaMalloc(&bnd_norm_sq_d, sizeof(double))); @@ -228,7 +228,7 @@ static void bound_objective_rescaling( reduce_bound_norm_sq_kernel<<<1, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(double)>>>( state->constraint_lower_bound, state->constraint_upper_bound, - n_cons, + num_constraints, bnd_norm_sq_d); double bnd_norm_sq_h = 0.0; @@ -242,8 +242,8 @@ static void bound_objective_rescaling( state->objective_vector, 1, &obj_norm)); - double con_scale = 1.0 / (bnd_norm + 1.0); - double obj_scale = 1.0 / (obj_norm + 1.0); + double constraint_scale = 1.0 / (bnd_norm + 1.0); + double objective_scale = 1.0 / (obj_norm + 1.0); scale_bounds_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_lower_bound, @@ -251,9 +251,9 @@ static void bound_objective_rescaling( state->constraint_lower_bound_finite_val, state->constraint_upper_bound_finite_val, state->initial_dual_solution, - n_cons, - con_scale, - obj_scale); + num_constraints, + constraint_scale, + objective_scale); scale_objective_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->objective_vector, @@ -262,12 +262,12 @@ static void bound_objective_rescaling( state->variable_lower_bound_finite_val, state->variable_upper_bound_finite_val, state->initial_primal_solution, - n_vars, - con_scale, - obj_scale); + num_variables, + constraint_scale, + objective_scale); - rescale_info->con_bound_rescale = con_scale; - rescale_info->obj_vec_rescale = obj_scale; + rescale_info->con_bound_rescale = constraint_scale; + rescale_info->obj_vec_rescale = objective_scale; } rescale_info_t *rescale_problem( @@ -276,33 +276,33 @@ rescale_info_t *rescale_problem( { printf("[Precondition] Start\n"); - int n_vars = state->num_variables; - int n_cons = state->num_constraints; + int num_variables = state->num_variables; + int num_constraints = state->num_constraints; clock_t start_rescaling = clock(); rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); - CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons * sizeof(double))); - CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars * sizeof(double))); + CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, num_variables * sizeof(double))); fill_ones_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( - rescale_info->con_rescale, n_cons); + rescale_info->con_rescale, num_constraints); fill_ones_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( - rescale_info->var_rescale, n_vars); + rescale_info->var_rescale, num_variables); - double *E = NULL, *D = NULL, *invE = NULL, *invD = NULL; - CUDA_CHECK(cudaMalloc(&E, n_cons * sizeof(double))); - CUDA_CHECK(cudaMalloc(&D, n_vars * sizeof(double))); - CUDA_CHECK(cudaMalloc(&invE, n_cons * sizeof(double))); - CUDA_CHECK(cudaMalloc(&invD, n_vars * sizeof(double))); + double *constraint_rescaling = NULL, *variable_rescaling = NULL, *inverse_constraint_rescaling = NULL, *inverse_variable_rescaling = NULL; + CUDA_CHECK(cudaMalloc(&constraint_rescaling, num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&variable_rescaling, num_variables * sizeof(double))); + CUDA_CHECK(cudaMalloc(&inverse_constraint_rescaling, num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&inverse_variable_rescaling, num_variables * sizeof(double))); if (params->l_inf_ruiz_iterations > 0) { printf("[Precondition] Ruiz scaling (%d 
iterations)\n", params->l_inf_ruiz_iterations); - ruiz_rescaling(state, params->l_inf_ruiz_iterations, rescale_info, E, D, invE, invD); + ruiz_rescaling(state, params->l_inf_ruiz_iterations, rescale_info, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); } if (params->has_pock_chambolle_alpha) { printf("[Precondition] Pock-Chambolle scaling (alpha=%.4f)\n", params->pock_chambolle_alpha); - pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, E, D, invE, invD); + pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); } rescale_info->con_bound_rescale = 1.0; @@ -315,83 +315,83 @@ rescale_info_t *rescale_problem( rescale_info->rescaling_time_sec = (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; - CUDA_CHECK(cudaFree(E)); - CUDA_CHECK(cudaFree(D)); - CUDA_CHECK(cudaFree(invE)); - CUDA_CHECK(cudaFree(invD)); + CUDA_CHECK(cudaFree(constraint_rescaling)); + CUDA_CHECK(cudaFree(variable_rescaling)); + CUDA_CHECK(cudaFree(inverse_constraint_rescaling)); + CUDA_CHECK(cudaFree(inverse_variable_rescaling)); return rescale_info; } -__global__ void scale_variables_kernel(double *__restrict__ c, - double *__restrict__ var_lb, - double *__restrict__ var_ub, - double *__restrict__ var_lb_finite, - double *__restrict__ var_ub_finite, - double *__restrict__ primal_start, - const double *__restrict__ D, - const double *__restrict__ invD, - int n_vars) +__global__ void scale_variables_kernel(double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ variable_lower_bound_finite_val, + double *__restrict__ variable_upper_bound_finite_val, + double *__restrict__ initial_primal_solution, + const double *__restrict__ variable_rescaling, + const double *__restrict__ inverse_variable_rescaling, + int num_variables) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n_vars) + if (j >= num_variables) return; - double dj = D[j]; - double inv_dj = invD[j]; - c[j] *= inv_dj; - var_lb[j] *= dj; - var_ub[j] *= dj; - var_lb_finite[j] *= dj; - var_ub_finite[j] *= dj; - primal_start[j] *= dj; + double dj = variable_rescaling[j]; + double inv_dj = inverse_variable_rescaling[j]; + objective_vector[j] *= inv_dj; + variable_lower_bound[j] *= dj; + variable_upper_bound[j] *= dj; + variable_lower_bound_finite_val[j] *= dj; + variable_upper_bound_finite_val[j] *= dj; + initial_primal_solution[j] *= dj; } -__global__ void scale_constraints_kernel(double *__restrict__ con_lb, - double *__restrict__ con_ub, - double *__restrict__ con_lb_finite, - double *__restrict__ con_ub_finite, - double *__restrict__ dual_start, - const double *__restrict__ E, - const double *__restrict__ invE, - int n_cons) +__global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ constraint_lower_bound_finite_val, + double *__restrict__ constraint_upper_bound_finite_val, + double *__restrict__ initial_dual_solution, + const double *__restrict__ constraint_rescaling, + const double *__restrict__ inverse_constraint_rescaling, + int num_constraints) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n_cons) + if (i >= num_constraints) return; - double inv_ei = invE[i]; - double ei = E[i]; - con_lb[i] *= inv_ei; - con_ub[i] *= inv_ei; - con_lb_finite[i] *= inv_ei; 
- con_ub_finite[i] *= inv_ei; - dual_start[i] *= ei; + double inv_ei = inverse_constraint_rescaling[i]; + double ei = constraint_rescaling[i]; + constraint_lower_bound[i] *= inv_ei; + constraint_upper_bound[i] *= inv_ei; + constraint_lower_bound_finite_val[i] *= inv_ei; + constraint_upper_bound_finite_val[i] *= inv_ei; + initial_dual_solution[i] *= ei; } -__global__ void scale_csr_nnz_kernel(const int *__restrict__ row_ids, - const int *__restrict__ col_ind, - double *__restrict__ A_vals, - double *__restrict__ At_vals, - const int *__restrict__ A_to_At, - const double *__restrict__ invD, - const double *__restrict__ invE, +__global__ void scale_csr_nnz_kernel(const int *__restrict__ constraint_row_ind, + const int *__restrict__ constraint_col_ind, + double *__restrict__ constraint_matrix_val, + double *__restrict__ constraint_matrix_transpose_val, + const int *__restrict__ constraint_to_transpose_position, + const double *__restrict__ inverse_variable_rescaling, + const double *__restrict__ inverse_constraint_rescaling, int nnz) { for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { - int i = row_ids[k]; - int j = col_ind[k]; - double scale = invD[j] * invE[i]; - A_vals[k] *= scale; - At_vals[A_to_At[k]] *= scale; + int i = constraint_row_ind[k]; + int j = constraint_col_ind[k]; + double scale = inverse_variable_rescaling[j] * inverse_constraint_rescaling[i]; + constraint_matrix_val[k] *= scale; + constraint_matrix_transpose_val[constraint_to_transpose_position[k]] *= scale; } } __global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, - const double *__restrict__ vals, + const double *__restrict__ matrix_vals, int num_rows, - double *__restrict__ out_max) + double *__restrict__ row_absmax_values) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= num_rows) @@ -400,13 +400,13 @@ __global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, double m = 0.0; for (int k = s; k < e; ++k) { - double v = fabs(vals[k]); + double v = fabs(matrix_vals[k]); if (!isfinite(v)) v = 0.0; if (v > m) m = v; } - out_max[i] = m; + row_absmax_values[i] = m; } __device__ __forceinline__ double pow_fast(double v, double p) @@ -421,10 +421,10 @@ __device__ __forceinline__ double pow_fast(double v, double p) } __global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, - const double *__restrict__ vals, + const double *__restrict__ matrix_vals, int num_rows, double degree, - double *__restrict__ out_sum) + double *__restrict__ row_powsum_values) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= num_rows) @@ -433,33 +433,33 @@ __global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, double acc = 0.0; for (int k = s; k < e; ++k) { - double v = fabs(vals[k]); + double v = fabs(matrix_vals[k]); if (!isfinite(v)) v = 0.0; acc += pow_fast(v, degree); } - out_sum[i] = acc; + row_powsum_values[i] = acc; } -__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ x, - double *__restrict__ inv_x, - double *__restrict__ cum, - int n_vars) +__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors, + double *__restrict__ inverse_scaling_factors, + double *__restrict__ cumulative_rescaling, + int num_variables) { - for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n_vars; t += blockDim.x * gridDim.x) + for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < num_variables; t += blockDim.x * gridDim.x) { - double v = x[t]; + double v = scaling_factors[t]; 
double s = sqrt(fmax(v, SCALING_EPSILON)); - cum[t] *= s; - x[t] = s; - inv_x[t] = 1.0 / s; + cumulative_rescaling[t] *= s; + scaling_factors[t] = s; + inverse_scaling_factors[t] = 1.0 / s; } } __global__ void reduce_bound_norm_sq_kernel( - const double *__restrict__ L, - const double *__restrict__ U, - int n_cons, + const double *__restrict__ constraint_lower_bound, + const double *__restrict__ constraint_upper_bound, + int num_constraints, double *__restrict__ block_sums) { extern __shared__ double sdata[]; @@ -468,9 +468,9 @@ __global__ void reduce_bound_norm_sq_kernel( int stride = blockDim.x * gridDim.x; double acc = 0.0; - for (int i = global_tid; i < n_cons; i += stride) + for (int i = global_tid; i < num_constraints; i += stride) { - double Li = L[i], Ui = U[i]; + double Li = constraint_lower_bound[i], Ui = constraint_upper_bound[i]; bool fL = isfinite(Li), fU = isfinite(Ui); if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) @@ -502,50 +502,50 @@ __global__ void reduce_bound_norm_sq_kernel( } __global__ void scale_bounds_kernel( - double *__restrict__ con_lb, - double *__restrict__ con_ub, - double *__restrict__ con_lb_finite, - double *__restrict__ con_ub_finite, - double *__restrict__ dual_start, - int n_cons, - double con_scale, - double obj_scale) + double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ constraint_lower_bound_finite_val, + double *__restrict__ constraint_upper_bound_finite_val, + double *__restrict__ initial_dual_solution, + int num_constraints, + double constraint_scale, + double objective_scale) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n_cons) + if (i >= num_constraints) return; - con_lb[i] *= con_scale; - con_ub[i] *= con_scale; - con_lb_finite[i] *= con_scale; - con_ub_finite[i] *= con_scale; - dual_start[i] *= obj_scale; + constraint_lower_bound[i] *= constraint_scale; + constraint_upper_bound[i] *= constraint_scale; + constraint_lower_bound_finite_val[i] *= constraint_scale; + constraint_upper_bound_finite_val[i] *= constraint_scale; + initial_dual_solution[i] *= objective_scale; } __global__ void scale_objective_kernel( - double *__restrict__ c, - double *__restrict__ var_lb, - double *__restrict__ var_ub, - double *__restrict__ var_lb_finite, - double *__restrict__ var_ub_finite, - double *__restrict__ primal_start, - int n_vars, - double con_scale, - double obj_scale) + double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ variable_lower_bound_finite_val, + double *__restrict__ variable_upper_bound_finite_val, + double *__restrict__ initial_primal_solution, + int num_variables, + double constraint_scale, + double objective_scale) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n_vars) + if (j >= num_variables) return; - var_lb[j] *= con_scale; - var_ub[j] *= con_scale; - var_lb_finite[j] *= con_scale; - var_ub_finite[j] *= con_scale; - c[j] *= obj_scale; - primal_start[j] *= con_scale; + variable_lower_bound[j] *= constraint_scale; + variable_upper_bound[j] *= constraint_scale; + variable_lower_bound_finite_val[j] *= constraint_scale; + variable_upper_bound_finite_val[j] *= constraint_scale; + objective_vector[j] *= objective_scale; + initial_primal_solution[j] *= constraint_scale; } -__global__ void fill_ones_kernel(double *__restrict__ x, int n_vars) +__global__ void fill_ones_kernel(double *__restrict__ x, int num_variables) { int i = blockIdx.x * blockDim.x + 
threadIdx.x;
-    if (i < n_vars)
+    if (i < num_variables)
         x[i] = 1.0;
 }

From 5ec9302c37ff022bf7623d5321fb0081d72165d8 Mon Sep 17 00:00:00 2001
From: Bo Tang
Date: Thu, 27 Nov 2025 19:27:37 -0500
Subject: [PATCH 11/13] Refactor: move finite-bound computation

---
 src/preconditioner.cu | 32 ------------------------
 src/solver.cu         | 57 +++++++++++++++++++++++++++++++------------
 2 files changed, 42 insertions(+), 47 deletions(-)

diff --git a/src/preconditioner.cu b/src/preconditioner.cu
index 1f711cf..fe087fc 100644
--- a/src/preconditioner.cu
+++ b/src/preconditioner.cu
@@ -27,16 +27,12 @@ limitations under the License.
 __global__ void scale_variables_kernel(double *__restrict__ objective_vector,
                                        double *__restrict__ variable_lower_bound,
                                        double *__restrict__ variable_upper_bound,
-                                       double *__restrict__ variable_lower_bound_finite_val,
-                                       double *__restrict__ variable_upper_bound_finite_val,
                                        double *__restrict__ initial_primal_solution,
                                        const double *__restrict__ variable_rescaling,
                                        const double *__restrict__ inverse_variable_rescaling,
                                        int num_variables);
 __global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound,
                                          double *__restrict__ constraint_upper_bound,
-                                         double *__restrict__ constraint_lower_bound_finite_val,
-                                         double *__restrict__ constraint_upper_bound_finite_val,
                                          double *__restrict__ initial_dual_solution,
                                          const double *__restrict__ constraint_rescaling,
                                          const double *__restrict__ inverse_constraint_rescaling,
@@ -70,8 +66,6 @@ __global__ void reduce_bound_norm_sq_kernel(
 __global__ void scale_bounds_kernel(
     double *__restrict__ constraint_lower_bound,
     double *__restrict__ constraint_upper_bound,
-    double *__restrict__ constraint_lower_bound_finite_val,
-    double *__restrict__ constraint_upper_bound_finite_val,
     double *__restrict__ initial_dual_solution,
     int num_constraints,
     double constraint_scale,
@@ -80,8 +74,6 @@ __global__ void scale_objective_kernel(
     double *__restrict__ objective_vector,
     double *__restrict__ variable_lower_bound,
     double *__restrict__ variable_upper_bound,
-    double *__restrict__ variable_lower_bound_finite_val,
-    double *__restrict__ variable_upper_bound_finite_val,
     double *__restrict__ initial_primal_solution,
     int num_variables,
     double constraint_scale,
@@ -108,8 +100,6 @@ static void scale_problem(
         state->objective_vector,
         state->variable_lower_bound,
         state->variable_upper_bound,
-        state->variable_lower_bound_finite_val,
-        state->variable_upper_bound_finite_val,
         state->initial_primal_solution,
         variable_rescaling,
         inverse_variable_rescaling,
@@ -118,8 +108,6 @@ static void scale_problem(
     scale_constraints_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
         state->constraint_lower_bound,
         state->constraint_upper_bound,
-        state->constraint_lower_bound_finite_val,
-        state->constraint_upper_bound_finite_val,
         state->initial_dual_solution,
         constraint_rescaling,
         inverse_constraint_rescaling,
@@ -248,8 +236,6 @@ static void bound_objective_rescaling(
     scale_bounds_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
         state->constraint_lower_bound,
         state->constraint_upper_bound,
-        state->constraint_lower_bound_finite_val,
-        state->constraint_upper_bound_finite_val,
         state->initial_dual_solution,
         num_constraints,
         constraint_scale,
@@ -259,8 +245,6 @@ static void bound_objective_rescaling(
     scale_objective_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
         state->objective_vector,
         state->variable_lower_bound,
         state->variable_upper_bound,
-        state->variable_lower_bound_finite_val,
-        state->variable_upper_bound_finite_val,
         state->initial_primal_solution,
         num_variables,
         constraint_scale,
@@ -326,8 +310,6 @@ rescale_info_t 
*rescale_problem( __global__ void scale_variables_kernel(double *__restrict__ objective_vector, double *__restrict__ variable_lower_bound, double *__restrict__ variable_upper_bound, - double *__restrict__ variable_lower_bound_finite_val, - double *__restrict__ variable_upper_bound_finite_val, double *__restrict__ initial_primal_solution, const double *__restrict__ variable_rescaling, const double *__restrict__ inverse_variable_rescaling, @@ -341,15 +323,11 @@ __global__ void scale_variables_kernel(double *__restrict__ objective_vector, objective_vector[j] *= inv_dj; variable_lower_bound[j] *= dj; variable_upper_bound[j] *= dj; - variable_lower_bound_finite_val[j] *= dj; - variable_upper_bound_finite_val[j] *= dj; initial_primal_solution[j] *= dj; } __global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound, double *__restrict__ constraint_upper_bound, - double *__restrict__ constraint_lower_bound_finite_val, - double *__restrict__ constraint_upper_bound_finite_val, double *__restrict__ initial_dual_solution, const double *__restrict__ constraint_rescaling, const double *__restrict__ inverse_constraint_rescaling, @@ -362,8 +340,6 @@ __global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_b double ei = constraint_rescaling[i]; constraint_lower_bound[i] *= inv_ei; constraint_upper_bound[i] *= inv_ei; - constraint_lower_bound_finite_val[i] *= inv_ei; - constraint_upper_bound_finite_val[i] *= inv_ei; initial_dual_solution[i] *= ei; } @@ -504,8 +480,6 @@ __global__ void reduce_bound_norm_sq_kernel( __global__ void scale_bounds_kernel( double *__restrict__ constraint_lower_bound, double *__restrict__ constraint_upper_bound, - double *__restrict__ constraint_lower_bound_finite_val, - double *__restrict__ constraint_upper_bound_finite_val, double *__restrict__ initial_dual_solution, int num_constraints, double constraint_scale, @@ -516,8 +490,6 @@ __global__ void scale_bounds_kernel( return; constraint_lower_bound[i] *= constraint_scale; constraint_upper_bound[i] *= constraint_scale; - constraint_lower_bound_finite_val[i] *= constraint_scale; - constraint_upper_bound_finite_val[i] *= constraint_scale; initial_dual_solution[i] *= objective_scale; } @@ -525,8 +497,6 @@ __global__ void scale_objective_kernel( double *__restrict__ objective_vector, double *__restrict__ variable_lower_bound, double *__restrict__ variable_upper_bound, - double *__restrict__ variable_lower_bound_finite_val, - double *__restrict__ variable_upper_bound_finite_val, double *__restrict__ initial_primal_solution, int num_variables, double constraint_scale, @@ -537,8 +507,6 @@ __global__ void scale_objective_kernel( return; variable_lower_bound[j] *= constraint_scale; variable_upper_bound[j] *= constraint_scale; - variable_lower_bound_finite_val[j] *= constraint_scale; - variable_upper_bound_finite_val[j] *= constraint_scale; objective_vector[j] *= objective_scale; initial_primal_solution[j] *= constraint_scale; } diff --git a/src/solver.cu b/src/solver.cu index c01d5b8..a869666 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -36,6 +36,11 @@ __global__ void build_transpose_map(const int *__restrict__ A_row_ind, const int *__restrict__ At_col_ind, int nnz, int *__restrict__ A_to_At); +__global__ void fill_finite_bounds_kernel(const double *__restrict__ lower_bound, + const double *__restrict__ upper_bound, + double *__restrict__ lower_bound_finite_val, + double *__restrict__ upper_bound_finite_val, + int num_elements); __global__ void 
compute_next_pdhg_primal_solution_kernel(const double *current_primal, double *reflected_primal,
                                          const double *dual_product,
@@ -289,21 +294,6 @@ static pdhg_solver_state_t *initialize_solver_state(
     ALLOC_AND_COPY(state->constraint_lower_bound, original_problem->constraint_lower_bound, con_bytes);
     ALLOC_AND_COPY(state->constraint_upper_bound, original_problem->constraint_upper_bound, con_bytes);
 
-    double *temp_host = (double *)safe_malloc(fmax(var_bytes, con_bytes));
-    for (int i = 0; i < n_cons; ++i)
-        temp_host[i] = isfinite(original_problem->constraint_lower_bound[i]) ? original_problem->constraint_lower_bound[i] : 0.0;
-    ALLOC_AND_COPY(state->constraint_lower_bound_finite_val, temp_host, con_bytes);
-    for (int i = 0; i < n_cons; ++i)
-        temp_host[i] = isfinite(original_problem->constraint_upper_bound[i]) ? original_problem->constraint_upper_bound[i] : 0.0;
-    ALLOC_AND_COPY(state->constraint_upper_bound_finite_val, temp_host, con_bytes);
-    for (int i = 0; i < n_vars; ++i)
-        temp_host[i] = isfinite(original_problem->variable_lower_bound[i]) ? original_problem->variable_lower_bound[i] : 0.0;
-    ALLOC_AND_COPY(state->variable_lower_bound_finite_val, temp_host, var_bytes);
-    for (int i = 0; i < n_vars; ++i)
-        temp_host[i] = isfinite(original_problem->variable_upper_bound[i]) ? original_problem->variable_upper_bound[i] : 0.0;
-    ALLOC_AND_COPY(state->variable_upper_bound_finite_val, temp_host, var_bytes);
-    free(temp_host);
-
 #define ALLOC_ZERO(dest, bytes)           \
     CUDA_CHECK(cudaMalloc(&dest, bytes)); \
     CUDA_CHECK(cudaMemset(dest, 0, bytes));
@@ -351,6 +341,25 @@ static pdhg_solver_state_t *initialize_solver_state(
     rescale_info->var_rescale = NULL;
     rescale_info_free(rescale_info);
 
+    CUDA_CHECK(cudaMalloc(&state->constraint_lower_bound_finite_val, con_bytes));
+    CUDA_CHECK(cudaMalloc(&state->constraint_upper_bound_finite_val, con_bytes));
+    CUDA_CHECK(cudaMalloc(&state->variable_lower_bound_finite_val, var_bytes));
+    CUDA_CHECK(cudaMalloc(&state->variable_upper_bound_finite_val, var_bytes));
+
+    fill_finite_bounds_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        state->constraint_lower_bound,
+        state->constraint_upper_bound,
+        state->constraint_lower_bound_finite_val,
+        state->constraint_upper_bound_finite_val,
+        n_cons);
+
+    fill_finite_bounds_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+        state->variable_lower_bound,
+        state->variable_upper_bound,
+        state->variable_lower_bound_finite_val,
+        state->variable_upper_bound_finite_val,
+        n_vars);
+
     CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind));
     state->constraint_matrix->row_ind = NULL;
     CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind));
@@ -481,6 +490,24 @@ __global__ void build_transpose_map(
     A_to_At[k] = pos;
 }
 
+__global__ void fill_finite_bounds_kernel(
+    const double *__restrict__ lb,
+    const double *__restrict__ ub,
+    double *__restrict__ lb_finite,
+    double *__restrict__ ub_finite,
+    int num_elements)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= num_elements)
+        return;
+
+    double Li = lb[i];
+    double Ui = ub[i];
+
+    lb_finite[i] = isfinite(Li) ? Li : 0.0;
+    ub_finite[i] = isfinite(Ui) ? 
Ui : 0.0; +} + __global__ void compute_next_pdhg_primal_solution_kernel( const double *current_primal, double *reflected_primal, const double *dual_product, const double *objective, const double *var_lb, From 61e88bd02bd34d0c3d2da0d2500f7e4441d263e7 Mon Sep 17 00:00:00 2001 From: LucasBoTang Date: Fri, 28 Nov 2025 22:47:19 -0500 Subject: [PATCH 12/13] Numerical issue: back to old version --- src/preconditioner.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/preconditioner.cu b/src/preconditioner.cu index fe087fc..0151334 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -22,7 +22,7 @@ limitations under the License. #include #include -#define SCALING_EPSILON 1e-8 +#define SCALING_EPSILON 1e-12 __global__ void scale_variables_kernel(double *__restrict__ objective_vector, double *__restrict__ variable_lower_bound, @@ -425,7 +425,7 @@ __global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < num_variables; t += blockDim.x * gridDim.x) { double v = scaling_factors[t]; - double s = sqrt(fmax(v, SCALING_EPSILON)); + double s = (v < SCALING_EPSILON) ? 1.0 : sqrt(v); cumulative_rescaling[t] *= s; scaling_factors[t] = s; inverse_scaling_factors[t] = 1.0 / s; From 51685beaf9fc7a52a1ab9ca63e706eac1714c969 Mon Sep 17 00:00:00 2001 From: LucasBoTang Date: Fri, 28 Nov 2025 22:55:19 -0500 Subject: [PATCH 13/13] New ver: 0.1.5 --- CMakeLists.txt | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ccc7ac..7910540 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(cupdlpx LANGUAGES C CXX CUDA) set(CUPDLPX_VERSION_MAJOR 0) set(CUPDLPX_VERSION_MINOR 1) -set(CUPDLPX_VERSION_PATCH 4) +set(CUPDLPX_VERSION_PATCH 5) set(CUPDLPX_VERSION "${CUPDLPX_VERSION_MAJOR}.${CUPDLPX_VERSION_MINOR}.${CUPDLPX_VERSION_PATCH}") add_compile_definitions(CUPDLPX_VERSION="${CUPDLPX_VERSION}") diff --git a/pyproject.toml b/pyproject.toml index f399ca1..4a4626d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "cupdlpx" -version = "0.1.4" +version = "0.1.5" description = "Python bindings for cuPDLPx (GPU-accelerated first-order LP solver)" readme = "README.md" license = { text = "Apache-2.0" }
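
Note on the SCALING_EPSILON change in PATCH 12/13: the reverted clamp changes how empty or near-empty rows are scaled in clamp_sqrt_and_accum_kernel. With the clamped form sqrt(fmax(v, eps)) and eps = 1e-8, a row whose absolute maximum is 0.0 gets a scaling factor of 1e-4, so its inverse scaling factor blows up to 1e4; with the guarded form (v < eps) ? 1.0 : sqrt(v), such a row is simply left unscaled. A minimal host-side C sketch of the two behaviors follows (illustration only, not part of the patches above; the helper names are made up for this example):

    #include <math.h>
    #include <stdio.h>

    /* pre-revert behavior: clamp the row statistic to epsilon, then take the square root */
    static double scale_clamped(double v, double eps) { return sqrt(fmax(v, eps)); }

    /* post-revert behavior: leave degenerate rows unscaled */
    static double scale_guarded(double v, double eps) { return (v < eps) ? 1.0 : sqrt(v); }

    int main(void)
    {
        double row_absmax = 0.0; /* e.g. a constraint row with no nonzero coefficients */
        printf("clamped: s = %g, 1/s = %g\n",
               scale_clamped(row_absmax, 1e-8), 1.0 / scale_clamped(row_absmax, 1e-8));
        printf("guarded: s = %g, 1/s = %g\n",
               scale_guarded(row_absmax, 1e-12), 1.0 / scale_guarded(row_absmax, 1e-12));
        return 0;
    }

The clamped variant prints s = 0.0001 and 1/s = 10000 (a large, artificial rescaling of an empty row), while the guarded variant prints s = 1 and 1/s = 1, which is the behavior the revert restores.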