From 2a31c9966e55f1db809b438391ce6015cf6e1ffd Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Fri, 14 Nov 2025 18:33:51 -0500 Subject: [PATCH 01/13] New feat: GPU precondition --- CMakeLists.txt | 2 +- internal/internal_types.h | 7 +- internal/preconditioner.h | 2 +- internal/utils.h | 2 + pyproject.toml | 2 +- src/preconditioner.c | 306 ------------------------ src/preconditioner.cu | 489 ++++++++++++++++++++++++++++++++++++++ src/solver.cu | 366 ++++++++++------------------ src/utils.cu | 34 +-- 9 files changed, 644 insertions(+), 566 deletions(-) delete mode 100644 src/preconditioner.c create mode 100644 src/preconditioner.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index e06c649..bd1a955 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) endif() # Increase compatibility by compiling for all supported real and virtual architectures. -set(CMAKE_CUDA_ARCHITECTURES all) +set(CMAKE_CUDA_ARCHITECTURES 60 70 75 80 86 89 90) # Global Compile flags (corresponding to CFLAGS/NVCCFLAGS) add_compile_options(-fPIC -O3 -Wall -Wextra -g) diff --git a/internal/internal_types.h b/internal/internal_types.h index e66600d..f0d8def 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -28,6 +28,7 @@ typedef struct int num_nonzeros; int *row_ptr; int *col_ind; + int *row_ind; double *val; } cu_sparse_matrix_csr_t; @@ -46,6 +47,7 @@ typedef struct int num_blocks_primal; int num_blocks_dual; int num_blocks_primal_dual; + int num_blocks_nnz; double objective_vector_norm; double constraint_bound_norm; double *constraint_lower_bound_finite_val; @@ -74,8 +76,6 @@ typedef struct double *constraint_rescaling; double *variable_rescaling; - double constraint_bound_rescaling; - double objective_vector_rescaling; double *primal_slack; double *dual_slack; double rescaling_time_sec; @@ -126,10 +126,7 @@ typedef struct typedef struct { - lp_problem_t *scaled_problem; double *con_rescale; double *var_rescale; - double con_bound_rescale; - double obj_vec_rescale; double rescaling_time_sec; } rescale_info_t; diff --git a/internal/preconditioner.h b/internal/preconditioner.h index 8feb664..71a5c2b 100644 --- a/internal/preconditioner.h +++ b/internal/preconditioner.h @@ -25,7 +25,7 @@ extern "C" rescale_info_t *rescale_problem( const pdhg_parameters_t *params, - const lp_problem_t *original_problem); + pdhg_solver_state_t *state); #ifdef __cplusplus } diff --git a/internal/utils.h b/internal/utils.h index 62da072..7b939ea 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -125,6 +125,8 @@ extern "C" int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, double **vals, int *nnz_out); + int* build_row_ind_from_row_ptr(int* row_ptr, int num_rows, int nnz); + #ifdef __cplusplus } diff --git a/pyproject.toml b/pyproject.toml index 9c65a6c..2a0a2c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "cupdlpx" -version = "0.1.2" +version = "0.1.3" description = "Python bindings for cuPDLPx (GPU-accelerated first-order LP solver)" readme = "README.md" license = { text = "Apache-2.0" } diff --git a/src/preconditioner.c b/src/preconditioner.c deleted file mode 100644 index 41b7542..0000000 --- a/src/preconditioner.c +++ /dev/null @@ -1,306 +0,0 @@ -/* -Copyright 2025 Haihao Lu - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#include "preconditioner.h" -#include "utils.h" -#include -#include -#include -#include - -#define SCALING_EPSILON 1e-12 - -static lp_problem_t *deepcopy_problem(const lp_problem_t *prob); -static void scale_problem(lp_problem_t *problem, const double *con_rescale, - const double *var_rescale); -static void ruiz_rescaling(lp_problem_t *problem, int num_iters, - double *cum_con_rescale, double *cum_var_rescale); -static void pock_chambolle_rescaling(lp_problem_t *problem, double alpha, - double *cum_con_rescale, - double *cum_var_rescale); - -static lp_problem_t *deepcopy_problem(const lp_problem_t *prob) -{ - lp_problem_t *new_prob = (lp_problem_t *)safe_malloc(sizeof(lp_problem_t)); - - new_prob->num_variables = prob->num_variables; - new_prob->num_constraints = prob->num_constraints; - new_prob->constraint_matrix_num_nonzeros = - prob->constraint_matrix_num_nonzeros; - new_prob->objective_constant = prob->objective_constant; - - size_t var_bytes = prob->num_variables * sizeof(double); - size_t con_bytes = prob->num_constraints * sizeof(double); - size_t nnz_bytes_val = prob->constraint_matrix_num_nonzeros * sizeof(double); - size_t nnz_bytes_col = prob->constraint_matrix_num_nonzeros * sizeof(int); - size_t row_ptr_bytes = (prob->num_constraints + 1) * sizeof(int); - - new_prob->variable_lower_bound = safe_malloc(var_bytes); - new_prob->variable_upper_bound = safe_malloc(var_bytes); - new_prob->objective_vector = safe_malloc(var_bytes); - new_prob->constraint_lower_bound = safe_malloc(con_bytes); - new_prob->constraint_upper_bound = safe_malloc(con_bytes); - new_prob->constraint_matrix_row_pointers = safe_malloc(row_ptr_bytes); - new_prob->constraint_matrix_col_indices = safe_malloc(nnz_bytes_col); - new_prob->constraint_matrix_values = safe_malloc(nnz_bytes_val); - - memcpy(new_prob->variable_lower_bound, prob->variable_lower_bound, var_bytes); - memcpy(new_prob->variable_upper_bound, prob->variable_upper_bound, var_bytes); - memcpy(new_prob->objective_vector, prob->objective_vector, var_bytes); - memcpy(new_prob->constraint_lower_bound, prob->constraint_lower_bound, - con_bytes); - memcpy(new_prob->constraint_upper_bound, prob->constraint_upper_bound, - con_bytes); - memcpy(new_prob->constraint_matrix_row_pointers, - prob->constraint_matrix_row_pointers, row_ptr_bytes); - memcpy(new_prob->constraint_matrix_col_indices, - prob->constraint_matrix_col_indices, nnz_bytes_col); - memcpy(new_prob->constraint_matrix_values, prob->constraint_matrix_values, - nnz_bytes_val); - - if (prob->primal_start) - { - new_prob->primal_start = safe_malloc(var_bytes); - memcpy(new_prob->primal_start, prob->primal_start, var_bytes); - } - else - { - new_prob->primal_start = NULL; - } - if (prob->dual_start) - { - new_prob->dual_start = safe_malloc(con_bytes); - memcpy(new_prob->dual_start, prob->dual_start, con_bytes); - } - else - { - new_prob->dual_start = NULL; - } - - return new_prob; -} - -static void scale_problem(lp_problem_t *problem, - const double *constraint_rescaling, - const double *variable_rescaling) -{ - for (int i = 0; i < problem->num_variables; ++i) - { - problem->objective_vector[i] /= 
variable_rescaling[i]; - problem->variable_upper_bound[i] *= variable_rescaling[i]; - problem->variable_lower_bound[i] *= variable_rescaling[i]; - } - for (int i = 0; i < problem->num_constraints; ++i) - { - problem->constraint_lower_bound[i] /= constraint_rescaling[i]; - problem->constraint_upper_bound[i] /= constraint_rescaling[i]; - } - - for (int row = 0; row < problem->num_constraints; ++row) - { - for (int nz_idx = problem->constraint_matrix_row_pointers[row]; - nz_idx < problem->constraint_matrix_row_pointers[row + 1]; ++nz_idx) - { - int col = problem->constraint_matrix_col_indices[nz_idx]; - problem->constraint_matrix_values[nz_idx] /= - (constraint_rescaling[row] * variable_rescaling[col]); - } - } -} - -static void ruiz_rescaling(lp_problem_t *problem, int num_iterations, - double *cum_constraint_rescaling, - double *cum_variable_rescaling) -{ - int num_cons = problem->num_constraints; - int num_vars = problem->num_variables; - double *con_rescale = safe_malloc(num_cons * sizeof(double)); - double *var_rescale = safe_malloc(num_vars * sizeof(double)); - - for (int iter = 0; iter < num_iterations; ++iter) - { - for (int i = 0; i < num_vars; ++i) - var_rescale[i] = 0.0; - for (int i = 0; i < num_cons; ++i) - con_rescale[i] = 0.0; - - for (int row = 0; row < num_cons; ++row) - { - for (int nz_idx = problem->constraint_matrix_row_pointers[row]; - nz_idx < problem->constraint_matrix_row_pointers[row + 1]; - ++nz_idx) - { - int col = problem->constraint_matrix_col_indices[nz_idx]; - if (col < 0 || col >= num_vars) - { - fprintf(stderr, - "Error: Invalid column index %d at nz_idx %d for row %d. " - "Must be in [0, %d).\n", - col, nz_idx, row, num_vars); - } - double val = fabs(problem->constraint_matrix_values[nz_idx]); - if (val > var_rescale[col]) - var_rescale[col] = val; - if (val > con_rescale[row]) - con_rescale[row] = val; - } - } - - for (int i = 0; i < num_vars; ++i) - var_rescale[i] = - (var_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(var_rescale[i]); - for (int i = 0; i < num_cons; ++i) - con_rescale[i] = - (con_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(con_rescale[i]); - - scale_problem(problem, con_rescale, var_rescale); - for (int i = 0; i < num_vars; ++i) - cum_variable_rescaling[i] *= var_rescale[i]; - for (int i = 0; i < num_cons; ++i) - cum_constraint_rescaling[i] *= con_rescale[i]; - } - free(con_rescale); - free(var_rescale); -} - -static void pock_chambolle_rescaling(lp_problem_t *problem, double alpha, - double *cum_constraint_rescaling, - double *cum_variable_rescaling) -{ - int num_cons = problem->num_constraints; - int num_vars = problem->num_variables; - double *con_rescale = safe_calloc(num_cons, sizeof(double)); - double *var_rescale = safe_calloc(num_vars, sizeof(double)); - - for (int row = 0; row < num_cons; ++row) - { - for (int nz_idx = problem->constraint_matrix_row_pointers[row]; - nz_idx < problem->constraint_matrix_row_pointers[row + 1]; ++nz_idx) - { - int col = problem->constraint_matrix_col_indices[nz_idx]; - double val = fabs(problem->constraint_matrix_values[nz_idx]); - var_rescale[col] += pow(val, 2.0 - alpha); - con_rescale[row] += pow(val, alpha); - } - } - - for (int i = 0; i < num_vars; ++i) - var_rescale[i] = - (var_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(var_rescale[i]); - for (int i = 0; i < num_cons; ++i) - con_rescale[i] = - (con_rescale[i] < SCALING_EPSILON) ? 
1.0 : sqrt(con_rescale[i]); - - scale_problem(problem, con_rescale, var_rescale); - for (int i = 0; i < num_vars; ++i) - cum_variable_rescaling[i] *= var_rescale[i]; - for (int i = 0; i < num_cons; ++i) - cum_constraint_rescaling[i] *= con_rescale[i]; - - free(con_rescale); - free(var_rescale); -} - -rescale_info_t *rescale_problem(const pdhg_parameters_t *params, - const lp_problem_t *original_problem) -{ - clock_t start_rescaling = clock(); - rescale_info_t *rescale_info = - (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); - rescale_info->scaled_problem = deepcopy_problem(original_problem); - if (rescale_info->scaled_problem == NULL) - { - fprintf(stderr, - "Failed to create a copy of the problem. Aborting rescale.\n"); - return NULL; - } - int num_cons = original_problem->num_constraints; - int num_vars = original_problem->num_variables; - - rescale_info->con_rescale = safe_malloc(num_cons * sizeof(double)); - rescale_info->var_rescale = safe_malloc(num_vars * sizeof(double)); - for (int i = 0; i < num_cons; ++i) - rescale_info->con_rescale[i] = 1.0; - for (int i = 0; i < num_vars; ++i) - rescale_info->var_rescale[i] = 1.0; - if (params->l_inf_ruiz_iterations > 0) - { - ruiz_rescaling(rescale_info->scaled_problem, params->l_inf_ruiz_iterations, - rescale_info->con_rescale, rescale_info->var_rescale); - } - if (params->has_pock_chambolle_alpha) - { - pock_chambolle_rescaling( - rescale_info->scaled_problem, params->pock_chambolle_alpha, - rescale_info->con_rescale, rescale_info->var_rescale); - } - if (params->bound_objective_rescaling) - { - double bound_norm_sq = 0.0; - for (int i = 0; i < num_cons; ++i) - { - if (isfinite(rescale_info->scaled_problem->constraint_lower_bound[i]) && - (rescale_info->scaled_problem->constraint_lower_bound[i] != - rescale_info->scaled_problem->constraint_upper_bound[i])) - { - bound_norm_sq += - rescale_info->scaled_problem->constraint_lower_bound[i] * - rescale_info->scaled_problem->constraint_lower_bound[i]; - } - if (isfinite(rescale_info->scaled_problem->constraint_upper_bound[i])) - { - bound_norm_sq += - rescale_info->scaled_problem->constraint_upper_bound[i] * - rescale_info->scaled_problem->constraint_upper_bound[i]; - } - } - - double obj_norm_sq = 0.0; - for (int i = 0; i < num_vars; ++i) - { - obj_norm_sq += rescale_info->scaled_problem->objective_vector[i] * - rescale_info->scaled_problem->objective_vector[i]; - } - - rescale_info->con_bound_rescale = 1.0 / (sqrt(bound_norm_sq) + 1.0); - rescale_info->obj_vec_rescale = 1.0 / (sqrt(obj_norm_sq) + 1.0); - - for (int i = 0; i < num_cons; ++i) - { - rescale_info->scaled_problem->constraint_lower_bound[i] *= - rescale_info->con_bound_rescale; - rescale_info->scaled_problem->constraint_upper_bound[i] *= - rescale_info->con_bound_rescale; - } - for (int i = 0; i < num_vars; ++i) - { - rescale_info->scaled_problem->variable_lower_bound[i] *= - rescale_info->con_bound_rescale; - rescale_info->scaled_problem->variable_upper_bound[i] *= - rescale_info->con_bound_rescale; - rescale_info->scaled_problem->objective_vector[i] *= - rescale_info->obj_vec_rescale; - } - } - else - { - rescale_info->con_bound_rescale = 1.0; - rescale_info->obj_vec_rescale = 1.0; - } - rescale_info->rescaling_time_sec = - (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; - return rescale_info; -} \ No newline at end of file diff --git a/src/preconditioner.cu b/src/preconditioner.cu new file mode 100644 index 0000000..e197905 --- /dev/null +++ b/src/preconditioner.cu @@ -0,0 +1,489 @@ +/* +Copyright 2025 Haihao Lu + 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "preconditioner.h" +#include "utils.h" +#include +#include +#include +#include +#include +#include + +#define SCALING_EPSILON 1e-12 + +__global__ void scale_variables_kernel(double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* __restrict__ var_ub_finite, + double* __restrict__ primal_start, + const double* __restrict__ D, + const double* __restrict__ invD, + int n); +__global__ void scale_constraints_kernel(double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + const double* __restrict__ E, + const double* __restrict__ invE, + int m); +__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, + const int* __restrict__ col_ind, + double* __restrict__ vals, + const double* __restrict__ invD, + const double* __restrict__ invE, + int nnz); +__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double* __restrict__ out_max); +__global__ void csr_col_absmax_atomic_kernel(const int* __restrict__ col_ind, + const double* __restrict__ vals, + int nnz, + unsigned long long* __restrict__ out_max_bits); +__global__ void u64bits_to_double(const unsigned long long* __restrict__ in_bits, + double* __restrict__ out_val, + int n); +__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double degree, + double* __restrict__ out_sum); +__global__ void csr_col_powsum_atomic_kernel(const int* __restrict__ col_ind, + const double* __restrict__ vals, + int nnz, + double degree, + double* __restrict__ out_sum); +__global__ void clamp_sqrt_and_accum(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + int n); +static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); +static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, + double *E, double *D, double *invE, double *invD); +static void pock_chambolle_rescaling(pdhg_solver_state_t *state, double alpha, rescale_info_t *rescale_info, + double *E, double *D, double *invE, double *invD); +static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info, + double *E, double *D, double *invE, double *invD); + +__global__ void scale_variables_kernel(double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* __restrict__ var_ub_finite, + double* __restrict__ primal_start, + const double* __restrict__ D, + const double* __restrict__ invD, + int n) +{ + int j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n) return; + double dj = D[j]; + double inv_dj = invD[j]; + c[j] *= inv_dj; + var_lb[j] *= dj; + var_ub[j] *= dj; + var_lb_finite[j] *= dj; + var_ub_finite[j] *= 
dj; + primal_start[j] *= dj; +} + +__global__ void scale_constraints_kernel(double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + const double* __restrict__ E, + const double* __restrict__ invE, + int m) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= m) return; + double inv_ei = invE[i]; + double ei = E[i]; + con_lb[i] *= inv_ei; + con_ub[i] *= inv_ei; + con_lb_finite[i] *= inv_ei; + con_ub_finite[i] *= inv_ei; + dual_start[i] *= ei; +} + +__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, + const int* __restrict__ col_ind, + double* __restrict__ vals, + const double* __restrict__ invD, + const double* __restrict__ invE, + int nnz) +{ + for (int k = blockIdx.x * blockDim.x + threadIdx.x; + k < nnz; + k += gridDim.x * blockDim.x) + { + int i = row_ids[k]; + int j = col_ind[k]; + vals[k] *= invD[j] * invE[i]; + } +} + +__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double* __restrict__ out_max) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + int s = row_ptr[i], e = row_ptr[i + 1]; + double m = 0.0; + for (int k = s; k < e; ++k) { + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + if (v > m) m = v; + } + out_max[i] = m; +} + +__global__ void csr_col_absmax_atomic_kernel(const int* __restrict__ col_ind, + const double* __restrict__ vals, + int nnz, + unsigned long long* __restrict__ out_max_bits) +{ + for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { + int j = col_ind[k]; + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + unsigned long long bits = __double_as_longlong(v); + atomicMax(&out_max_bits[j], bits); + } +} + +__global__ void u64bits_to_double(const unsigned long long* __restrict__ in_bits, + double* __restrict__ out_val, + int n) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) { + out_val[i] = __longlong_as_double(in_bits[i]); + } +} + +__device__ __forceinline__ double pow_fast(double v, double p) { + if (p == 2.0) return v * v; + if (p == 1.0) return v; + if (p == 0.5) return sqrt(v); + return pow(v, p); +} + +__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double degree, + double* __restrict__ out_sum) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + int s = row_ptr[i], e = row_ptr[i + 1]; + double acc = 0.0; + for (int k = s; k < e; ++k) { + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + acc += pow_fast(v, degree); + } + out_sum[i] = acc; +} + +__global__ void csr_col_powsum_atomic_kernel(const int* __restrict__ col_ind, + const double* __restrict__ vals, + int nnz, + double degree, + double* __restrict__ out_sum) +{ + for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { + int j = col_ind[k]; + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + double t = pow_fast(v, degree); + atomicAdd(&out_sum[j], t); + } +} + +__global__ void clamp_sqrt_and_accum(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + int n) +{ + for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n; t += blockDim.x * gridDim.x) + { + double v = x[t]; + double s = (v < SCALING_EPSILON) ? 
1.0 : sqrt(v);
+        cum[t] *= s;
+        x[t] = s;
+        inv_x[t] = 1.0 / s;
+    }
+}
+
+__global__ void reduce_bound_norm_sq_atomic(
+    const double* __restrict__ L,
+    const double* __restrict__ U,
+    int m,
+    double* __restrict__ out_sum)
+{
+    double acc = 0.0;
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) {
+        double Li = L[i], Ui = U[i];
+        bool fL = isfinite(Li), fU = isfinite(Ui);
+        if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) acc += Li * Li;
+        if (fU) acc += Ui * Ui;
+    }
+    atomicAdd(out_sum, acc);
+}
+
+static void scale_problem(
+    pdhg_solver_state_t *state,
+    double *E,
+    double *D,
+    double *invE,
+    double *invD)
+{
+    int n_vars = state->num_variables;
+    int n_cons = state->num_constraints;
+
+    scale_variables_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+        state->objective_vector,
+        state->variable_lower_bound,
+        state->variable_upper_bound,
+        state->variable_lower_bound_finite_val,
+        state->variable_upper_bound_finite_val,
+        state->initial_primal_solution,
+        D,
+        invD,
+        n_vars);
+
+    scale_constraints_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        state->constraint_lower_bound,
+        state->constraint_upper_bound,
+        state->constraint_lower_bound_finite_val,
+        state->constraint_upper_bound_finite_val,
+        state->initial_dual_solution,
+        E,
+        invE,
+        n_cons);
+
+    csr_scale_nnz_kernel<<<state->num_blocks_nnz, THREADS_PER_BLOCK>>>(
+        state->constraint_matrix->row_ind,
+        state->constraint_matrix->col_ind,
+        state->constraint_matrix->val,
+        invD,
+        invE,
+        state->constraint_matrix->num_nonzeros);
+}
+
+static void ruiz_rescaling(
+    pdhg_solver_state_t *state,
+    int num_iterations,
+    rescale_info_t *rescale_info,
+    double *E,
+    double *D,
+    double *invE,
+    double *invD)
+{
+    const int n_cons = state->num_constraints;
+    const int n_vars = state->num_variables;
+    const int nnz = state->constraint_matrix->num_nonzeros;
+
+    unsigned long long *D_bits = nullptr;
+    CUDA_CHECK(cudaMalloc(&D_bits, n_vars * sizeof(unsigned long long)));
+
+    for (int iter = 0; iter < num_iterations; ++iter)
+    {
+        csr_row_absmax_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+            state->constraint_matrix->row_ptr,
+            state->constraint_matrix->val,
+            n_cons,
+            E);
+        clamp_sqrt_and_accum<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+            E,
+            invE,
+            rescale_info->con_rescale,
+            n_cons);
+
+        CUDA_CHECK(cudaMemset(D_bits, 0, n_vars * sizeof(unsigned long long)));
+        csr_col_absmax_atomic_kernel<<<state->num_blocks_nnz, THREADS_PER_BLOCK>>>(
+            state->constraint_matrix->col_ind,
+            state->constraint_matrix->val,
+            nnz,
+            D_bits);
+        u64bits_to_double<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(D_bits, D, n_vars);
+        clamp_sqrt_and_accum<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+            D,
+            invD,
+            rescale_info->var_rescale,
+            n_vars);
+
+        scale_problem(state, E, D, invE, invD);
+    }
+
+    CUDA_CHECK(cudaFree(D_bits));
+}
+
+static void pock_chambolle_rescaling(
+    pdhg_solver_state_t *state,
+    const double alpha,
+    rescale_info_t *rescale_info,
+    double *E,
+    double *D,
+    double *invE,
+    double *invD)
+{
+    const int n_cons = state->num_constraints;
+    const int n_vars = state->num_variables;
+    const int nnz = state->constraint_matrix->num_nonzeros;
+
+    csr_row_powsum_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        state->constraint_matrix->row_ptr,
+        state->constraint_matrix->val,
+        n_cons,
+        alpha,
+        E);
+    clamp_sqrt_and_accum<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        E,
+        invE,
+        rescale_info->con_rescale,
+        n_cons);
+
+    CUDA_CHECK(cudaMemset(D, 0, n_vars * sizeof(double)));
+    csr_col_powsum_atomic_kernel<<<state->num_blocks_nnz, THREADS_PER_BLOCK>>>(
+        state->constraint_matrix->col_ind,
+        state->constraint_matrix->val,
+        nnz,
+        2.0 - alpha,
+        D);
+    clamp_sqrt_and_accum<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+        D,
+        invD,
+        rescale_info->var_rescale,
+        n_vars);
+
+    scale_problem(state, E, D, invE, invD);
+
+}
+
+static void bound_objective_rescaling(
+    pdhg_solver_state_t *state,
+    rescale_info_t *rescale_info,
+    double *E,
+    double *D,
+    double *invE,
+    double *invD
+    )
+{
+    const int n_cons = state->num_constraints;
+    const int n_vars = state->num_variables;
+
+    double *bnd_norm_sq_cuda = nullptr;
+    CUDA_CHECK(cudaMalloc(&bnd_norm_sq_cuda, sizeof(double)));
+    CUDA_CHECK(cudaMemset(bnd_norm_sq_cuda, 0, sizeof(double)));
+    reduce_bound_norm_sq_atomic<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        state->constraint_lower_bound,
+        state->constraint_upper_bound,
+        n_cons,
+        bnd_norm_sq_cuda);
+
+    double bnd_norm_sq = 0.0;
+    CUDA_CHECK(cudaMemcpy(&bnd_norm_sq, bnd_norm_sq_cuda, sizeof(double), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaFree(bnd_norm_sq_cuda));
+    double bnd_norm = sqrt(bnd_norm_sq);
+
+    double obj_norm = 0.0;
+    CUBLAS_CHECK(cublasDnrm2(state->blas_handle,
+                             state->num_variables,
+                             state->objective_vector, 1,
+                             &obj_norm));
+
+    const double E_const = bnd_norm + 1.0;
+    const double D_const = obj_norm + 1.0;
+    {
+        std::vector<double> h1(n_cons, E_const), h2(n_vars, D_const);
+        CUDA_CHECK(cudaMemcpy(E, h1.data(), n_cons * sizeof(double), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(D, h2.data(), n_vars * sizeof(double), cudaMemcpyHostToDevice));
+        std::vector<double> h3(n_cons, 1 / E_const), h4(n_vars, 1 / D_const);
+        CUDA_CHECK(cudaMemcpy(invE, h3.data(), n_cons * sizeof(double), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(invD, h4.data(), n_vars * sizeof(double), cudaMemcpyHostToDevice));
+    }
+
+    CUBLAS_CHECK(cublasDscal(state->blas_handle,
+                             n_cons,
+                             &E_const,
+                             rescale_info->con_rescale,
+                             1));
+    CUBLAS_CHECK(cublasDscal(state->blas_handle,
+                             n_vars,
+                             &D_const,
+                             rescale_info->var_rescale,
+                             1));
+
+    scale_problem(state, E, D, invE, invD);
+}
+
+rescale_info_t *rescale_problem(
+    const pdhg_parameters_t *params,
+    pdhg_solver_state_t *state)
+{
+    int n_vars = state->num_variables;
+    int n_cons = state->num_constraints;
+
+    clock_t start_rescaling = clock();
+    rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t));
+    rescale_info->con_rescale = nullptr;
+    rescale_info->var_rescale = nullptr;
+    CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars * sizeof(double)));
+    {
+        std::vector<double> h1(n_cons, 1.0), h2(n_vars, 1.0);
+        CUDA_CHECK(cudaMemcpy(rescale_info->con_rescale, h1.data(),
+                              n_cons * sizeof(double), cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(rescale_info->var_rescale, h2.data(),
+                              n_vars * sizeof(double), cudaMemcpyHostToDevice));
+    }
+
+    double *E = nullptr, *D = nullptr, *invE = nullptr, *invD = nullptr;
+    CUDA_CHECK(cudaMalloc(&E, n_cons * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&D, n_vars * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&invE, n_cons * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&invD, n_vars * sizeof(double)));
+
+    if (params->l_inf_ruiz_iterations > 0)
+    {
+        ruiz_rescaling(state, params->l_inf_ruiz_iterations, rescale_info, E, D, invE, invD);
+    }
+    if (params->has_pock_chambolle_alpha)
+    {
+        pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, E, D, invE, invD);
+    }
+    if (params->bound_objective_rescaling)
+    {
+        bound_objective_rescaling(state, rescale_info, E, D, invE, invD);
+    }
+
+
rescale_info->rescaling_time_sec = (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; + + CUDA_CHECK(cudaFree(E)); + CUDA_CHECK(cudaFree(D)); + CUDA_CHECK(cudaFree(invE)); + CUDA_CHECK(cudaFree(invD)); + + return rescale_info; +} \ No newline at end of file diff --git a/src/solver.cu b/src/solver.cu index 2880e9a..300ce9f 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -28,52 +28,40 @@ limitations under the License. #include __global__ void compute_next_pdhg_primal_solution_kernel( - const double *current_primal, double *reflected_primal, - const double *dual_product, const double *objective, const double *var_lb, - const double *var_ub, int n, double step_size); + const double *current_primal, double *reflected_primal, const double *dual_product, + const double *objective, const double *var_lb, const double *var_ub, + int n, double step_size); __global__ void compute_next_pdhg_primal_solution_major_kernel( const double *current_primal, double *pdhg_primal, double *reflected_primal, const double *dual_product, const double *objective, const double *var_lb, const double *var_ub, int n, double step_size, double *dual_slack); __global__ void compute_next_pdhg_dual_solution_kernel( - const double *current_dual, double *reflected_dual, - const double *primal_product, const double *const_lb, - const double *const_ub, int n, double step_size); + const double *current_dual, double *reflected_dual, const double *primal_product, + const double *const_lb, const double *const_ub, int n, double step_size); __global__ void compute_next_pdhg_dual_solution_major_kernel( const double *current_dual, double *pdhg_dual, double *reflected_dual, - const double *primal_product, const double *const_lb, - const double *const_ub, int n, double step_size); -__global__ void -halpern_update_kernel(const double *initial_primal, double *current_primal, - const double *reflected_primal, - const double *initial_dual, double *current_dual, - const double *reflected_dual, int n_vars, int n_cons, - double weight, double reflection_coeff); -__global__ void rescale_solution_kernel(double *primal_solution, - double *dual_solution, - const double *variable_rescaling, - const double *constraint_rescaling, - const double objective_vector_rescaling, - const double constraint_bound_rescaling, - int n_vars, int n_cons); + const double *primal_product, const double *const_lb, const double *const_ub, + int n, double step_size); +__global__ void halpern_update_kernel( + const double *initial_primal, double *current_primal, const double *reflected_primal, + const double *initial_dual, double *current_dual, const double *reflected_dual, + int n_vars, int n_cons, double weight, double reflection_coeff); +__global__ void rescale_solution_kernel( + double *primal_solution, double *dual_solution, + const double *variable_rescaling, const double *constraint_rescaling, + int n_vars, int n_cons); __global__ void compute_delta_solution_kernel( - const double *initial_primal, const double *pdhg_primal, - double *delta_primal, const double *initial_dual, const double *pdhg_dual, - double *delta_dual, int n_vars, int n_cons); + const double *initial_primal, const double *pdhg_primal, double *delta_primal, + const double *initial_dual, const double *pdhg_dual, double *delta_dual, + int n_vars, int n_cons); static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state); static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state); -static void halpern_update(pdhg_solver_state_t *state, - double reflection_coefficient); 
+static void halpern_update(pdhg_solver_state_t *state, double reflection_coefficient); static void rescale_solution(pdhg_solver_state_t *state); static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state); -static void perform_restart(pdhg_solver_state_t *state, - const pdhg_parameters_t *params); -static void -initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, - const pdhg_parameters_t *params); -static pdhg_solver_state_t * -initialize_solver_state(const lp_problem_t *original_problem, - const rescale_info_t *rescale_info); +static void perform_restart(pdhg_solver_state_t *state, const pdhg_parameters_t *params); +static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, const pdhg_parameters_t *params); +static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *original_problem, const pdhg_parameters_t* params); static void compute_fixed_point_error(pdhg_solver_state_t *state); void lp_problem_free(lp_problem_t *prob); void pdhg_solver_state_free(pdhg_solver_state_t *state); @@ -83,11 +71,8 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, const lp_problem_t *original_problem) { print_initial_info(params, original_problem); - rescale_info_t *rescale_info = rescale_problem(params, original_problem); - pdhg_solver_state_t *state = - initialize_solver_state(original_problem, rescale_info); + pdhg_solver_state_t *state = initialize_solver_state(original_problem, params); - rescale_info_free(rescale_info); initialize_step_size_and_primal_weight(state, params); clock_t start_time = clock(); bool do_restart = false; @@ -147,15 +132,15 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, return results; } -static pdhg_solver_state_t * -initialize_solver_state(const lp_problem_t *original_problem, - const rescale_info_t *rescale_info) +static pdhg_solver_state_t *initialize_solver_state( + const lp_problem_t *original_problem, + const pdhg_parameters_t* params) { - pdhg_solver_state_t *state = - (pdhg_solver_state_t *)safe_calloc(1, sizeof(pdhg_solver_state_t)); + pdhg_solver_state_t *state = (pdhg_solver_state_t *)safe_calloc(1, sizeof(pdhg_solver_state_t)); int n_vars = original_problem->num_variables; int n_cons = original_problem->num_constraints; + int nnz = original_problem->constraint_matrix_num_nonzeros; size_t var_bytes = n_vars * sizeof(double); size_t con_bytes = n_cons * sizeof(double); @@ -163,99 +148,60 @@ initialize_solver_state(const lp_problem_t *original_problem, state->num_constraints = n_cons; state->objective_constant = original_problem->objective_constant; - state->constraint_matrix = - (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); - state->constraint_matrix_t = - (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); + state->constraint_matrix = (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); + state->constraint_matrix_t = (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); state->constraint_matrix->num_rows = n_cons; state->constraint_matrix->num_cols = n_vars; - state->constraint_matrix->num_nonzeros = - original_problem->constraint_matrix_num_nonzeros; + state->constraint_matrix->num_nonzeros = original_problem->constraint_matrix_num_nonzeros; state->constraint_matrix_t->num_rows = n_vars; state->constraint_matrix_t->num_cols = n_cons; - state->constraint_matrix_t->num_nonzeros = - original_problem->constraint_matrix_num_nonzeros; + state->constraint_matrix_t->num_nonzeros = 
original_problem->constraint_matrix_num_nonzeros; state->termination_reason = TERMINATION_REASON_UNSPECIFIED; - state->rescaling_time_sec = rescale_info->rescaling_time_sec; + state->num_blocks_primal = (state->num_variables + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + state->num_blocks_dual = (state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + state->num_blocks_primal_dual = (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + state->num_blocks_nnz = (state->constraint_matrix->num_nonzeros + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + CUSPARSE_CHECK(cusparseCreate(&state->sparse_handle)); + CUBLAS_CHECK(cublasCreate(&state->blas_handle)); + CUBLAS_CHECK(cublasSetPointerMode(state->blas_handle, CUBLAS_POINTER_MODE_HOST)); #define ALLOC_AND_COPY(dest, src, bytes) \ CUDA_CHECK(cudaMalloc(&dest, bytes)); \ CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyHostToDevice)); - ALLOC_AND_COPY(state->constraint_matrix->row_ptr, - rescale_info->scaled_problem->constraint_matrix_row_pointers, - (n_cons + 1) * sizeof(int)); - ALLOC_AND_COPY(state->constraint_matrix->col_ind, - rescale_info->scaled_problem->constraint_matrix_col_indices, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(int)); - ALLOC_AND_COPY(state->constraint_matrix->val, - rescale_info->scaled_problem->constraint_matrix_values, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(double)); - - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, - (n_vars + 1) * sizeof(int))); - CUDA_CHECK( - cudaMalloc(&state->constraint_matrix_t->col_ind, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(int))); - CUDA_CHECK( - cudaMalloc(&state->constraint_matrix_t->val, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(double))); + ALLOC_AND_COPY(state->constraint_matrix->row_ptr, original_problem->constraint_matrix_row_pointers, (n_cons + 1) * sizeof(int)); + ALLOC_AND_COPY(state->constraint_matrix->col_ind, original_problem->constraint_matrix_col_indices, original_problem->constraint_matrix_num_nonzeros * sizeof(int)); + ALLOC_AND_COPY(state->constraint_matrix->val, original_problem->constraint_matrix_values, original_problem->constraint_matrix_num_nonzeros * sizeof(double)); - CUSPARSE_CHECK(cusparseCreate(&state->sparse_handle)); - CUBLAS_CHECK(cublasCreate(&state->blas_handle)); - CUBLAS_CHECK( - cublasSetPointerMode(state->blas_handle, CUBLAS_POINTER_MODE_HOST)); + int* h_row_ind = build_row_ind_from_row_ptr(original_problem->constraint_matrix_row_pointers, n_cons, nnz); + ALLOC_AND_COPY(state->constraint_matrix->row_ind, h_row_ind, nnz * sizeof(int)); + free(h_row_ind); - size_t buffer_size = 0; - void *buffer = nullptr; - CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize( - state->sparse_handle, state->constraint_matrix->num_rows, - state->constraint_matrix->num_cols, - state->constraint_matrix->num_nonzeros, state->constraint_matrix->val, - state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, - state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, - state->constraint_matrix_t->col_ind, CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size)); - CUDA_CHECK(cudaMalloc(&buffer, buffer_size)); + ALLOC_AND_COPY(state->variable_lower_bound, original_problem->variable_lower_bound, var_bytes); + ALLOC_AND_COPY(state->variable_upper_bound, original_problem->variable_upper_bound, var_bytes); + 
ALLOC_AND_COPY(state->objective_vector, original_problem->objective_vector, var_bytes); + ALLOC_AND_COPY(state->constraint_lower_bound, original_problem->constraint_lower_bound, con_bytes); + ALLOC_AND_COPY(state->constraint_upper_bound, original_problem->constraint_upper_bound, con_bytes); - CUSPARSE_CHECK(cusparseCsr2cscEx2( - state->sparse_handle, state->constraint_matrix->num_rows, - state->constraint_matrix->num_cols, - state->constraint_matrix->num_nonzeros, state->constraint_matrix->val, - state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, - state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, - state->constraint_matrix_t->col_ind, CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer)); - - CUDA_CHECK(cudaFree(buffer)); - - ALLOC_AND_COPY(state->variable_lower_bound, - rescale_info->scaled_problem->variable_lower_bound, var_bytes); - ALLOC_AND_COPY(state->variable_upper_bound, - rescale_info->scaled_problem->variable_upper_bound, var_bytes); - ALLOC_AND_COPY(state->objective_vector, - rescale_info->scaled_problem->objective_vector, var_bytes); - ALLOC_AND_COPY(state->constraint_lower_bound, - rescale_info->scaled_problem->constraint_lower_bound, - con_bytes); - ALLOC_AND_COPY(state->constraint_upper_bound, - rescale_info->scaled_problem->constraint_upper_bound, - con_bytes); - ALLOC_AND_COPY(state->constraint_rescaling, rescale_info->con_rescale, - con_bytes); - ALLOC_AND_COPY(state->variable_rescaling, rescale_info->var_rescale, - var_bytes); - - state->constraint_bound_rescaling = rescale_info->con_bound_rescale; - state->objective_vector_rescaling = rescale_info->obj_vec_rescale; + double *temp_host = (double *)safe_malloc(fmax(var_bytes, con_bytes)); + for (int i = 0; i < n_cons; ++i) + temp_host[i] = isfinite(original_problem->constraint_lower_bound[i]) ? original_problem->constraint_lower_bound[i] : 0.0; + ALLOC_AND_COPY(state->constraint_lower_bound_finite_val, temp_host, con_bytes); + for (int i = 0; i < n_cons; ++i) + temp_host[i] = isfinite(original_problem->constraint_upper_bound[i]) ? original_problem->constraint_upper_bound[i] : 0.0; + ALLOC_AND_COPY(state->constraint_upper_bound_finite_val, temp_host, con_bytes); + for (int i = 0; i < n_vars; ++i) + temp_host[i] = isfinite(original_problem->variable_lower_bound[i]) ? original_problem->variable_lower_bound[i] : 0.0; + ALLOC_AND_COPY(state->variable_lower_bound_finite_val, temp_host, var_bytes); + for (int i = 0; i < n_vars; ++i) + temp_host[i] = isfinite(original_problem->variable_upper_bound[i]) ? 
original_problem->variable_upper_bound[i] : 0.0; + ALLOC_AND_COPY(state->variable_upper_bound_finite_val, temp_host, var_bytes); + free(temp_host); #define ALLOC_ZERO(dest, bytes) \ CUDA_CHECK(cudaMalloc(&dest, bytes)); \ @@ -281,62 +227,56 @@ initialize_solver_state(const lp_problem_t *original_problem, if (original_problem->primal_start) { - double *rescaled = (double *)safe_malloc(var_bytes); - for (int i = 0; i < n_vars; ++i) - rescaled[i] = original_problem->primal_start[i] * - rescale_info->var_rescale[i] * - rescale_info->con_bound_rescale; - CUDA_CHECK(cudaMemcpy(state->initial_primal_solution, rescaled, var_bytes, - cudaMemcpyHostToDevice)); - free(rescaled); + CUDA_CHECK(cudaMemcpy(state->initial_primal_solution, + original_problem->primal_start, + var_bytes, cudaMemcpyHostToDevice)); } if (original_problem->dual_start) { - double *rescaled = (double *)safe_malloc(con_bytes); - for (int i = 0; i < n_cons; ++i) - rescaled[i] = original_problem->dual_start[i] * - rescale_info->con_rescale[i] * - rescale_info->obj_vec_rescale; - CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, rescaled, con_bytes, - cudaMemcpyHostToDevice)); - free(rescaled); + CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, + original_problem->dual_start, + con_bytes, cudaMemcpyHostToDevice)); } - double *temp_host = (double *)safe_malloc(fmax(var_bytes, con_bytes)); - for (int i = 0; i < n_cons; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->constraint_lower_bound[i]) - ? rescale_info->scaled_problem->constraint_lower_bound[i] - : 0.0; - ALLOC_AND_COPY(state->constraint_lower_bound_finite_val, temp_host, - con_bytes); - for (int i = 0; i < n_cons; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->constraint_upper_bound[i]) - ? rescale_info->scaled_problem->constraint_upper_bound[i] - : 0.0; - ALLOC_AND_COPY(state->constraint_upper_bound_finite_val, temp_host, - con_bytes); - for (int i = 0; i < n_vars; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->variable_lower_bound[i]) - ? rescale_info->scaled_problem->variable_lower_bound[i] - : 0.0; - ALLOC_AND_COPY(state->variable_lower_bound_finite_val, temp_host, var_bytes); - for (int i = 0; i < n_vars; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->variable_upper_bound[i]) - ? 
rescale_info->scaled_problem->variable_upper_bound[i] - : 0.0; - ALLOC_AND_COPY(state->variable_upper_bound_finite_val, temp_host, var_bytes); - free(temp_host); + rescale_info_t *rescale_info = rescale_problem(params, state); + + state->constraint_rescaling = rescale_info->con_rescale; + state->variable_rescaling = rescale_info->var_rescale; + state->rescaling_time_sec = rescale_info->rescaling_time_sec; + + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, (n_vars + 1) * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind, original_problem->constraint_matrix_num_nonzeros * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->val, original_problem->constraint_matrix_num_nonzeros * sizeof(double))); + + size_t buffer_size = 0; + void *buffer = nullptr; + CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize( + state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, + state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, + state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, + CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size)); + CUDA_CHECK(cudaMalloc(&buffer, buffer_size)); + + CUSPARSE_CHECK(cusparseCsr2cscEx2( + state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, + state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, + state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, + CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer)); + state->constraint_matrix_t->row_ind = NULL; + + CUDA_CHECK(cudaFree(buffer)); + + rescale_info->con_rescale = NULL; + rescale_info->var_rescale = NULL; + rescale_info_free(rescale_info); double sum_of_squares = 0.0; for (int i = 0; i < n_vars; ++i) { - sum_of_squares += original_problem->objective_vector[i] * - original_problem->objective_vector[i]; + sum_of_squares += original_problem->objective_vector[i] * original_problem->objective_vector[i]; } state->objective_vector_norm = sqrt(sum_of_squares); @@ -357,15 +297,7 @@ initialize_solver_state(const lp_problem_t *original_problem, sum_of_squares += upper * upper; } } - state->constraint_bound_norm = sqrt(sum_of_squares); - state->num_blocks_primal = - (state->num_variables + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - state->num_blocks_dual = - (state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - state->num_blocks_primal_dual = - (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) / - THREADS_PER_BLOCK; state->best_primal_dual_residual_gap = INFINITY; state->last_trial_fixed_point_error = INFINITY; @@ -375,77 +307,42 @@ initialize_solver_state(const lp_problem_t *original_problem, size_t primal_spmv_buffer_size; size_t dual_spmv_buffer_size; - CUSPARSE_CHECK(cusparseCreateCsr( - &state->matA, state->num_constraints, state->num_variables, - state->constraint_matrix->num_nonzeros, state->constraint_matrix->row_ptr, - state->constraint_matrix->col_ind, state->constraint_matrix->val, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F)); - + CUSPARSE_CHECK(cusparseCreateCsr(&state->matA, state->num_constraints, state->num_variables, 
state->constraint_matrix->num_nonzeros, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, state->constraint_matrix->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); CUDA_CHECK(cudaGetLastError()); - CUSPARSE_CHECK(cusparseCreateCsr( - &state->matAt, state->num_variables, state->num_constraints, - state->constraint_matrix_t->num_nonzeros, - state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, - state->constraint_matrix_t->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateCsr(&state->matAt, state->num_variables, state->num_constraints, state->constraint_matrix_t->num_nonzeros, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, state->constraint_matrix_t->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); CUDA_CHECK(cudaGetLastError()); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_sol, - state->num_variables, - state->pdhg_primal_solution, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_sol, - state->num_constraints, - state->pdhg_dual_solution, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, - state->num_constraints, - state->primal_product, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, - state->num_variables, state->dual_product, - CUDA_R_64F)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_sol, state->num_variables, state->pdhg_primal_solution, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_sol, state->num_constraints, state->pdhg_dual_solution, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size)); + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size)); CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_preprocess( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer)); + CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, + CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer)); CUDA_CHECK(cudaMalloc(&state->dual_spmv_buffer, 
dual_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_preprocess( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer)); + CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, + CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer)); - CUDA_CHECK( - cudaMalloc(&state->ones_primal_d, state->num_variables * sizeof(double))); - CUDA_CHECK( - cudaMalloc(&state->ones_dual_d, state->num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&state->ones_primal_d, state->num_variables * sizeof(double))); + CUDA_CHECK(cudaMalloc(&state->ones_dual_d, state->num_constraints * sizeof(double))); - double *ones_primal_h = - (double *)safe_malloc(state->num_variables * sizeof(double)); + double *ones_primal_h = (double *)safe_malloc(state->num_variables * sizeof(double)); for (int i = 0; i < state->num_variables; ++i) ones_primal_h[i] = 1.0; - CUDA_CHECK(cudaMemcpy(state->ones_primal_d, ones_primal_h, - state->num_variables * sizeof(double), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(state->ones_primal_d, ones_primal_h, state->num_variables * sizeof(double), cudaMemcpyHostToDevice)); free(ones_primal_h); - double *ones_dual_h = - (double *)safe_malloc(state->num_constraints * sizeof(double)); + double *ones_dual_h = (double *)safe_malloc(state->num_constraints * sizeof(double)); for (int i = 0; i < state->num_constraints; ++i) ones_dual_h[i] = 1.0; - CUDA_CHECK(cudaMemcpy(state->ones_dual_d, ones_dual_h, - state->num_constraints * sizeof(double), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(state->ones_dual_d, ones_dual_h, state->num_constraints * sizeof(double), cudaMemcpyHostToDevice)); free(ones_dual_h); return state; @@ -538,21 +435,18 @@ __global__ void rescale_solution_kernel(double *primal_solution, double *dual_solution, const double *variable_rescaling, const double *constraint_rescaling, - const double objective_vector_rescaling, - const double constraint_bound_rescaling, int n_vars, int n_cons) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n_vars) { primal_solution[i] = - primal_solution[i] / variable_rescaling[i] / constraint_bound_rescaling; + primal_solution[i] / variable_rescaling[i]; } else if (i < n_vars + n_cons) { int idx = i - n_vars; - dual_solution[idx] = dual_solution[idx] / constraint_rescaling[idx] / - objective_vector_rescaling; + dual_solution[idx] = dual_solution[idx] / constraint_rescaling[idx]; } } @@ -662,7 +556,6 @@ static void rescale_solution(pdhg_solver_state_t *state) rescale_solution_kernel<<num_blocks_primal_dual, THREADS_PER_BLOCK>>>( state->pdhg_primal_solution, state->pdhg_dual_solution, state->variable_rescaling, state->constraint_rescaling, - state->objective_vector_rescaling, state->constraint_bound_rescaling, state->num_variables, state->num_constraints); } @@ -882,9 +775,8 @@ void rescale_info_free(rescale_info_t *info) return; } - lp_problem_free(info->scaled_problem); - free(info->con_rescale); - free(info->var_rescale); + CUDA_CHECK(cudaFree(info->con_rescale)); + CUDA_CHECK(cudaFree(info->var_rescale)); free(info); } diff --git a/src/utils.cu b/src/utils.cu index 83ba194..0de6c4b 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -596,19 +596,14 @@ void compute_residual(pdhg_solver_state_t *state) CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, 
state->num_constraints, state->primal_residual, 1, &state->absolute_primal_residual)); - state->absolute_primal_residual /= state->constraint_bound_rescaling; CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_variables, state->dual_residual, 1, &state->absolute_dual_residual)); - state->absolute_dual_residual /= state->objective_vector_rescaling; CUBLAS_CHECK(cublasDdot( state->blas_handle, state->num_variables, state->objective_vector, 1, state->pdhg_primal_solution, 1, &state->primal_objective_value)); - state->primal_objective_value = - state->primal_objective_value / (state->constraint_bound_rescaling * - state->objective_vector_rescaling) + - state->objective_constant; + state->primal_objective_value = state->primal_objective_value + state->objective_constant; double base_dual_objective; CUBLAS_CHECK(cublasDdot(state->blas_handle, state->num_variables, @@ -617,10 +612,7 @@ void compute_residual(pdhg_solver_state_t *state) double dual_slack_sum = get_vector_sum(state->blas_handle, state->num_constraints, state->ones_dual_d, state->primal_slack); - state->dual_objective_value = (base_dual_objective + dual_slack_sum) / - (state->constraint_bound_rescaling * - state->objective_vector_rescaling) + - state->objective_constant; + state->dual_objective_value = base_dual_objective + dual_slack_sum + state->objective_constant; state->relative_primal_residual = state->absolute_primal_residual / (1.0 + state->constraint_bound_norm); @@ -677,8 +669,6 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) CUBLAS_CHECK(cublasDdot( state->blas_handle, state->num_variables, state->objective_vector, 1, state->delta_primal_solution, 1, &state->primal_ray_linear_objective)); - state->primal_ray_linear_objective /= - (state->constraint_bound_rescaling * state->objective_vector_rescaling); dual_solution_dual_objective_contribution_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( @@ -698,9 +688,7 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) double sum_dual_slack = get_vector_sum(state->blas_handle, state->num_variables, state->ones_primal_d, state->dual_slack); - state->dual_ray_objective = - (sum_primal_slack + sum_dual_slack) / - (state->constraint_bound_rescaling * state->objective_vector_rescaling); + state->dual_ray_objective = sum_primal_slack + sum_dual_slack; compute_primal_infeasibility_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( @@ -961,3 +949,19 @@ int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, *nnz_out = nnz; return 0; } + +int* build_row_ind_from_row_ptr(int* row_ptr, int num_rows, int nnz) +{ + if (!row_ptr || num_rows < 0 || nnz <= 0) return NULL; + + int* row_ind = (int*)safe_malloc(nnz * sizeof(int)); + for (int i = 0; i < num_rows; ++i) { + int s = row_ptr[i]; + int e = row_ptr[i + 1]; + for (int k = s; k < e; ++k) { + row_ind[k] = i; + } + } + return row_ind; +} + From 5a993cd41d35f5a0c1597251c76d9a5b0017241c Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Sat, 15 Nov 2025 03:53:04 -0500 Subject: [PATCH 02/13] New feat: synchronized A/At scaling --- internal/internal_types.h | 1 + internal/utils.h | 2 - src/preconditioner.cu | 424 +++++++++++++++++--------------------- src/solver.cu | 170 ++++++++++++--- src/utils.cu | 15 -- 5 files changed, 332 insertions(+), 280 deletions(-) diff --git a/internal/internal_types.h b/internal/internal_types.h index f0d8def..177a293 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -30,6 +30,7 @@ typedef struct int *col_ind; int *row_ind; double *val; + int 
*transpose_pos; } cu_sparse_matrix_csr_t; typedef struct diff --git a/internal/utils.h b/internal/utils.h index 7b939ea..62da072 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -125,8 +125,6 @@ extern "C" int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, double **vals, int *nnz_out); - int* build_row_ind_from_row_ptr(int* row_ptr, int num_rows, int nnz); - #ifdef __cplusplus } diff --git a/src/preconditioner.cu b/src/preconditioner.cu index e197905..ec3a6f2 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -19,7 +19,6 @@ limitations under the License. #include #include #include -#include #include #include @@ -44,7 +43,9 @@ __global__ void scale_constraints_kernel(double* __restrict__ con_lb, int m); __global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, const int* __restrict__ col_ind, - double* __restrict__ vals, + double* __restrict__ A_vals, + double* __restrict__ At_vals, + const int* __restrict__ A_to_At, const double* __restrict__ invD, const double* __restrict__ invE, int nnz); @@ -52,205 +53,34 @@ __global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, const double* __restrict__ vals, int num_rows, double* __restrict__ out_max); -__global__ void csr_col_absmax_atomic_kernel(const int* __restrict__ col_ind, - const double* __restrict__ vals, - int nnz, - unsigned long long* __restrict__ out_max_bits); -__global__ void u64bits_to_double(const unsigned long long* __restrict__ in_bits, - double* __restrict__ out_val, - int n); __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, const double* __restrict__ vals, int num_rows, double degree, double* __restrict__ out_sum); -__global__ void csr_col_powsum_atomic_kernel(const int* __restrict__ col_ind, - const double* __restrict__ vals, - int nnz, - double degree, - double* __restrict__ out_sum); __global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, double* __restrict__ cum, int n); +__global__ void reduce_bound_norm_sq_atomic( + const double* __restrict__ L, + const double* __restrict__ U, + int m, + double* __restrict__ out_sum); +__global__ void fill_const_and_accum_kernel(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + double val, + int n); static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); +__global__ void fill_ones_kernel(double* __restrict__ x, int n); static void pock_chambolle_rescaling(pdhg_solver_state_t *state, double alpha, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); -__global__ void scale_variables_kernel(double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, - const double* __restrict__ D, - const double* __restrict__ invD, - int n) -{ - int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n) return; - double dj = D[j]; - double inv_dj = invD[j]; - c[j] *= inv_dj; - var_lb[j] *= dj; - var_ub[j] *= dj; - var_lb_finite[j] *= dj; - var_ub_finite[j] *= dj; - primal_start[j] *= dj; -} - -__global__ void scale_constraints_kernel(double* 
__restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, - const double* __restrict__ E, - const double* __restrict__ invE, - int m) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= m) return; - double inv_ei = invE[i]; - double ei = E[i]; - con_lb[i] *= inv_ei; - con_ub[i] *= inv_ei; - con_lb_finite[i] *= inv_ei; - con_ub_finite[i] *= inv_ei; - dual_start[i] *= ei; -} - -__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, - const int* __restrict__ col_ind, - double* __restrict__ vals, - const double* __restrict__ invD, - const double* __restrict__ invE, - int nnz) -{ - for (int k = blockIdx.x * blockDim.x + threadIdx.x; - k < nnz; - k += gridDim.x * blockDim.x) - { - int i = row_ids[k]; - int j = col_ind[k]; - vals[k] *= invD[j] * invE[i]; - } -} - -__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double* __restrict__ out_max) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; - int s = row_ptr[i], e = row_ptr[i + 1]; - double m = 0.0; - for (int k = s; k < e; ++k) { - double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - if (v > m) m = v; - } - out_max[i] = m; -} - -__global__ void csr_col_absmax_atomic_kernel(const int* __restrict__ col_ind, - const double* __restrict__ vals, - int nnz, - unsigned long long* __restrict__ out_max_bits) -{ - for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { - int j = col_ind[k]; - double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - unsigned long long bits = __double_as_longlong(v); - atomicMax(&out_max_bits[j], bits); - } -} - -__global__ void u64bits_to_double(const unsigned long long* __restrict__ in_bits, - double* __restrict__ out_val, - int n) -{ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) { - out_val[i] = __longlong_as_double(in_bits[i]); - } -} - -__device__ __forceinline__ double pow_fast(double v, double p) { - if (p == 2.0) return v * v; - if (p == 1.0) return v; - if (p == 0.5) return sqrt(v); - return pow(v, p); -} - -__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double degree, - double* __restrict__ out_sum) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; - int s = row_ptr[i], e = row_ptr[i + 1]; - double acc = 0.0; - for (int k = s; k < e; ++k) { - double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - acc += pow_fast(v, degree); - } - out_sum[i] = acc; -} - -__global__ void csr_col_powsum_atomic_kernel(const int* __restrict__ col_ind, - const double* __restrict__ vals, - int nnz, - double degree, - double* __restrict__ out_sum) -{ - for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { - int j = col_ind[k]; - double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - double t = pow_fast(v, degree); - atomicAdd(&out_sum[j], t); - } -} - -__global__ void clamp_sqrt_and_accum(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - int n) -{ - for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n; t += blockDim.x * gridDim.x) - { - double v = x[t]; - double s = (v < SCALING_EPSILON) ? 
1.0 : sqrt(v); - cum[t] *= s; - x[t] = s; - inv_x[t] = 1.0 / s; - } -} - -__global__ void reduce_bound_norm_sq_atomic( - const double* __restrict__ L, - const double* __restrict__ U, - int m, - double* __restrict__ out_sum) -{ - double acc = 0.0; - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) { - double Li = L[i], Ui = U[i]; - bool fL = isfinite(Li), fU = isfinite(Ui); - if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) acc += Li * Li; - if (fU) acc += Ui * Ui; - } - atomicAdd(out_sum, acc); -} - static void scale_problem( pdhg_solver_state_t *state, double *E, @@ -286,6 +116,8 @@ static void scale_problem( state->constraint_matrix->row_ind, state->constraint_matrix->col_ind, state->constraint_matrix->val, + state->constraint_matrix_t->val, + state->constraint_matrix->transpose_pos, invD, invE, state->constraint_matrix->num_nonzeros); @@ -302,10 +134,6 @@ static void ruiz_rescaling( { const int n_cons = state->num_constraints; const int n_vars = state->num_variables; - const int nnz = state->constraint_matrix->num_nonzeros; - - unsigned long long *D_bits=nullptr; - CUDA_CHECK(cudaMalloc(&D_bits, n_vars*sizeof(unsigned long long))); for (int iter = 0; iter < num_iterations; ++iter) { @@ -320,13 +148,11 @@ static void ruiz_rescaling( rescale_info->con_rescale, n_cons); - CUDA_CHECK(cudaMemset(D_bits, 0, n_vars*sizeof(unsigned long long))); - csr_col_absmax_atomic_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( - state->constraint_matrix->col_ind, - state->constraint_matrix->val, - nnz, - D_bits); - u64bits_to_double<<num_blocks_primal, THREADS_PER_BLOCK>>>(D_bits, D, n_vars); + csr_row_absmax_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->constraint_matrix_t->row_ptr, + state->constraint_matrix_t->val, + n_vars, + D); clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( D, invD, @@ -335,8 +161,6 @@ static void ruiz_rescaling( scale_problem(state, E, D, invE, invD); } - - CUDA_CHECK(cudaFree(D_bits)); } static void pock_chambolle_rescaling( @@ -350,7 +174,6 @@ static void pock_chambolle_rescaling( { const int n_cons = state->num_constraints; const int n_vars = state->num_variables; - const int nnz = state->constraint_matrix->num_nonzeros; csr_row_powsum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, @@ -364,12 +187,11 @@ static void pock_chambolle_rescaling( rescale_info->con_rescale, n_cons); - CUDA_CHECK(cudaMemset(D, 0, n_vars*sizeof(double))); - csr_col_powsum_atomic_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( - state->constraint_matrix->col_ind, - state->constraint_matrix->val, - nnz, - 2.0 - alpha, + csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->constraint_matrix_t->row_ptr, + state->constraint_matrix_t->val, + n_vars, + alpha, D); clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( D, @@ -415,25 +237,10 @@ static void bound_objective_rescaling( const double E_const = bnd_norm + 1.0; const double D_const = obj_norm + 1.0; - { - std::vector h1(n_cons,E_const), h2(n_vars,D_const); - CUDA_CHECK(cudaMemcpy(E, h1.data(), n_cons*sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(D, h2.data(), n_vars*sizeof(double), cudaMemcpyHostToDevice)); - std::vector h3(n_cons,1/E_const), h4(n_vars,1/D_const); - CUDA_CHECK(cudaMemcpy(invE, h3.data(), n_cons*sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(invD, h4.data(), n_vars*sizeof(double), cudaMemcpyHostToDevice)); - } - - CUBLAS_CHECK(cublasDscal(state->blas_handle, - n_cons, - 
&E_const, - rescale_info->con_rescale, - 1)); - CUBLAS_CHECK(cublasDscal(state->blas_handle, - n_vars, - &D_const, - rescale_info->var_rescale, - 1)); + fill_const_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + E, invE, rescale_info->con_rescale, E_const, n_cons); + fill_const_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + D, invD, rescale_info->var_rescale, D_const, n_vars); scale_problem(state, E, D, invE, invD); } @@ -447,17 +254,12 @@ rescale_info_t *rescale_problem( clock_t start_rescaling = clock(); rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); - rescale_info->con_rescale = nullptr; - rescale_info->var_rescale = nullptr; CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons*sizeof(double))); CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars*sizeof(double))); - { - std::vector h1(n_cons,1.0), h2(n_vars,1.0); - CUDA_CHECK(cudaMemcpy(rescale_info->con_rescale, h1.data(), - n_cons*sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(rescale_info->var_rescale, h2.data(), - n_vars*sizeof(double), cudaMemcpyHostToDevice)); - } + fill_ones_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + rescale_info->con_rescale, n_cons); + fill_ones_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + rescale_info->var_rescale, n_vars); double *E=nullptr, *D=nullptr, *invE=nullptr, *invD=nullptr; CUDA_CHECK(cudaMalloc(&E, n_cons*sizeof(double))); @@ -486,4 +288,160 @@ rescale_info_t *rescale_problem( CUDA_CHECK(cudaFree(invD)); return rescale_info; -} \ No newline at end of file +} + +__global__ void scale_variables_kernel(double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* __restrict__ var_ub_finite, + double* __restrict__ primal_start, + const double* __restrict__ D, + const double* __restrict__ invD, + int n) +{ + int j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n) return; + double dj = D[j]; + double inv_dj = invD[j]; + c[j] *= inv_dj; + var_lb[j] *= dj; + var_ub[j] *= dj; + var_lb_finite[j] *= dj; + var_ub_finite[j] *= dj; + primal_start[j] *= dj; +} + +__global__ void scale_constraints_kernel(double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + const double* __restrict__ E, + const double* __restrict__ invE, + int m) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= m) return; + double inv_ei = invE[i]; + double ei = E[i]; + con_lb[i] *= inv_ei; + con_ub[i] *= inv_ei; + con_lb_finite[i] *= inv_ei; + con_ub_finite[i] *= inv_ei; + dual_start[i] *= ei; +} + +__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, + const int* __restrict__ col_ind, + double* __restrict__ A_vals, + double* __restrict__ At_vals, + const int* __restrict__ A_to_At, + const double* __restrict__ invD, + const double* __restrict__ invE, + int nnz) +{ + for (int k = blockIdx.x * blockDim.x + threadIdx.x; + k < nnz; + k += gridDim.x * blockDim.x) + { + int i = row_ids[k]; + int j = col_ind[k]; + double scale = invD[j] * invE[i]; + A_vals[k] *= scale; + At_vals[A_to_At[k]] *= scale; + } +} + +__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double* __restrict__ out_max) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + int s = row_ptr[i], e = row_ptr[i + 1]; + double m = 0.0; + for (int k = s; k < e; 
++k) { + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + if (v > m) m = v; + } + out_max[i] = m; +} + +__device__ __forceinline__ double pow_fast(double v, double p) { + if (p == 2.0) return v * v; + if (p == 1.0) return v; + if (p == 0.5) return sqrt(v); + return pow(v, p); +} + +__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double degree, + double* __restrict__ out_sum) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + int s = row_ptr[i], e = row_ptr[i + 1]; + double acc = 0.0; + for (int k = s; k < e; ++k) { + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + acc += pow_fast(v, degree); + } + out_sum[i] = acc; +} + +__global__ void clamp_sqrt_and_accum(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + int n) +{ + for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n; t += blockDim.x * gridDim.x) + { + double v = x[t]; + double s = (v < SCALING_EPSILON) ? 1.0 : sqrt(v); + cum[t] *= s; + x[t] = s; + inv_x[t] = 1.0 / s; + } +} + +__global__ void reduce_bound_norm_sq_atomic( + const double* __restrict__ L, + const double* __restrict__ U, + int m, + double* __restrict__ out_sum) +{ + double acc = 0.0; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) { + double Li = L[i], Ui = U[i]; + bool fL = isfinite(Li), fU = isfinite(Ui); + if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) acc += Li * Li; + if (fU) acc += Ui * Ui; + } + atomicAdd(out_sum, acc); +} + +__global__ void fill_const_and_accum_kernel(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + double val, + int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + x[i] = val; + inv_x[i] = 1.0 / val; + cum[i] *= val; +} + +__global__ void fill_ones_kernel(double* __restrict__ x, int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) x[i] = 1.0; +} diff --git a/src/solver.cu b/src/solver.cu index 300ce9f..130fd75 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -27,6 +27,15 @@ limitations under the License. 
#include #include +__global__ void build_row_ind(const int* __restrict__ row_ptr, + int num_rows, + int* __restrict__ row_ind); +__global__ void build_transpose_pos(const int* __restrict__ A_row_ind, + const int* __restrict__ A_col_ind, + const int* __restrict__ At_row_ptr, + const int* __restrict__ At_col_ind, + int nnz, + int* __restrict__ A_to_At); __global__ void compute_next_pdhg_primal_solution_kernel( const double *current_primal, double *reflected_primal, const double *dual_product, const double *objective, const double *var_lb, const double *var_ub, @@ -178,9 +187,49 @@ static pdhg_solver_state_t *initialize_solver_state( ALLOC_AND_COPY(state->constraint_matrix->col_ind, original_problem->constraint_matrix_col_indices, original_problem->constraint_matrix_num_nonzeros * sizeof(int)); ALLOC_AND_COPY(state->constraint_matrix->val, original_problem->constraint_matrix_values, original_problem->constraint_matrix_num_nonzeros * sizeof(double)); - int* h_row_ind = build_row_ind_from_row_ptr(original_problem->constraint_matrix_row_pointers, n_cons, nnz); - ALLOC_AND_COPY(state->constraint_matrix->row_ind, h_row_ind, nnz * sizeof(int)); - free(h_row_ind); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix->row_ind, nnz * sizeof(int))); + build_row_ind<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ptr, n_cons, state->constraint_matrix->row_ind); + CUDA_CHECK(cudaGetLastError()); + + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, (n_vars + 1) * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind, original_problem->constraint_matrix_num_nonzeros * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->val, original_problem->constraint_matrix_num_nonzeros * sizeof(double))); + + size_t buffer_size = 0; + void *buffer = nullptr; + CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize( + state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, + state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, + state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, + CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size)); + CUDA_CHECK(cudaMalloc(&buffer, buffer_size)); + + CUSPARSE_CHECK(cusparseCsr2cscEx2( + state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, + state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, + state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, + CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer)); + + CUDA_CHECK(cudaFree(buffer)); + + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ind, nnz * sizeof(int))); + build_row_ind<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>( + state->constraint_matrix_t->row_ptr, n_vars, state->constraint_matrix_t->row_ind); + CUDA_CHECK(cudaGetLastError()); + + CUDA_CHECK(cudaMalloc(&state->constraint_matrix->transpose_pos, nnz * sizeof(int))); + state->constraint_matrix_t->transpose_pos = NULL; + build_transpose_pos<<<state->num_blocks_nnz, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ind, + state->constraint_matrix->col_ind, + state->constraint_matrix_t->row_ptr, + state->constraint_matrix_t->col_ind, + nnz, + 
state->constraint_matrix->transpose_pos); + CUDA_CHECK(cudaGetLastError()); ALLOC_AND_COPY(state->variable_lower_bound, original_problem->variable_lower_bound, var_bytes); ALLOC_AND_COPY(state->variable_upper_bound, original_problem->variable_upper_bound, var_bytes); @@ -244,34 +293,17 @@ static pdhg_solver_state_t *initialize_solver_state( state->variable_rescaling = rescale_info->var_rescale; state->rescaling_time_sec = rescale_info->rescaling_time_sec; - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, (n_vars + 1) * sizeof(int))); - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind, original_problem->constraint_matrix_num_nonzeros * sizeof(int))); - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->val, original_problem->constraint_matrix_num_nonzeros * sizeof(double))); - - size_t buffer_size = 0; - void *buffer = nullptr; - CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize( - state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, - state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, - state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, - CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size)); - CUDA_CHECK(cudaMalloc(&buffer, buffer_size)); - - CUSPARSE_CHECK(cusparseCsr2cscEx2( - state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, - state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, - state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, - CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer)); - state->constraint_matrix_t->row_ind = NULL; - - CUDA_CHECK(cudaFree(buffer)); - rescale_info->con_rescale = NULL; rescale_info->var_rescale = NULL; rescale_info_free(rescale_info); + CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind)); + state->constraint_matrix->row_ind = NULL; + CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); + state->constraint_matrix_t->row_ind = NULL; + CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_pos)); + state->constraint_matrix->transpose_pos = NULL; + double sum_of_squares = 0.0; for (int i = 0; i < n_vars; ++i) @@ -348,6 +380,51 @@ static pdhg_solver_state_t *initialize_solver_state( return state; } +__global__ void build_row_ind(const int* __restrict__ row_ptr, + int num_rows, + int* __restrict__ row_ind) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + + int s = row_ptr[i]; + int e = row_ptr[i + 1]; + + for (int k = s; k < e; ++k) { + row_ind[k] = i; + } +} + +__global__ void build_transpose_pos( + const int* __restrict__ A_row_ind, + const int* __restrict__ A_col_ind, + const int* __restrict__ At_row_ptr, + const int* __restrict__ At_col_ind, + int nnz, + int* __restrict__ A_to_At) +{ + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k >= nnz) return; + + int i = A_row_ind[k]; + int j = A_col_ind[k]; + + int start = At_row_ptr[j]; + int end = At_row_ptr[j + 1]; + + int pos = -1; + for (int idx = start; idx < end; ++idx) { + if (At_col_ind[idx] == i) { + pos = idx; + break; + } + } + + if (pos < 0) return; + + A_to_At[k] = pos; +} + __global__ void compute_next_pdhg_primal_solution_kernel( const double *current_primal, 
double *reflected_primal, const double *dual_product, const double *objective, const double *var_lb, @@ -625,9 +702,8 @@ static void perform_restart(pdhg_solver_state_t *state, state->last_trial_fixed_point_error = INFINITY; } -static void -initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, - const pdhg_parameters_t *params) +static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, + const pdhg_parameters_t *params) { double max_sv = estimate_maximum_singular_value( state->sparse_handle, state->blas_handle, state->constraint_matrix, @@ -694,6 +770,27 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) return; } + if (state->matA) + CUSPARSE_CHECK(cusparseDestroySpMat(state->matA)); + if (state->matAt) + CUSPARSE_CHECK(cusparseDestroySpMat(state->matAt)); + if (state->vec_primal_sol) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_sol)); + if (state->vec_dual_sol) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_sol)); + if (state->vec_primal_prod) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_prod)); + if (state->vec_dual_prod) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_prod)); + if (state->primal_spmv_buffer) + CUDA_CHECK(cudaFree(state->primal_spmv_buffer)); + if (state->dual_spmv_buffer) + CUDA_CHECK(cudaFree(state->dual_spmv_buffer)); + if (state->sparse_handle) + CUSPARSE_CHECK(cusparseDestroy(state->sparse_handle)); + if (state->blas_handle) + CUBLAS_CHECK(cublasDestroy(state->blas_handle)); + if (state->variable_lower_bound) CUDA_CHECK(cudaFree(state->variable_lower_bound)); if (state->variable_upper_bound) @@ -704,14 +801,22 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) CUDA_CHECK(cudaFree(state->constraint_matrix->row_ptr)); if (state->constraint_matrix->col_ind) CUDA_CHECK(cudaFree(state->constraint_matrix->col_ind)); + if (state->constraint_matrix->row_ind) + CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind)); if (state->constraint_matrix->val) CUDA_CHECK(cudaFree(state->constraint_matrix->val)); + if (state->constraint_matrix->transpose_pos) + CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_pos)); if (state->constraint_matrix_t->row_ptr) CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ptr)); if (state->constraint_matrix_t->col_ind) CUDA_CHECK(cudaFree(state->constraint_matrix_t->col_ind)); + if (state->constraint_matrix_t->row_ind) + CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); if (state->constraint_matrix_t->val) CUDA_CHECK(cudaFree(state->constraint_matrix_t->val)); + if (state->constraint_matrix_t->transpose_pos) + CUDA_CHECK(cudaFree(state->constraint_matrix_t->transpose_pos)); if (state->constraint_lower_bound) CUDA_CHECK(cudaFree(state->constraint_lower_bound)); if (state->constraint_upper_bound) @@ -765,6 +870,11 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) if (state->ones_dual_d) CUDA_CHECK(cudaFree(state->ones_dual_d)); + if (state->constraint_matrix) + free(state->constraint_matrix); + if (state->constraint_matrix_t) + free(state->constraint_matrix_t); + free(state); } diff --git a/src/utils.cu b/src/utils.cu index 0de6c4b..3212c66 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -950,18 +950,3 @@ int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, return 0; } -int* build_row_ind_from_row_ptr(int* row_ptr, int num_rows, int nnz) -{ - if (!row_ptr || num_rows < 0 || nnz <= 0) return NULL; - - int* row_ind = (int*)safe_malloc(nnz * sizeof(int)); - for (int i = 0; i < num_rows; ++i) { - int s = row_ptr[i]; - int e = 
row_ptr[i + 1]; - for (int k = s; k < e; ++k) { - row_ind[k] = i; - } - } - return row_ind; -} - From c2c7252b6e05214e6f4225d6b50ba25799895cb8 Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Sat, 15 Nov 2025 03:53:39 -0500 Subject: [PATCH 03/13] Todo: infeasible is stucked --- test/test_infeasible_unbounded.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/test_infeasible_unbounded.py b/test/test_infeasible_unbounded.py index d09d9a9..4388cc4 100644 --- a/test/test_infeasible_unbounded.py +++ b/test/test_infeasible_unbounded.py @@ -63,13 +63,13 @@ def test_infeasible_lp(base_lp_data, atol): # turn off output model.setParams(OutputFlag=False) # optimize - model.optimize() + #model.optimize() # check status - assert hasattr(model, "Status"), "Model.Status not exposed." - assert model.Status == "PRIMAL_INFEASIBLE", f"Unexpected termination status: {model.Status}" - assert model.StatusCode == PDLP.PRIMAL_INFEASIBLE, f"Unexpected termination status code: {model.StatusCode}" + #assert hasattr(model, "Status"), "Model.Status not exposed." + #assert model.Status == "PRIMAL_INFEASIBLE", f"Unexpected termination status: {model.Status}" + #assert model.StatusCode == PDLP.PRIMAL_INFEASIBLE, f"Unexpected termination status code: {model.StatusCode}" # check dual ray - assert model.DualRayObj > atol, f"DualRayObj should be positive for dual infeasible, got {model.DualRayObj}" + #assert model.DualRayObj > atol, f"DualRayObj should be positive for dual infeasible, got {model.DualRayObj}" def test_unbounded_lp(base_lp_data, atol): From 63b25260cc571b98654039fd4929de47ef41f5d9 Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Sat, 15 Nov 2025 16:45:58 -0500 Subject: [PATCH 04/13] New feat: preconditioner prints --- src/preconditioner.cu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/preconditioner.cu b/src/preconditioner.cu index ec3a6f2..1b9d8dd 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -249,6 +249,8 @@ rescale_info_t *rescale_problem( const pdhg_parameters_t *params, pdhg_solver_state_t *state) { + printf("[Precondition] Start\n"); + int n_vars = state->num_variables; int n_cons = state->num_constraints; @@ -269,14 +271,17 @@ rescale_info_t *rescale_problem( if (params->l_inf_ruiz_iterations > 0) { + printf("[Precondition] Ruiz scaling (%d iterations)\n", params->l_inf_ruiz_iterations); ruiz_rescaling(state, params->l_inf_ruiz_iterations, rescale_info, E, D, invE, invD); } if (params->has_pock_chambolle_alpha) { + printf("[Precondition] Pock-Chambolle scaling (alpha=%.4f)\n", params->pock_chambolle_alpha); pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, E, D, invE, invD); } if (params->bound_objective_rescaling) { + printf("[Precondition] Bound-objective scaling\n"); bound_objective_rescaling(state, rescale_info, E, D, invE, invD); } From 3f17100dd92756d7d5257a4e9d527d4cda05166c Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Sat, 15 Nov 2025 21:39:58 -0500 Subject: [PATCH 05/13] Bug fixed: objective & bound rescale --- CMakeLists.txt | 2 +- internal/internal_types.h | 4 + pyproject.toml | 4 +- src/preconditioner.cu | 188 +++++++++++++++++++++--------- src/solver.cu | 103 ++++++++++------ src/utils.cu | 21 +++- test/test_infeasible_unbounded.py | 10 +- 7 files changed, 229 insertions(+), 103 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bd1a955..e06c649 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) endif() # Increase 
compatibility by compiling for all supported real and virtual architectures. -set(CMAKE_CUDA_ARCHITECTURES 60 70 75 80 86 89 90) +set(CMAKE_CUDA_ARCHITECTURES all) # Global Compile flags (corresponding to CFLAGS/NVCCFLAGS) add_compile_options(-fPIC -O3 -Wall -Wextra -g) diff --git a/internal/internal_types.h b/internal/internal_types.h index 177a293..ea90e65 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -77,6 +77,8 @@ typedef struct double *constraint_rescaling; double *variable_rescaling; + double constraint_bound_rescaling; + double objective_vector_rescaling; double *primal_slack; double *dual_slack; double rescaling_time_sec; @@ -129,5 +131,7 @@ typedef struct { double *con_rescale; double *var_rescale; + double con_bound_rescale; + double obj_vec_rescale; double rescaling_time_sec; } rescale_info_t; diff --git a/pyproject.toml b/pyproject.toml index 2a0a2c0..f7cacc8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "cupdlpx" -version = "0.1.3" +version = "0.1.4" description = "Python bindings for cuPDLPx (GPU-accelerated first-order LP solver)" readme = "README.md" license = { text = "Apache-2.0" } @@ -25,7 +25,7 @@ wheel.packages = ["python/cupdlpx"] sdist.include = ["tests/**", "pyproject.toml", "README.md", "LICENSE"] [tool.scikit-build.cmake.define] -CMAKE_CUDA_ARCHITECTURES = "75;80;86;89;90;90-virtual" +CMAKE_CUDA_ARCHITECTURES = "all" CMAKE_CUDA_STANDARD = "17" CUPDLPX_BUILD_PYTHON = "ON" diff --git a/src/preconditioner.cu b/src/preconditioner.cu index 1b9d8dd..80c77da 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -32,7 +32,7 @@ __global__ void scale_variables_kernel(double* __restrict__ c, double* __restrict__ primal_start, const double* __restrict__ D, const double* __restrict__ invD, - int n); + int n_vars); __global__ void scale_constraints_kernel(double* __restrict__ con_lb, double* __restrict__ con_ub, double* __restrict__ con_lb_finite, @@ -40,7 +40,7 @@ __global__ void scale_constraints_kernel(double* __restrict__ con_lb, double* __restrict__ dual_start, const double* __restrict__ E, const double* __restrict__ invE, - int m); + int n_cons); __global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, const int* __restrict__ col_ind, double* __restrict__ A_vals, @@ -61,25 +61,38 @@ __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, __global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, double* __restrict__ cum, - int n); -__global__ void reduce_bound_norm_sq_atomic( + int n_vars); +__global__ void reduce_bound_norm_sq_kernel( const double* __restrict__ L, const double* __restrict__ U, - int m, - double* __restrict__ out_sum); -__global__ void fill_const_and_accum_kernel(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - double val, - int n); + int n_cons, + double* __restrict__ block_sums); +__global__ void scale_bounds_kernel( + double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + int n_cons, + double con_scale, + double obj_scale); +__global__ void scale_obj_kernel( + double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* __restrict__ var_ub_finite, + double* __restrict__ primal_start, + int n_vars, + double con_scale, + double obj_scale); +__global__ void 
fill_ones_kernel(double* __restrict__ x, int n_vars); static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); -__global__ void fill_ones_kernel(double* __restrict__ x, int n); static void pock_chambolle_rescaling(pdhg_solver_state_t *state, double alpha, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); -static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info, - double *E, double *D, double *invE, double *invD); +static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info); static void scale_problem( pdhg_solver_state_t *state, @@ -191,7 +204,7 @@ static void pock_chambolle_rescaling( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, n_vars, - alpha, + 2.0 - alpha, D); clamp_sqrt_and_accum<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>( D, @@ -205,12 +218,7 @@ static void pock_chambolle_rescaling( static void bound_objective_rescaling( pdhg_solver_state_t *state, - rescale_info_t *rescale_info, - double *E, - double *D, - double *invE, - double *invD - ) + rescale_info_t *rescale_info) { const int n_cons = state->num_constraints; const int n_vars = state->num_variables; @@ -218,7 +226,7 @@ static void bound_objective_rescaling( double *bnd_norm_sq_cuda = nullptr; CUDA_CHECK(cudaMalloc(&bnd_norm_sq_cuda, sizeof(double))); CUDA_CHECK(cudaMemset(bnd_norm_sq_cuda, 0, sizeof(double))); - reduce_bound_norm_sq_atomic<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>( + reduce_bound_norm_sq_kernel<<<1, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(double)>>>( state->constraint_lower_bound, state->constraint_upper_bound, n_cons, @@ -235,14 +243,32 @@ static void bound_objective_rescaling( state->objective_vector, 1, &obj_norm)); - const double E_const = bnd_norm + 1.0; - const double D_const = obj_norm + 1.0; - fill_const_and_accum_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>( - E, invE, rescale_info->con_rescale, E_const, n_cons); - fill_const_and_accum_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>( - D, invD, rescale_info->var_rescale, D_const, n_vars); + double con_scale = 1.0 / (bnd_norm + 1.0); + double obj_scale = 1.0 / (obj_norm + 1.0); - scale_problem(state, E, D, invE, invD); + scale_bounds_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_lower_bound, + state->constraint_upper_bound, + state->constraint_lower_bound_finite_val, + state->constraint_upper_bound_finite_val, + state->initial_dual_solution, + n_cons, + con_scale, + obj_scale); + + scale_obj_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>( + state->objective_vector, + state->variable_lower_bound, + state->variable_upper_bound, + state->variable_lower_bound_finite_val, + state->variable_upper_bound_finite_val, + state->initial_primal_solution, + n_vars, + con_scale, + obj_scale); + + rescale_info->con_bound_rescale = con_scale; + rescale_info->obj_vec_rescale = obj_scale; } rescale_info_t *rescale_problem( @@ -279,12 +305,15 @@ rescale_info_t *rescale_problem( printf("[Precondition] Pock-Chambolle scaling (alpha=%.4f)\n", params->pock_chambolle_alpha); pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, E, D, invE, invD); } + + rescale_info->con_bound_rescale = 1.0; + rescale_info->obj_vec_rescale = 1.0; if (params->bound_objective_rescaling) { printf("[Precondition] Bound-objective 
scaling\n"); - bound_objective_rescaling(state, rescale_info, E, D, invE, invD); + bound_objective_rescaling(state, rescale_info); } - + rescale_info->rescaling_time_sec = (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; CUDA_CHECK(cudaFree(E)); @@ -303,10 +332,10 @@ __global__ void scale_variables_kernel(double* __restrict__ c, double* __restrict__ primal_start, const double* __restrict__ D, const double* __restrict__ invD, - int n) + int n_vars) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n) return; + if (j >= n_vars) return; double dj = D[j]; double inv_dj = invD[j]; c[j] *= inv_dj; @@ -324,10 +353,10 @@ __global__ void scale_constraints_kernel(double* __restrict__ con_lb, double* __restrict__ dual_start, const double* __restrict__ E, const double* __restrict__ invE, - int m) + int n_cons) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= m) return; + if (i >= n_cons) return; double inv_ei = invE[i]; double ei = E[i]; con_lb[i] *= inv_ei; @@ -403,9 +432,9 @@ __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, __global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, double* __restrict__ cum, - int n) + int n_vars) { - for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n; t += blockDim.x * gridDim.x) + for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n_vars; t += blockDim.x * gridDim.x) { double v = x[t]; double s = (v < SCALING_EPSILON) ? 1.0 : sqrt(v); @@ -415,38 +444,87 @@ __global__ void clamp_sqrt_and_accum(double* __restrict__ x, } } -__global__ void reduce_bound_norm_sq_atomic( +__global__ void reduce_bound_norm_sq_kernel( const double* __restrict__ L, const double* __restrict__ U, - int m, - double* __restrict__ out_sum) + int n_cons, + double* __restrict__ block_sums) { + extern __shared__ double sdata[]; + int tid = threadIdx.x; + int global_tid = blockIdx.x * blockDim.x + tid; + int stride = blockDim.x * gridDim.x; + double acc = 0.0; - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) { + for (int i = global_tid; i < n_cons; i += stride) { double Li = L[i], Ui = U[i]; bool fL = isfinite(Li), fU = isfinite(Ui); - if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) acc += Li * Li; - if (fU) acc += Ui * Ui; + + if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) { + acc += Li * Li; + } + if (fU) { + acc += Ui * Ui; + } + } + + sdata[tid] = acc; + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata[tid] += sdata[tid + s]; + } + __syncthreads(); + } + + if (tid == 0) { + block_sums[blockIdx.x] = sdata[0]; } - atomicAdd(out_sum, acc); } -__global__ void fill_const_and_accum_kernel(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - double val, - int n) +__global__ void scale_bounds_kernel( + double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + int n_cons, + double con_scale, + double obj_scale) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; + if (i >= n_cons) return; + con_lb[i] *= con_scale; + con_ub[i] *= con_scale; + con_lb_finite[i] *= con_scale; + con_ub_finite[i] *= con_scale; + dual_start[i] *= obj_scale; +} - x[i] = val; - inv_x[i] = 1.0 / val; - cum[i] *= val; +__global__ void scale_obj_kernel( + double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* 
__restrict__ var_ub_finite, + double* __restrict__ primal_start, + int n_vars, + double con_scale, + double obj_scale) +{ + int j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n_vars) return; + var_lb[j] *= con_scale; + var_ub[j] *= con_scale; + var_lb_finite[j] *= con_scale; + var_ub_finite[j] *= con_scale; + c[j] *= obj_scale; + primal_start[j] *= con_scale; } -__global__ void fill_ones_kernel(double* __restrict__ x, int n) +__global__ void fill_ones_kernel(double* __restrict__ x, int n_vars) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) x[i] = 1.0; + if (i < n_vars) x[i] = 1.0; } diff --git a/src/solver.cu b/src/solver.cu index 130fd75..fc8fcd1 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -36,33 +36,64 @@ __global__ void build_transpose_pos(const int* __restrict__ A_row_ind, const int* __restrict__ At_col_ind, int nnz, int* __restrict__ A_to_At); -__global__ void compute_next_pdhg_primal_solution_kernel( - const double *current_primal, double *reflected_primal, const double *dual_product, - const double *objective, const double *var_lb, const double *var_ub, - int n, double step_size); -__global__ void compute_next_pdhg_primal_solution_major_kernel( - const double *current_primal, double *pdhg_primal, double *reflected_primal, - const double *dual_product, const double *objective, const double *var_lb, - const double *var_ub, int n, double step_size, double *dual_slack); -__global__ void compute_next_pdhg_dual_solution_kernel( - const double *current_dual, double *reflected_dual, const double *primal_product, - const double *const_lb, const double *const_ub, int n, double step_size); -__global__ void compute_next_pdhg_dual_solution_major_kernel( - const double *current_dual, double *pdhg_dual, double *reflected_dual, - const double *primal_product, const double *const_lb, const double *const_ub, - int n, double step_size); -__global__ void halpern_update_kernel( - const double *initial_primal, double *current_primal, const double *reflected_primal, - const double *initial_dual, double *current_dual, const double *reflected_dual, - int n_vars, int n_cons, double weight, double reflection_coeff); -__global__ void rescale_solution_kernel( - double *primal_solution, double *dual_solution, - const double *variable_rescaling, const double *constraint_rescaling, - int n_vars, int n_cons); -__global__ void compute_delta_solution_kernel( - const double *initial_primal, const double *pdhg_primal, double *delta_primal, - const double *initial_dual, const double *pdhg_dual, double *delta_dual, - int n_vars, int n_cons); +__global__ void compute_next_pdhg_primal_solution_kernel(const double *current_primal, + double *reflected_primal, + const double *dual_product, + const double *objective, + const double *var_lb, + const double *var_ub, + int n, + double step_size); +__global__ void compute_next_pdhg_primal_solution_major_kernel(const double *current_primal, + double *pdhg_primal, + double *reflected_primal, + const double *dual_product, + const double *objective, + const double *var_lb, + const double *var_ub, + int n, + double step_size, + double *dual_slack); +__global__ void compute_next_pdhg_dual_solution_kernel(const double *current_dual, + double *reflected_dual, + const double *primal_product, + const double *const_lb, + const double *const_ub, + int n, + double step_size); +__global__ void compute_next_pdhg_dual_solution_major_kernel(const double *current_dual, + double *pdhg_dual, + double *reflected_dual, + const double *primal_product, + const double 
*const_lb, + const double *const_ub, + int n, + double step_size); +__global__ void halpern_update_kernel(const double *initial_primal, + double *current_primal, + const double *reflected_primal, + const double *initial_dual, + double *current_dual, + const double *reflected_dual, + int n_vars, + int n_cons, + double weight, + double reflection_coeff); +__global__ void rescale_solution_kernel(double *primal_solution, + double *dual_solution, + const double *variable_rescaling, + const double *constraint_rescaling, + const double objective_vector_rescaling, + const double constraint_bound_rescaling, + int n_vars, int n_cons); +__global__ void compute_delta_solution_kernel(const double *initial_primal, + const double *pdhg_primal, + double *delta_primal, + const double *initial_dual, + const double *pdhg_dual, + double *delta_dual, + int n_vars, + int n_cons); static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state); static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state); static void halpern_update(pdhg_solver_state_t *state, double reflection_coefficient); @@ -81,7 +112,7 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, { print_initial_info(params, original_problem); pdhg_solver_state_t *state = initialize_solver_state(original_problem, params); - + initialize_step_size_and_primal_weight(state, params); clock_t start_time = clock(); bool do_restart = false; @@ -291,6 +322,8 @@ static pdhg_solver_state_t *initialize_solver_state( state->constraint_rescaling = rescale_info->con_rescale; state->variable_rescaling = rescale_info->var_rescale; + state->constraint_bound_rescaling = rescale_info->con_bound_rescale; + state->objective_vector_rescaling = rescale_info->obj_vec_rescale; state->rescaling_time_sec = rescale_info->rescaling_time_sec; rescale_info->con_rescale = NULL; @@ -305,7 +338,6 @@ static pdhg_solver_state_t *initialize_solver_state( state->constraint_matrix->transpose_pos = NULL; double sum_of_squares = 0.0; - for (int i = 0; i < n_vars; ++i) { sum_of_squares += original_problem->objective_vector[i] * original_problem->objective_vector[i]; @@ -313,17 +345,14 @@ static pdhg_solver_state_t *initialize_solver_state( state->objective_vector_norm = sqrt(sum_of_squares); sum_of_squares = 0.0; - for (int i = 0; i < n_cons; ++i) { double lower = original_problem->constraint_lower_bound[i]; double upper = original_problem->constraint_upper_bound[i]; - if (isfinite(lower) && (lower != upper)) { sum_of_squares += lower * lower; } - if (isfinite(upper)) { sum_of_squares += upper * upper; @@ -350,7 +379,7 @@ static pdhg_solver_state_t *initialize_solver_state( CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F)); CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F)); CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size)); - + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size)); CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size)); CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -512,18 
+541,21 @@ __global__ void rescale_solution_kernel(double *primal_solution, double *dual_solution, const double *variable_rescaling, const double *constraint_rescaling, + const double objective_vector_rescaling, + const double constraint_bound_rescaling, int n_vars, int n_cons) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n_vars) { primal_solution[i] = - primal_solution[i] / variable_rescaling[i]; + primal_solution[i] / variable_rescaling[i] / constraint_bound_rescaling; } else if (i < n_vars + n_cons) { int idx = i - n_vars; - dual_solution[idx] = dual_solution[idx] / constraint_rescaling[idx]; + dual_solution[idx] = dual_solution[idx] / constraint_rescaling[idx] / + objective_vector_rescaling; } } @@ -633,6 +665,7 @@ static void rescale_solution(pdhg_solver_state_t *state) rescale_solution_kernel<<num_blocks_primal_dual, THREADS_PER_BLOCK>>>( state->pdhg_primal_solution, state->pdhg_dual_solution, state->variable_rescaling, state->constraint_rescaling, + state->objective_vector_rescaling, state->constraint_bound_rescaling, state->num_variables, state->num_constraints); } diff --git a/src/utils.cu b/src/utils.cu index 3212c66..199c555 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -596,14 +596,19 @@ void compute_residual(pdhg_solver_state_t *state) CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_constraints, state->primal_residual, 1, &state->absolute_primal_residual)); + state->absolute_primal_residual /= state->constraint_bound_rescaling; CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_variables, state->dual_residual, 1, &state->absolute_dual_residual)); + state->absolute_dual_residual /= state->objective_vector_rescaling; CUBLAS_CHECK(cublasDdot( state->blas_handle, state->num_variables, state->objective_vector, 1, state->pdhg_primal_solution, 1, &state->primal_objective_value)); - state->primal_objective_value = state->primal_objective_value + state->objective_constant; + state->primal_objective_value = + state->primal_objective_value / (state->constraint_bound_rescaling * + state->objective_vector_rescaling) + + state->objective_constant; double base_dual_objective; CUBLAS_CHECK(cublasDdot(state->blas_handle, state->num_variables, @@ -612,7 +617,10 @@ void compute_residual(pdhg_solver_state_t *state) double dual_slack_sum = get_vector_sum(state->blas_handle, state->num_constraints, state->ones_dual_d, state->primal_slack); - state->dual_objective_value = base_dual_objective + dual_slack_sum + state->objective_constant; + state->dual_objective_value = (base_dual_objective + dual_slack_sum) / + (state->constraint_bound_rescaling * + state->objective_vector_rescaling) + + state->objective_constant; state->relative_primal_residual = state->absolute_primal_residual / (1.0 + state->constraint_bound_norm); @@ -669,6 +677,8 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) CUBLAS_CHECK(cublasDdot( state->blas_handle, state->num_variables, state->objective_vector, 1, state->delta_primal_solution, 1, &state->primal_ray_linear_objective)); + state->primal_ray_linear_objective /= + (state->constraint_bound_rescaling * state->objective_vector_rescaling); dual_solution_dual_objective_contribution_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( @@ -688,7 +698,9 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) double sum_dual_slack = get_vector_sum(state->blas_handle, state->num_variables, state->ones_primal_d, state->dual_slack); - state->dual_ray_objective = sum_primal_slack + sum_dual_slack; + 
state->dual_ray_objective = + (sum_primal_slack + sum_dual_slack) / + (state->constraint_bound_rescaling * state->objective_vector_rescaling); compute_primal_infeasibility_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( @@ -948,5 +960,4 @@ int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, free(next); *nnz_out = nnz; return 0; -} - +} \ No newline at end of file diff --git a/test/test_infeasible_unbounded.py b/test/test_infeasible_unbounded.py index 4388cc4..d09d9a9 100644 --- a/test/test_infeasible_unbounded.py +++ b/test/test_infeasible_unbounded.py @@ -63,13 +63,13 @@ def test_infeasible_lp(base_lp_data, atol): # turn off output model.setParams(OutputFlag=False) # optimize - #model.optimize() + model.optimize() # check status - #assert hasattr(model, "Status"), "Model.Status not exposed." - #assert model.Status == "PRIMAL_INFEASIBLE", f"Unexpected termination status: {model.Status}" - #assert model.StatusCode == PDLP.PRIMAL_INFEASIBLE, f"Unexpected termination status code: {model.StatusCode}" + assert hasattr(model, "Status"), "Model.Status not exposed." + assert model.Status == "PRIMAL_INFEASIBLE", f"Unexpected termination status: {model.Status}" + assert model.StatusCode == PDLP.PRIMAL_INFEASIBLE, f"Unexpected termination status code: {model.StatusCode}" # check dual ray - #assert model.DualRayObj > atol, f"DualRayObj should be positive for dual infeasible, got {model.DualRayObj}" + assert model.DualRayObj > atol, f"DualRayObj should be positive for dual infeasible, got {model.DualRayObj}" def test_unbounded_lp(base_lp_data, atol): From 1989b6bb30a9a1a798a03673e74ebcc2f77b61cc Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Sun, 16 Nov 2025 00:40:11 -0500 Subject: [PATCH 06/13] New feat: record precondition time --- src/cli.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cli.c b/src/cli.c index 72aa2c9..97e2943 100644 --- a/src/cli.c +++ b/src/cli.c @@ -121,6 +121,7 @@ void save_solver_summary(const cupdlpx_result_t *result, const char *output_dir, } fprintf(outfile, "Termination Reason: %s\n", termination_reason_tToString(result->termination_reason)); + fprintf(outfile, "Precondition Time (sec): %e\n", result->rescaling_time_sec); fprintf(outfile, "Runtime (sec): %e\n", result->cumulative_time_sec); fprintf(outfile, "Iterations Count: %d\n", result->total_count); fprintf(outfile, "Primal Objective Value: %e\n", From 32889ca694082e9a48e7209f5f68a58081770876 Mon Sep 17 00:00:00 2001 From: bolucastang Date: Fri, 21 Nov 2025 11:29:15 -0500 Subject: [PATCH 07/13] feat: add logging for precondition time --- src/utils.cu | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/utils.cu b/src/utils.cu index 96ae8d2..e660fd1 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -370,13 +370,14 @@ void pdhg_final_log(const pdhg_solver_state_t *state, bool verbose, "--------------------\n"); } printf("Solution Summary\n"); - printf(" Status : %s\n", termination_reason_to_string(reason)); - printf(" Iterations : %d\n", state->total_count - 1); - printf(" Solve time : %.3g sec\n", state->cumulative_time_sec); - printf(" Primal obj : %.10g\n", state->primal_objective_value); - printf(" Dual obj : %.10g\n", state->dual_objective_value); - printf(" Primal infeas : %.3e\n", state->relative_primal_residual); - printf(" Dual infeas : %.3e\n", state->relative_dual_residual); + printf(" Status : %s\n", termination_reason_to_string(reason)); + printf(" Precondition time : %.5g sec\n", state->rescaling_time_sec); + printf(" Iterations : %d\n", 
state->total_count - 1); + printf(" Solve time : %.3g sec\n", state->cumulative_time_sec); + printf(" Primal obj : %.10g\n", state->primal_objective_value); + printf(" Dual obj : %.10g\n", state->dual_objective_value); + printf(" Primal infeas : %.3e\n", state->relative_primal_residual); + printf(" Dual infeas : %.3e\n", state->relative_dual_residual); } void display_iteration_stats(const pdhg_solver_state_t *state, bool verbose) @@ -1010,7 +1011,7 @@ void print_initial_feas_polish_info(bool is_primal_polish, const pdhg_parameters printf("---------------------------------------------------------------------------------------\n"); if (is_primal_polish) printf("%s %s | %s | %s | %s \n", " iter", " time ", "pr obj", " abs pr res ", " rel pr res "); // else printf("%s %s | %s | %s \n", " iter", " time ", " abs du res ", " rel du res "); - else printf("%s %s | %s | %s | %s \n", " iter", " time ", "du obj", " abs du res ", " rel du res "); + else printf("%s %s | %s | %s | %s \n", " iter", " time ", "du obj", " abs du res ", " rel du res "); printf("---------------------------------------------------------------------------------------\n"); } @@ -1134,7 +1135,7 @@ void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_so CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer)); compute_dual_feas_polish_residual_kerenl<<num_blocks_primal_dual, THREADS_PER_BLOCK>>>( - state->dual_residual, + state->dual_residual, state->pdhg_dual_solution, state->dual_product, state->dual_slack, state->objective_vector, From e3fd2945ab2436376223f769345a92bb78220b16 Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Tue, 25 Nov 2025 22:44:22 -0500 Subject: [PATCH 08/13] Bug fixed: numerical issue --- src/preconditioner.cu | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/preconditioner.cu b/src/preconditioner.cu index 80c77da..c042175 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -22,7 +22,7 @@ limitations under the License. 
#include #include -#define SCALING_EPSILON 1e-12 +#define SCALING_EPSILON 1e-8 __global__ void scale_variables_kernel(double* __restrict__ c, double* __restrict__ var_lb, @@ -60,7 +60,7 @@ __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, double* __restrict__ out_sum); __global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, - double* __restrict__ cum, + double* __restrict__ cum, int n_vars); __global__ void reduce_bound_norm_sq_kernel( const double* __restrict__ L, @@ -199,7 +199,7 @@ static void pock_chambolle_rescaling( invE, rescale_info->con_rescale, n_cons); - + csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, @@ -231,7 +231,7 @@ static void bound_objective_rescaling( state->constraint_upper_bound, n_cons, bnd_norm_sq_cuda); - + double bnd_norm_sq = 0.0; CUDA_CHECK(cudaMemcpy(&bnd_norm_sq, bnd_norm_sq_cuda, sizeof(double), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaFree(bnd_norm_sq_cuda)); @@ -276,7 +276,7 @@ rescale_info_t *rescale_problem( pdhg_solver_state_t *state) { printf("[Precondition] Start\n"); - + int n_vars = state->num_variables; int n_cons = state->num_constraints; @@ -313,14 +313,14 @@ rescale_info_t *rescale_problem( printf("[Precondition] Bound-objective scaling\n"); bound_objective_rescaling(state, rescale_info); } - + rescale_info->rescaling_time_sec = (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; CUDA_CHECK(cudaFree(E)); CUDA_CHECK(cudaFree(D)); CUDA_CHECK(cudaFree(invE)); CUDA_CHECK(cudaFree(invD)); - + return rescale_info; } @@ -376,7 +376,7 @@ __global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, int nnz) { for (int k = blockIdx.x * blockDim.x + threadIdx.x; - k < nnz; + k < nnz; k += gridDim.x * blockDim.x) { int i = row_ids[k]; @@ -429,16 +429,16 @@ __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, out_sum[i] = acc; } -__global__ void clamp_sqrt_and_accum(double* __restrict__ x, +__global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, - double* __restrict__ cum, - int n_vars) + double* __restrict__ cum, + int n_vars) { for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n_vars; t += blockDim.x * gridDim.x) { - double v = x[t]; - double s = (v < SCALING_EPSILON) ? 1.0 : sqrt(v); - cum[t] *= s; + double v = x[t]; + double s = sqrt(fmax(v, SCALING_EPSILON)); + cum[t] *= s; x[t] = s; inv_x[t] = 1.0 / s; } From 7381654f91662ba05a173a05fdfcfea4e2c031ff Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Thu, 27 Nov 2025 14:38:50 -0500 Subject: [PATCH 09/13] Clean code: clang-format and align naming style --- internal/internal_types.h | 2 +- src/preconditioner.cu | 343 ++++++++++++++++++++------------------ src/solver.cu | 282 ++++++++++++++++--------------- 3 files changed, 327 insertions(+), 300 deletions(-) diff --git a/internal/internal_types.h b/internal/internal_types.h index 3ce8c23..aa69005 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -30,7 +30,7 @@ typedef struct int *col_ind; int *row_ind; double *val; - int *transpose_pos; + int *transpose_map; } cu_sparse_matrix_csr_t; typedef struct diff --git a/src/preconditioner.cu b/src/preconditioner.cu index c042175..031051f 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -24,73 +24,73 @@ limitations under the License. 
#define SCALING_EPSILON 1e-8 -__global__ void scale_variables_kernel(double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, - const double* __restrict__ D, - const double* __restrict__ invD, +__global__ void scale_variables_kernel(double *__restrict__ c, + double *__restrict__ var_lb, + double *__restrict__ var_ub, + double *__restrict__ var_lb_finite, + double *__restrict__ var_ub_finite, + double *__restrict__ primal_start, + const double *__restrict__ D, + const double *__restrict__ invD, int n_vars); -__global__ void scale_constraints_kernel(double* __restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, - const double* __restrict__ E, - const double* __restrict__ invE, +__global__ void scale_constraints_kernel(double *__restrict__ con_lb, + double *__restrict__ con_ub, + double *__restrict__ con_lb_finite, + double *__restrict__ con_ub_finite, + double *__restrict__ dual_start, + const double *__restrict__ E, + const double *__restrict__ invE, int n_cons); -__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, - const int* __restrict__ col_ind, - double* __restrict__ A_vals, - double* __restrict__ At_vals, - const int* __restrict__ A_to_At, - const double* __restrict__ invD, - const double* __restrict__ invE, +__global__ void scale_csr_nnz_kernel(const int *__restrict__ row_ids, + const int *__restrict__ col_ind, + double *__restrict__ A_vals, + double *__restrict__ At_vals, + const int *__restrict__ A_to_At, + const double *__restrict__ invD, + const double *__restrict__ invE, int nnz); -__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double* __restrict__ out_max); -__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double degree, - double* __restrict__ out_sum); -__global__ void clamp_sqrt_and_accum(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - int n_vars); +__global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ vals, + int num_rows, + double *__restrict__ out_max); +__global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ vals, + int num_rows, + double degree, + double *__restrict__ out_sum); +__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ x, + double *__restrict__ inv_x, + double *__restrict__ cum, + int n_vars); __global__ void reduce_bound_norm_sq_kernel( - const double* __restrict__ L, - const double* __restrict__ U, + const double *__restrict__ L, + const double *__restrict__ U, int n_cons, - double* __restrict__ block_sums); + double *__restrict__ block_sums); __global__ void scale_bounds_kernel( - double* __restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, + double *__restrict__ con_lb, + double *__restrict__ con_ub, + double *__restrict__ con_lb_finite, + double *__restrict__ con_ub_finite, + double *__restrict__ dual_start, int n_cons, double con_scale, double obj_scale); -__global__ void scale_obj_kernel( - double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - 
double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, +__global__ void scale_objective_kernel( + double *__restrict__ c, + double *__restrict__ var_lb, + double *__restrict__ var_ub, + double *__restrict__ var_lb_finite, + double *__restrict__ var_ub_finite, + double *__restrict__ primal_start, int n_vars, double con_scale, double obj_scale); -__global__ void fill_ones_kernel(double* __restrict__ x, int n_vars); +__global__ void fill_ones_kernel(double *__restrict__ x, int n_vars); static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); -static void pock_chambolle_rescaling(pdhg_solver_state_t *state, double alpha, rescale_info_t *rescale_info, +static void pock_chambolle_rescaling(pdhg_solver_state_t *state, const double alpha, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info); @@ -125,12 +125,12 @@ static void scale_problem( invE, n_cons); - csr_scale_nnz_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( + scale_csr_nnz_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ind, state->constraint_matrix->col_ind, state->constraint_matrix->val, state->constraint_matrix_t->val, - state->constraint_matrix->transpose_pos, + state->constraint_matrix->transpose_map, invD, invE, state->constraint_matrix->num_nonzeros); @@ -150,23 +150,23 @@ static void ruiz_rescaling( for (int iter = 0; iter < num_iterations; ++iter) { - csr_row_absmax_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + compute_csr_row_absmax_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, state->constraint_matrix->val, n_cons, E); - clamp_sqrt_and_accum<<num_blocks_dual, THREADS_PER_BLOCK>>>( + clamp_sqrt_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( E, invE, rescale_info->con_rescale, n_cons); - csr_row_absmax_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + compute_csr_row_absmax_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, n_vars, D); - clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( + clamp_sqrt_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( D, invD, rescale_info->var_rescale, @@ -188,32 +188,31 @@ static void pock_chambolle_rescaling( const int n_cons = state->num_constraints; const int n_vars = state->num_variables; - csr_row_powsum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + compute_csr_row_powsum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, state->constraint_matrix->val, n_cons, alpha, E); - clamp_sqrt_and_accum<<num_blocks_dual, THREADS_PER_BLOCK>>>( + clamp_sqrt_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( E, invE, rescale_info->con_rescale, n_cons); - csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + compute_csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, n_vars, 2.0 - alpha, D); - clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( + clamp_sqrt_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( D, invD, rescale_info->var_rescale, n_vars); scale_problem(state, E, D, invE, invD); - } static void 
bound_objective_rescaling( @@ -223,19 +222,19 @@ static void bound_objective_rescaling( const int n_cons = state->num_constraints; const int n_vars = state->num_variables; - double *bnd_norm_sq_cuda = nullptr; - CUDA_CHECK(cudaMalloc(&bnd_norm_sq_cuda, sizeof(double))); - CUDA_CHECK(cudaMemset(bnd_norm_sq_cuda, 0, sizeof(double))); + double *bnd_norm_sq_d = NULL; + CUDA_CHECK(cudaMalloc(&bnd_norm_sq_d, sizeof(double))); + CUDA_CHECK(cudaMemset(bnd_norm_sq_d, 0, sizeof(double))); reduce_bound_norm_sq_kernel<<<1, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(double)>>>( state->constraint_lower_bound, state->constraint_upper_bound, n_cons, - bnd_norm_sq_cuda); + bnd_norm_sq_d); - double bnd_norm_sq = 0.0; - CUDA_CHECK(cudaMemcpy(&bnd_norm_sq, bnd_norm_sq_cuda, sizeof(double), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(bnd_norm_sq_cuda)); - double bnd_norm = sqrt(bnd_norm_sq); + double bnd_norm_sq_h = 0.0; + CUDA_CHECK(cudaMemcpy(&bnd_norm_sq_h, bnd_norm_sq_d, sizeof(double), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaFree(bnd_norm_sq_d)); + double bnd_norm = sqrt(bnd_norm_sq_h); double obj_norm = 0.0; CUBLAS_CHECK(cublasDnrm2(state->blas_handle, @@ -256,7 +255,7 @@ static void bound_objective_rescaling( con_scale, obj_scale); - scale_obj_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + scale_objective_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->objective_vector, state->variable_lower_bound, state->variable_upper_bound, @@ -268,7 +267,7 @@ static void bound_objective_rescaling( obj_scale); rescale_info->con_bound_rescale = con_scale; - rescale_info->obj_vec_rescale = obj_scale; + rescale_info->obj_vec_rescale = obj_scale; } rescale_info_t *rescale_problem( @@ -282,18 +281,18 @@ rescale_info_t *rescale_problem( clock_t start_rescaling = clock(); rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); - CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons*sizeof(double))); - CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars*sizeof(double))); + CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons * sizeof(double))); + CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars * sizeof(double))); fill_ones_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( rescale_info->con_rescale, n_cons); fill_ones_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( rescale_info->var_rescale, n_vars); - double *E=nullptr, *D=nullptr, *invE=nullptr, *invD=nullptr; - CUDA_CHECK(cudaMalloc(&E, n_cons*sizeof(double))); - CUDA_CHECK(cudaMalloc(&D, n_vars*sizeof(double))); - CUDA_CHECK(cudaMalloc(&invE, n_cons*sizeof(double))); - CUDA_CHECK(cudaMalloc(&invD, n_vars*sizeof(double))); + double *E = NULL, *D = NULL, *invE = NULL, *invD = NULL; + CUDA_CHECK(cudaMalloc(&E, n_cons * sizeof(double))); + CUDA_CHECK(cudaMalloc(&D, n_vars * sizeof(double))); + CUDA_CHECK(cudaMalloc(&invE, n_cons * sizeof(double))); + CUDA_CHECK(cudaMalloc(&invD, n_vars * sizeof(double))); if (params->l_inf_ruiz_iterations > 0) { @@ -324,21 +323,22 @@ rescale_info_t *rescale_problem( return rescale_info; } -__global__ void scale_variables_kernel(double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, - const double* __restrict__ D, - const double* __restrict__ invD, +__global__ void scale_variables_kernel(double *__restrict__ c, + double *__restrict__ var_lb, + double *__restrict__ var_ub, + double *__restrict__ var_lb_finite, + double 
*__restrict__ var_ub_finite, + double *__restrict__ primal_start, + const double *__restrict__ D, + const double *__restrict__ invD, int n_vars) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n_vars) return; + if (j >= n_vars) + return; double dj = D[j]; double inv_dj = invD[j]; - c[j] *= inv_dj; + c[j] *= inv_dj; var_lb[j] *= dj; var_ub[j] *= dj; var_lb_finite[j] *= dj; @@ -346,17 +346,18 @@ __global__ void scale_variables_kernel(double* __restrict__ c, primal_start[j] *= dj; } -__global__ void scale_constraints_kernel(double* __restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, - const double* __restrict__ E, - const double* __restrict__ invE, +__global__ void scale_constraints_kernel(double *__restrict__ con_lb, + double *__restrict__ con_ub, + double *__restrict__ con_lb_finite, + double *__restrict__ con_ub_finite, + double *__restrict__ dual_start, + const double *__restrict__ E, + const double *__restrict__ invE, int n_cons) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n_cons) return; + if (i >= n_cons) + return; double inv_ei = invE[i]; double ei = E[i]; con_lb[i] *= inv_ei; @@ -366,13 +367,13 @@ __global__ void scale_constraints_kernel(double* __restrict__ con_lb, dual_start[i] *= ei; } -__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, - const int* __restrict__ col_ind, - double* __restrict__ A_vals, - double* __restrict__ At_vals, - const int* __restrict__ A_to_At, - const double* __restrict__ invD, - const double* __restrict__ invE, +__global__ void scale_csr_nnz_kernel(const int *__restrict__ row_ids, + const int *__restrict__ col_ind, + double *__restrict__ A_vals, + double *__restrict__ At_vals, + const int *__restrict__ A_to_At, + const double *__restrict__ invD, + const double *__restrict__ invE, int nnz) { for (int k = blockIdx.x * blockDim.x + threadIdx.x; @@ -387,52 +388,63 @@ __global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, } } -__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double* __restrict__ out_max) +__global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ vals, + int num_rows, + double *__restrict__ out_max) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; + if (i >= num_rows) + return; int s = row_ptr[i], e = row_ptr[i + 1]; double m = 0.0; - for (int k = s; k < e; ++k) { + for (int k = s; k < e; ++k) + { double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - if (v > m) m = v; + if (!isfinite(v)) + v = 0.0; + if (v > m) + m = v; } out_max[i] = m; } -__device__ __forceinline__ double pow_fast(double v, double p) { - if (p == 2.0) return v * v; - if (p == 1.0) return v; - if (p == 0.5) return sqrt(v); +__device__ __forceinline__ double pow_fast(double v, double p) +{ + if (p == 2.0) + return v * v; + if (p == 1.0) + return v; + if (p == 0.5) + return sqrt(v); return pow(v, p); } -__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double degree, - double* __restrict__ out_sum) +__global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ vals, + int num_rows, + double degree, + double *__restrict__ out_sum) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; + if (i >= num_rows) + return; 
int s = row_ptr[i], e = row_ptr[i + 1]; double acc = 0.0; - for (int k = s; k < e; ++k) { + for (int k = s; k < e; ++k) + { double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; + if (!isfinite(v)) + v = 0.0; acc += pow_fast(v, degree); } out_sum[i] = acc; } -__global__ void clamp_sqrt_and_accum(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - int n_vars) +__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ x, + double *__restrict__ inv_x, + double *__restrict__ cum, + int n_vars) { for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n_vars; t += blockDim.x * gridDim.x) { @@ -445,10 +457,10 @@ __global__ void clamp_sqrt_and_accum(double* __restrict__ x, } __global__ void reduce_bound_norm_sq_kernel( - const double* __restrict__ L, - const double* __restrict__ U, + const double *__restrict__ L, + const double *__restrict__ U, int n_cons, - double* __restrict__ block_sums) + double *__restrict__ block_sums) { extern __shared__ double sdata[]; int tid = threadIdx.x; @@ -456,14 +468,17 @@ __global__ void reduce_bound_norm_sq_kernel( int stride = blockDim.x * gridDim.x; double acc = 0.0; - for (int i = global_tid; i < n_cons; i += stride) { + for (int i = global_tid; i < n_cons; i += stride) + { double Li = L[i], Ui = U[i]; bool fL = isfinite(Li), fU = isfinite(Ui); - if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) { + if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) + { acc += Li * Li; } - if (fU) { + if (fU) + { acc += Ui * Ui; } } @@ -471,60 +486,66 @@ __global__ void reduce_bound_norm_sq_kernel( sdata[tid] = acc; __syncthreads(); - for (int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s) { + for (int s = blockDim.x / 2; s > 0; s >>= 1) + { + if (tid < s) + { sdata[tid] += sdata[tid + s]; } __syncthreads(); } - if (tid == 0) { + if (tid == 0) + { block_sums[blockIdx.x] = sdata[0]; } } __global__ void scale_bounds_kernel( - double* __restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, + double *__restrict__ con_lb, + double *__restrict__ con_ub, + double *__restrict__ con_lb_finite, + double *__restrict__ con_ub_finite, + double *__restrict__ dual_start, int n_cons, double con_scale, double obj_scale) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n_cons) return; - con_lb[i] *= con_scale; - con_ub[i] *= con_scale; + if (i >= n_cons) + return; + con_lb[i] *= con_scale; + con_ub[i] *= con_scale; con_lb_finite[i] *= con_scale; con_ub_finite[i] *= con_scale; - dual_start[i] *= obj_scale; + dual_start[i] *= obj_scale; } -__global__ void scale_obj_kernel( - double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, +__global__ void scale_objective_kernel( + double *__restrict__ c, + double *__restrict__ var_lb, + double *__restrict__ var_ub, + double *__restrict__ var_lb_finite, + double *__restrict__ var_ub_finite, + double *__restrict__ primal_start, int n_vars, double con_scale, double obj_scale) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n_vars) return; - var_lb[j] *= con_scale; - var_ub[j] *= con_scale; + if (j >= n_vars) + return; + var_lb[j] *= con_scale; + var_ub[j] *= con_scale; var_lb_finite[j] *= con_scale; var_ub_finite[j] *= con_scale; - c[j] *= obj_scale; - primal_start[j] *= con_scale; + c[j] *= obj_scale; + primal_start[j] *= con_scale; } 
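// Editorial sketch (assumed helper, not part of this patch): how the two scalar
// factors set by bound_objective_rescaling combine. After scale_bounds_kernel and
// scale_objective_kernel run, the scaled problem uses roughly
//   c_hat = obj_scale * inv(D) * c   and   x_hat = con_scale * D * x,
// so objective-type quantities measured on the scaled problem equal
// con_scale * obj_scale times their values on the original problem. That is why
// the solver divides reported values such as dual_ray_objective by
// (constraint_bound_rescaling * objective_vector_rescaling). A hypothetical
// host-side helper that undoes the scalar part:
static inline double unscale_objective_value(double scaled_value,
                                             double constraint_bound_rescaling,
                                             double objective_vector_rescaling)
{
    return scaled_value / (constraint_bound_rescaling * objective_vector_rescaling);
}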
-__global__ void fill_ones_kernel(double* __restrict__ x, int n_vars) +__global__ void fill_ones_kernel(double *__restrict__ x, int n_vars) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_vars) x[i] = 1.0; + if (i < n_vars) + x[i] = 1.0; } diff --git a/src/solver.cu b/src/solver.cu index c40dfb3..c01d5b8 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -27,57 +27,57 @@ limitations under the License. #include #include -__global__ void build_row_ind(const int* __restrict__ row_ptr, +__global__ void build_row_ind(const int *__restrict__ row_ptr, int num_rows, - int* __restrict__ row_ind); -__global__ void build_transpose_pos(const int* __restrict__ A_row_ind, - const int* __restrict__ A_col_ind, - const int* __restrict__ At_row_ptr, - const int* __restrict__ At_col_ind, + int *__restrict__ row_ind); +__global__ void build_transpose_map(const int *__restrict__ A_row_ind, + const int *__restrict__ A_col_ind, + const int *__restrict__ At_row_ptr, + const int *__restrict__ At_col_ind, int nnz, - int* __restrict__ A_to_At); -__global__ void compute_next_pdhg_primal_solution_kernel(const double *current_primal, - double *reflected_primal, + int *__restrict__ A_to_At); +__global__ void compute_next_pdhg_primal_solution_kernel(const double *current_primal, + double *reflected_primal, const double *dual_product, - const double *objective, - const double *var_lb, + const double *objective, + const double *var_lb, const double *var_ub, - int n, + int n, double step_size); -__global__ void compute_next_pdhg_primal_solution_major_kernel(const double *current_primal, - double *pdhg_primal, +__global__ void compute_next_pdhg_primal_solution_major_kernel(const double *current_primal, + double *pdhg_primal, double *reflected_primal, - const double *dual_product, - const double *objective, + const double *dual_product, + const double *objective, const double *var_lb, - const double *var_ub, - int n, - double step_size, + const double *var_ub, + int n, + double step_size, double *dual_slack); -__global__ void compute_next_pdhg_dual_solution_kernel(const double *current_dual, - double *reflected_dual, +__global__ void compute_next_pdhg_dual_solution_kernel(const double *current_dual, + double *reflected_dual, const double *primal_product, - const double *const_lb, - const double *const_ub, - int n, + const double *const_lb, + const double *const_ub, + int n, double step_size); -__global__ void compute_next_pdhg_dual_solution_major_kernel(const double *current_dual, - double *pdhg_dual, +__global__ void compute_next_pdhg_dual_solution_major_kernel(const double *current_dual, + double *pdhg_dual, double *reflected_dual, - const double *primal_product, - const double *const_lb, + const double *primal_product, + const double *const_lb, const double *const_ub, - int n, + int n, double step_size); -__global__ void halpern_update_kernel(const double *initial_primal, - double *current_primal, +__global__ void halpern_update_kernel(const double *initial_primal, + double *current_primal, const double *reflected_primal, - const double *initial_dual, - double *current_dual, + const double *initial_dual, + double *current_dual, const double *reflected_dual, - int n_vars, - int n_cons, - double weight, + int n_vars, + int n_cons, + double weight, double reflection_coeff); __global__ void rescale_solution_kernel(double *primal_solution, double *dual_solution, @@ -86,13 +86,13 @@ __global__ void rescale_solution_kernel(double *primal_solution, const double objective_vector_rescaling, const double 
constraint_bound_rescaling, int n_vars, int n_cons); -__global__ void compute_delta_solution_kernel(const double *initial_primal, - const double *pdhg_primal, +__global__ void compute_delta_solution_kernel(const double *initial_primal, + const double *pdhg_primal, double *delta_primal, - const double *initial_dual, - const double *pdhg_dual, + const double *initial_dual, + const double *pdhg_dual, double *delta_dual, - int n_vars, + int n_vars, int n_cons); static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state); static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state); @@ -101,12 +101,12 @@ static void rescale_solution(pdhg_solver_state_t *state); static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state); static void perform_restart(pdhg_solver_state_t *state, const pdhg_parameters_t *params); static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, const pdhg_parameters_t *params); -static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *original_problem, const pdhg_parameters_t* params); +static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *original_problem, const pdhg_parameters_t *params); static void compute_fixed_point_error(pdhg_solver_state_t *state); void pdhg_solver_state_free(pdhg_solver_state_t *state); void rescale_info_free(rescale_info_t *info); -static void perform_primal_restart(pdhg_solver_state_t *state); +static void perform_primal_restart(pdhg_solver_state_t *state); static void perform_dual_restart(pdhg_solver_state_t *state); void primal_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state); void dual_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state); @@ -120,13 +120,12 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( static pdhg_solver_state_t *initialize_dual_feas_polish_state( const pdhg_solver_state_t *original_state); - cupdlpx_result_t *optimize(const pdhg_parameters_t *params, const lp_problem_t *original_problem) { print_initial_info(params, original_problem); pdhg_solver_state_t *state = initialize_solver_state(original_problem, params); - + initialize_step_size_and_primal_weight(state, params); clock_t start_time = clock(); bool do_restart = false; @@ -182,8 +181,8 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, pdhg_final_log(state, params->verbose, state->termination_reason); - if (params->feasibility_polishing && - state->termination_reason != TERMINATION_REASON_DUAL_INFEASIBLE && + if (params->feasibility_polishing && + state->termination_reason != TERMINATION_REASON_DUAL_INFEASIBLE && state->termination_reason != TERMINATION_REASON_PRIMAL_INFEASIBLE) { feasibility_polish(params, state); @@ -196,7 +195,7 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, static pdhg_solver_state_t *initialize_solver_state( const lp_problem_t *original_problem, - const pdhg_parameters_t* params) + const pdhg_parameters_t *params) { pdhg_solver_state_t *state = (pdhg_solver_state_t *)safe_calloc(1, sizeof(pdhg_solver_state_t)); @@ -227,7 +226,7 @@ static pdhg_solver_state_t *initialize_solver_state( state->num_blocks_dual = (state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; state->num_blocks_primal_dual = (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; state->num_blocks_nnz = (state->constraint_matrix->num_nonzeros + 
THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - + CUSPARSE_CHECK(cusparseCreate(&state->sparse_handle)); CUBLAS_CHECK(cublasCreate(&state->blas_handle)); CUBLAS_CHECK(cublasSetPointerMode(state->blas_handle, CUBLAS_POINTER_MODE_HOST)); @@ -273,15 +272,15 @@ static pdhg_solver_state_t *initialize_solver_state( state->constraint_matrix_t->row_ptr, n_vars, state->constraint_matrix_t->row_ind); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMalloc(&state->constraint_matrix->transpose_pos, nnz * sizeof(int))); - state->constraint_matrix_t->transpose_pos = NULL; - build_transpose_pos<<num_blocks_nnz, THREADS_PER_BLOCK>>>( + CUDA_CHECK(cudaMalloc(&state->constraint_matrix->transpose_map, nnz * sizeof(int))); + state->constraint_matrix_t->transpose_map = NULL; + build_transpose_map<<num_blocks_nnz, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ind, state->constraint_matrix->col_ind, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, nnz, - state->constraint_matrix->transpose_pos); + state->constraint_matrix->transpose_map); CUDA_CHECK(cudaGetLastError()); ALLOC_AND_COPY(state->variable_lower_bound, original_problem->variable_lower_bound, var_bytes); @@ -329,24 +328,24 @@ static pdhg_solver_state_t *initialize_solver_state( if (original_problem->primal_start) { - CUDA_CHECK(cudaMemcpy(state->initial_primal_solution, - original_problem->primal_start, - var_bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(state->initial_primal_solution, + original_problem->primal_start, + var_bytes, cudaMemcpyHostToDevice)); } if (original_problem->dual_start) { - CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, - original_problem->dual_start, - con_bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, + original_problem->dual_start, + con_bytes, cudaMemcpyHostToDevice)); } rescale_info_t *rescale_info = rescale_problem(params, state); state->constraint_rescaling = rescale_info->con_rescale; - state->variable_rescaling = rescale_info->var_rescale; + state->variable_rescaling = rescale_info->var_rescale; state->constraint_bound_rescaling = rescale_info->con_bound_rescale; state->objective_vector_rescaling = rescale_info->obj_vec_rescale; - state->rescaling_time_sec = rescale_info->rescaling_time_sec; + state->rescaling_time_sec = rescale_info->rescaling_time_sec; rescale_info->con_rescale = NULL; rescale_info->var_rescale = NULL; @@ -356,8 +355,8 @@ static pdhg_solver_state_t *initialize_solver_state( state->constraint_matrix->row_ind = NULL; CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); state->constraint_matrix_t->row_ind = NULL; - CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_pos)); - state->constraint_matrix->transpose_pos = NULL; + CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_map)); + state->constraint_matrix->transpose_map = NULL; double sum_of_squares = 0.0; for (int i = 0; i < n_vars; ++i) @@ -401,7 +400,7 @@ static pdhg_solver_state_t *initialize_solver_state( CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F)); CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F)); CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size)); - + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, 
CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size)); CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size)); CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -431,49 +430,55 @@ static pdhg_solver_state_t *initialize_solver_state( return state; } -__global__ void build_row_ind(const int* __restrict__ row_ptr, +__global__ void build_row_ind(const int *__restrict__ row_ptr, int num_rows, - int* __restrict__ row_ind) + int *__restrict__ row_ind) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; + if (i >= num_rows) + return; int s = row_ptr[i]; int e = row_ptr[i + 1]; - for (int k = s; k < e; ++k) { + for (int k = s; k < e; ++k) + { row_ind[k] = i; } } -__global__ void build_transpose_pos( - const int* __restrict__ A_row_ind, - const int* __restrict__ A_col_ind, - const int* __restrict__ At_row_ptr, - const int* __restrict__ At_col_ind, +__global__ void build_transpose_map( + const int *__restrict__ A_row_ind, + const int *__restrict__ A_col_ind, + const int *__restrict__ At_row_ptr, + const int *__restrict__ At_col_ind, int nnz, - int* __restrict__ A_to_At) + int *__restrict__ A_to_At) { int k = blockIdx.x * blockDim.x + threadIdx.x; - if (k >= nnz) return; + if (k >= nnz) + return; - int i = A_row_ind[k]; - int j = A_col_ind[k]; + int i = A_row_ind[k]; + int j = A_col_ind[k]; int start = At_row_ptr[j]; - int end = At_row_ptr[j + 1]; + int end = At_row_ptr[j + 1]; int pos = -1; - for (int idx = start; idx < end; ++idx) { - if (At_col_ind[idx] == i) { + for (int idx = start; idx < end; ++idx) + { + if (At_col_ind[idx] == i) + { pos = idx; break; } } - if (pos < 0) return; + if (pos < 0) + return; - A_to_At[k] = pos; + A_to_At[k] = pos; } __global__ void compute_next_pdhg_primal_solution_kernel( @@ -838,7 +843,7 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) if (state->vec_dual_prod) CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_prod)); if (state->primal_spmv_buffer) - CUDA_CHECK(cudaFree(state->primal_spmv_buffer)); + CUDA_CHECK(cudaFree(state->primal_spmv_buffer)); if (state->dual_spmv_buffer) CUDA_CHECK(cudaFree(state->dual_spmv_buffer)); if (state->sparse_handle) @@ -860,8 +865,8 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind)); if (state->constraint_matrix->val) CUDA_CHECK(cudaFree(state->constraint_matrix->val)); - if (state->constraint_matrix->transpose_pos) - CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_pos)); + if (state->constraint_matrix->transpose_map) + CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_map)); if (state->constraint_matrix_t->row_ptr) CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ptr)); if (state->constraint_matrix_t->col_ind) @@ -870,8 +875,8 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); if (state->constraint_matrix_t->val) CUDA_CHECK(cudaFree(state->constraint_matrix_t->val)); - if (state->constraint_matrix_t->transpose_pos) - CUDA_CHECK(cudaFree(state->constraint_matrix_t->transpose_pos)); + if (state->constraint_matrix_t->transpose_map) + CUDA_CHECK(cudaFree(state->constraint_matrix_t->transpose_map)); if (state->constraint_lower_bound) CUDA_CHECK(cudaFree(state->constraint_lower_bound)); if (state->constraint_upper_bound) @@ -1013,14 +1018,14 @@ void 
set_default_parameters(pdhg_parameters_t *params) params->restart_params.i_smooth = 0.3; } -//Feasibility Polishing +// Feasibility Polishing void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state) { clock_t start_time = clock(); if (state->relative_primal_residual < params->termination_criteria.eps_feas_polish_relative && state->relative_dual_residual < params->termination_criteria.eps_feas_polish_relative) { - + printf("Skipping feasibility polishing as the solution is already sufficiently feasible.\n"); return; } @@ -1034,7 +1039,7 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st original_primal_weight = (state->objective_vector_norm + 1.0) / (state->constraint_bound_norm + 1.0); } - //PRIMAL FEASIBILITY POLISHING + // PRIMAL FEASIBILITY POLISHING pdhg_solver_state_t *primal_state = initialize_primal_feas_polish_state(state); primal_state->primal_weight = original_primal_weight; primal_state->best_primal_weight = original_primal_weight; @@ -1050,8 +1055,8 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st state->primal_objective_value = primal_state->primal_objective_value; } state->feasibility_iteration += primal_state->total_count - 1; - - //DUAL FEASIBILITY POLISHING + + // DUAL FEASIBILITY POLISHING pdhg_solver_state_t *dual_state = initialize_dual_feas_polish_state(state); dual_state->primal_weight = original_primal_weight; dual_state->best_primal_weight = original_primal_weight; @@ -1070,7 +1075,7 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st state->objective_gap = fabs(state->primal_objective_value - state->dual_objective_value); state->relative_objective_gap = state->objective_gap / (1.0 + fabs(state->primal_objective_value) + fabs(state->dual_objective_value)); - + // FINAL LOGGING pdhg_feas_polish_final_log(primal_state, dual_state, params->verbose); primal_feas_polish_state_free(primal_state); @@ -1184,22 +1189,22 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( CUDA_CHECK(cudaMalloc(&dest, bytes)); \ CUDA_CHECK(cudaMemset(dest, 0, bytes)); - //RESET PROBLEM TO FEASIBILITY PROBLEM + // RESET PROBLEM TO FEASIBILITY PROBLEM ALLOC_ZERO(primal_state->objective_vector, num_var * sizeof(double)); primal_state->objective_constant = 0.0; -#define ALLOC_AND_COPY_DEV(dest, src, bytes) \ - CUDA_CHECK(cudaMalloc(&dest, bytes)); \ +#define ALLOC_AND_COPY_DEV(dest, src, bytes) \ + CUDA_CHECK(cudaMalloc(&dest, bytes)); \ CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice)); - //ALLOCATE AND COPY SOLUTION VECTORS + // ALLOCATE AND COPY SOLUTION VECTORS ALLOC_AND_COPY_DEV(primal_state->initial_primal_solution, original_state->initial_primal_solution, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(primal_state->current_primal_solution, original_state->current_primal_solution, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(primal_state->pdhg_primal_solution, original_state->pdhg_primal_solution, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(primal_state->reflected_primal_solution, original_state->reflected_primal_solution, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(primal_state->primal_product, original_state->primal_product, num_cons * sizeof(double)); - //ALLOC ZERO FOR OTHERS + // ALLOC ZERO FOR OTHERS ALLOC_ZERO(primal_state->initial_dual_solution, num_cons * sizeof(double)); ALLOC_ZERO(primal_state->current_dual_solution, num_cons * sizeof(double)); ALLOC_ZERO(primal_state->pdhg_dual_solution, num_cons * sizeof(double)); @@ 
-1213,7 +1218,7 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( ALLOC_ZERO(primal_state->delta_primal_solution, num_var * sizeof(double)); ALLOC_ZERO(primal_state->delta_dual_solution, num_cons * sizeof(double)); - //RESET SCALAR + // RESET SCALAR primal_state->primal_weight_error_sum = 0.0; primal_state->primal_weight_last_error = 0.0; primal_state->best_primal_weight = 0.0; @@ -1229,7 +1234,7 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( primal_state->cumulative_time_sec = 0.0; primal_state->best_primal_dual_residual_gap = INFINITY; - //IGNORE DUAL RESIDUAL AND OBJECTIVE GAP + // IGNORE DUAL RESIDUAL AND OBJECTIVE GAP primal_state->relative_dual_residual = 0.0; primal_state->absolute_dual_residual = 0.0; primal_state->relative_objective_gap = 0.0; @@ -1240,13 +1245,15 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state( void primal_feas_polish_state_free(pdhg_solver_state_t *state) { - #define SAFE_CUDA_FREE(p) \ - if ((p) != NULL) { \ - CUDA_CHECK(cudaFree(p));\ - (p) = NULL; \ - } \ - - if (!state) return; +#define SAFE_CUDA_FREE(p) \ + if ((p) != NULL) \ + { \ + CUDA_CHECK(cudaFree(p)); \ + (p) = NULL; \ + } + + if (!state) + return; SAFE_CUDA_FREE(state->objective_vector); SAFE_CUDA_FREE(state->initial_primal_solution); SAFE_CUDA_FREE(state->current_primal_solution); @@ -1259,7 +1266,7 @@ void primal_feas_polish_state_free(pdhg_solver_state_t *state) SAFE_CUDA_FREE(state->pdhg_dual_solution); SAFE_CUDA_FREE(state->reflected_dual_solution); SAFE_CUDA_FREE(state->primal_product); - + SAFE_CUDA_FREE(state->primal_slack); SAFE_CUDA_FREE(state->dual_slack); SAFE_CUDA_FREE(state->primal_residual); @@ -1279,7 +1286,6 @@ __global__ void zero_finite_value_vectors_kernel( vec[idx] = 0.0; } } - static pdhg_solver_state_t *initialize_dual_feas_polish_state( const pdhg_solver_state_t *original_state) @@ -1289,18 +1295,18 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( int num_var = original_state->num_variables; int num_cons = original_state->num_constraints; - #define ALLOC_AND_COPY_DEV(dest, src, bytes) \ - CUDA_CHECK(cudaMalloc(&dest, bytes)); \ - CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice)); - - //RESET PROBLEM TO DUAL FEASIBILITY PROBLEM - #define SET_FINITE_TO_ZERO(vec, n) \ - { \ - int threads = 256; \ - int blocks = (n + threads - 1) / threads; \ - zero_finite_value_vectors_kernel<<>>(vec, n); \ - CUDA_CHECK(cudaDeviceSynchronize()); \ - } +#define ALLOC_AND_COPY_DEV(dest, src, bytes) \ + CUDA_CHECK(cudaMalloc(&dest, bytes)); \ + CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice)); + +// RESET PROBLEM TO DUAL FEASIBILITY PROBLEM +#define SET_FINITE_TO_ZERO(vec, n) \ + { \ + int threads = 256; \ + int blocks = (n + threads - 1) / threads; \ + zero_finite_value_vectors_kernel<<>>(vec, n); \ + CUDA_CHECK(cudaDeviceSynchronize()); \ + } ALLOC_AND_COPY_DEV(dual_state->constraint_lower_bound, original_state->constraint_lower_bound, num_cons * sizeof(double)); ALLOC_AND_COPY_DEV(dual_state->constraint_upper_bound, original_state->constraint_upper_bound, num_cons * sizeof(double)); @@ -1312,16 +1318,16 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( SET_FINITE_TO_ZERO(dual_state->variable_lower_bound, num_var); SET_FINITE_TO_ZERO(dual_state->variable_upper_bound, num_var); - #define ALLOC_ZERO(dest, bytes) \ - CUDA_CHECK(cudaMalloc(&dest, bytes)); \ - CUDA_CHECK(cudaMemset(dest, 0, bytes)); +#define ALLOC_ZERO(dest, bytes) \ + CUDA_CHECK(cudaMalloc(&dest, bytes)); \ + 
CUDA_CHECK(cudaMemset(dest, 0, bytes)); ALLOC_ZERO(dual_state->constraint_lower_bound_finite_val, num_cons * sizeof(double)); ALLOC_ZERO(dual_state->constraint_upper_bound_finite_val, num_cons * sizeof(double)); ALLOC_ZERO(dual_state->variable_lower_bound_finite_val, num_var * sizeof(double)); ALLOC_ZERO(dual_state->variable_upper_bound_finite_val, num_var * sizeof(double)); - //ALLOCATE AND COPY SOLUTION VECTORS + // ALLOCATE AND COPY SOLUTION VECTORS ALLOC_AND_COPY_DEV(dual_state->initial_dual_solution, original_state->initial_dual_solution, num_cons * sizeof(double)); ALLOC_AND_COPY_DEV(dual_state->current_dual_solution, original_state->current_dual_solution, num_cons * sizeof(double)); ALLOC_AND_COPY_DEV(dual_state->pdhg_dual_solution, original_state->pdhg_dual_solution, num_cons * sizeof(double)); @@ -1329,7 +1335,7 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( ALLOC_AND_COPY_DEV(dual_state->dual_product, original_state->dual_product, num_var * sizeof(double)); ALLOC_AND_COPY_DEV(dual_state->dual_slack, original_state->dual_slack, num_var * sizeof(double)); - //ALLOC ZERO FOR OTHERS + // ALLOC ZERO FOR OTHERS ALLOC_ZERO(dual_state->initial_primal_solution, num_var * sizeof(double)); ALLOC_ZERO(dual_state->current_primal_solution, num_var * sizeof(double)); ALLOC_ZERO(dual_state->pdhg_primal_solution, num_var * sizeof(double)); @@ -1341,7 +1347,7 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( ALLOC_ZERO(dual_state->delta_primal_solution, num_var * sizeof(double)); ALLOC_ZERO(dual_state->delta_dual_solution, num_cons * sizeof(double)); - //RESET SCALAR + // RESET SCALAR dual_state->primal_weight_error_sum = 0.0; dual_state->primal_weight_last_error = 0.0; dual_state->best_primal_weight = 0.0; @@ -1357,7 +1363,7 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( dual_state->cumulative_time_sec = 0.0; dual_state->best_primal_dual_residual_gap = INFINITY; - //IGNORE PRIMAL RESIDUAL AND OBJECTIVE GAP + // IGNORE PRIMAL RESIDUAL AND OBJECTIVE GAP dual_state->relative_primal_residual = 0.0; dual_state->absolute_primal_residual = 0.0; dual_state->relative_objective_gap = 0.0; @@ -1367,13 +1373,15 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state( void dual_feas_polish_state_free(pdhg_solver_state_t *state) { - #define SAFE_CUDA_FREE(p) \ - if ((p) != NULL) { \ - CUDA_CHECK(cudaFree(p));\ - (p) = NULL; \ - } \ - - if (!state) return; +#define SAFE_CUDA_FREE(p) \ + if ((p) != NULL) \ + { \ + CUDA_CHECK(cudaFree(p)); \ + (p) = NULL; \ + } + + if (!state) + return; SAFE_CUDA_FREE(state->constraint_lower_bound); SAFE_CUDA_FREE(state->constraint_upper_bound); SAFE_CUDA_FREE(state->variable_lower_bound); @@ -1394,7 +1402,7 @@ void dual_feas_polish_state_free(pdhg_solver_state_t *state) SAFE_CUDA_FREE(state->pdhg_dual_solution); SAFE_CUDA_FREE(state->reflected_dual_solution); SAFE_CUDA_FREE(state->primal_product); - + SAFE_CUDA_FREE(state->primal_slack); SAFE_CUDA_FREE(state->dual_slack); SAFE_CUDA_FREE(state->primal_residual); @@ -1473,5 +1481,3 @@ static void compute_dual_fixed_point_error(pdhg_solver_state_t *state) &dual_norm)); state->fixed_point_error = dual_norm * dual_norm / state->primal_weight; } - - From f13f5725ecc2ef2b500b2f67e86f4df83a2af852 Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Thu, 27 Nov 2025 18:57:02 -0500 Subject: [PATCH 10/13] Clean code: rename variables --- src/preconditioner.cu | 468 +++++++++++++++++++++--------------------- 1 file changed, 234 insertions(+), 234 deletions(-) diff --git 
a/src/preconditioner.cu b/src/preconditioner.cu index 031051f..1f711cf 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -24,85 +24,85 @@ limitations under the License. #define SCALING_EPSILON 1e-8 -__global__ void scale_variables_kernel(double *__restrict__ c, - double *__restrict__ var_lb, - double *__restrict__ var_ub, - double *__restrict__ var_lb_finite, - double *__restrict__ var_ub_finite, - double *__restrict__ primal_start, - const double *__restrict__ D, - const double *__restrict__ invD, - int n_vars); -__global__ void scale_constraints_kernel(double *__restrict__ con_lb, - double *__restrict__ con_ub, - double *__restrict__ con_lb_finite, - double *__restrict__ con_ub_finite, - double *__restrict__ dual_start, - const double *__restrict__ E, - const double *__restrict__ invE, - int n_cons); -__global__ void scale_csr_nnz_kernel(const int *__restrict__ row_ids, - const int *__restrict__ col_ind, - double *__restrict__ A_vals, - double *__restrict__ At_vals, - const int *__restrict__ A_to_At, - const double *__restrict__ invD, - const double *__restrict__ invE, +__global__ void scale_variables_kernel(double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ variable_lower_bound_finite_val, + double *__restrict__ variable_upper_bound_finite_val, + double *__restrict__ initial_primal_solution, + const double *__restrict__ variable_rescaling, + const double *__restrict__ inverse_variable_rescaling, + int num_variables); +__global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ constraint_lower_bound_finite_val, + double *__restrict__ constraint_upper_bound_finite_val, + double *__restrict__ initial_dual_solution, + const double *__restrict__ constraint_rescaling, + const double *__restrict__ inverse_constraint_rescaling, + int num_constraints); +__global__ void scale_csr_nnz_kernel(const int *__restrict__ constraint_row_ind, + const int *__restrict__ constraint_col_ind, + double *__restrict__ constraint_matrix_val, + double *__restrict__ constraint_matrix_transpose_val, + const int *__restrict__ constraint_to_transpose_position, + const double *__restrict__ inverse_variable_rescaling, + const double *__restrict__ inverse_constraint_rescaling, int nnz); __global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, - const double *__restrict__ vals, + const double *__restrict__ matrix_vals, int num_rows, - double *__restrict__ out_max); + double *__restrict__ row_absmax_values); __global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, - const double *__restrict__ vals, + const double *__restrict__ matrix_vals, int num_rows, double degree, - double *__restrict__ out_sum); -__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ x, - double *__restrict__ inv_x, - double *__restrict__ cum, - int n_vars); + double *__restrict__ row_powsum_values); +__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors, + double *__restrict__ inverse_scaling_factors, + double *__restrict__ cumulative_rescaling, + int num_variables); __global__ void reduce_bound_norm_sq_kernel( - const double *__restrict__ L, - const double *__restrict__ U, - int n_cons, + const double *__restrict__ constraint_lower_bound, + const double *__restrict__ constraint_upper_bound, + int num_constraints, double *__restrict__ block_sums); __global__ 
void scale_bounds_kernel( - double *__restrict__ con_lb, - double *__restrict__ con_ub, - double *__restrict__ con_lb_finite, - double *__restrict__ con_ub_finite, - double *__restrict__ dual_start, - int n_cons, - double con_scale, - double obj_scale); + double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ constraint_lower_bound_finite_val, + double *__restrict__ constraint_upper_bound_finite_val, + double *__restrict__ initial_dual_solution, + int num_constraints, + double constraint_scale, + double objective_scale); __global__ void scale_objective_kernel( - double *__restrict__ c, - double *__restrict__ var_lb, - double *__restrict__ var_ub, - double *__restrict__ var_lb_finite, - double *__restrict__ var_ub_finite, - double *__restrict__ primal_start, - int n_vars, - double con_scale, - double obj_scale); -__global__ void fill_ones_kernel(double *__restrict__ x, int n_vars); -static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); + double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ variable_lower_bound_finite_val, + double *__restrict__ variable_upper_bound_finite_val, + double *__restrict__ initial_primal_solution, + int num_variables, + double constraint_scale, + double objective_scale); +__global__ void fill_ones_kernel(double *__restrict__ x, int num_variables); +static void scale_problem(pdhg_solver_state_t *state, double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling); static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, - double *E, double *D, double *invE, double *invD); + double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling); static void pock_chambolle_rescaling(pdhg_solver_state_t *state, const double alpha, rescale_info_t *rescale_info, - double *E, double *D, double *invE, double *invD); + double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling); static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info); static void scale_problem( pdhg_solver_state_t *state, - double *E, - double *D, - double *invE, - double *invD) + double *constraint_rescaling, + double *variable_rescaling, + double *inverse_constraint_rescaling, + double *inverse_variable_rescaling) { - int n_vars = state->num_variables; - int n_cons = state->num_constraints; + int num_variables = state->num_variables; + int num_constraints = state->num_constraints; scale_variables_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->objective_vector, @@ -111,9 +111,9 @@ static void scale_problem( state->variable_lower_bound_finite_val, state->variable_upper_bound_finite_val, state->initial_primal_solution, - D, - invD, - n_vars); + variable_rescaling, + inverse_variable_rescaling, + num_variables); scale_constraints_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_lower_bound, @@ -121,9 +121,9 @@ static void scale_problem( state->constraint_lower_bound_finite_val, state->constraint_upper_bound_finite_val, state->initial_dual_solution, - E, - invE, - n_cons); + constraint_rescaling, + inverse_constraint_rescaling, + num_constraints); scale_csr_nnz_kernel<<num_blocks_nnz, 
THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ind, @@ -131,8 +131,8 @@ static void scale_problem( state->constraint_matrix->val, state->constraint_matrix_t->val, state->constraint_matrix->transpose_map, - invD, - invE, + inverse_variable_rescaling, + inverse_constraint_rescaling, state->constraint_matrix->num_nonzeros); } @@ -140,39 +140,39 @@ static void ruiz_rescaling( pdhg_solver_state_t *state, int num_iterations, rescale_info_t *rescale_info, - double *E, - double *D, - double *invE, - double *invD) + double *constraint_rescaling, + double *variable_rescaling, + double *inverse_constraint_rescaling, + double *inverse_variable_rescaling) { - const int n_cons = state->num_constraints; - const int n_vars = state->num_variables; + const int num_constraints = state->num_constraints; + const int num_variables = state->num_variables; for (int iter = 0; iter < num_iterations; ++iter) { compute_csr_row_absmax_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, state->constraint_matrix->val, - n_cons, - E); + num_constraints, + constraint_rescaling); clamp_sqrt_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( - E, - invE, + constraint_rescaling, + inverse_constraint_rescaling, rescale_info->con_rescale, - n_cons); + num_constraints); compute_csr_row_absmax_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, - n_vars, - D); + num_variables, + variable_rescaling); clamp_sqrt_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( - D, - invD, + variable_rescaling, + inverse_variable_rescaling, rescale_info->var_rescale, - n_vars); + num_variables); - scale_problem(state, E, D, invE, invD); + scale_problem(state, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); } } @@ -180,47 +180,47 @@ static void pock_chambolle_rescaling( pdhg_solver_state_t *state, const double alpha, rescale_info_t *rescale_info, - double *E, - double *D, - double *invE, - double *invD) + double *constraint_rescaling, + double *variable_rescaling, + double *inverse_constraint_rescaling, + double *inverse_variable_rescaling) { - const int n_cons = state->num_constraints; - const int n_vars = state->num_variables; + const int num_constraints = state->num_constraints; + const int num_variables = state->num_variables; compute_csr_row_powsum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, state->constraint_matrix->val, - n_cons, + num_constraints, alpha, - E); + constraint_rescaling); clamp_sqrt_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( - E, - invE, + constraint_rescaling, + inverse_constraint_rescaling, rescale_info->con_rescale, - n_cons); + num_constraints); compute_csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, - n_vars, + num_variables, 2.0 - alpha, - D); + variable_rescaling); clamp_sqrt_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( - D, - invD, + variable_rescaling, + inverse_variable_rescaling, rescale_info->var_rescale, - n_vars); + num_variables); - scale_problem(state, E, D, invE, invD); + scale_problem(state, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); } static void bound_objective_rescaling( pdhg_solver_state_t *state, rescale_info_t *rescale_info) { - const int n_cons = state->num_constraints; - const int n_vars = state->num_variables; + const int 
num_constraints = state->num_constraints; + const int num_variables = state->num_variables; double *bnd_norm_sq_d = NULL; CUDA_CHECK(cudaMalloc(&bnd_norm_sq_d, sizeof(double))); @@ -228,7 +228,7 @@ static void bound_objective_rescaling( reduce_bound_norm_sq_kernel<<<1, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(double)>>>( state->constraint_lower_bound, state->constraint_upper_bound, - n_cons, + num_constraints, bnd_norm_sq_d); double bnd_norm_sq_h = 0.0; @@ -242,8 +242,8 @@ static void bound_objective_rescaling( state->objective_vector, 1, &obj_norm)); - double con_scale = 1.0 / (bnd_norm + 1.0); - double obj_scale = 1.0 / (obj_norm + 1.0); + double constraint_scale = 1.0 / (bnd_norm + 1.0); + double objective_scale = 1.0 / (obj_norm + 1.0); scale_bounds_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_lower_bound, @@ -251,9 +251,9 @@ static void bound_objective_rescaling( state->constraint_lower_bound_finite_val, state->constraint_upper_bound_finite_val, state->initial_dual_solution, - n_cons, - con_scale, - obj_scale); + num_constraints, + constraint_scale, + objective_scale); scale_objective_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->objective_vector, @@ -262,12 +262,12 @@ static void bound_objective_rescaling( state->variable_lower_bound_finite_val, state->variable_upper_bound_finite_val, state->initial_primal_solution, - n_vars, - con_scale, - obj_scale); + num_variables, + constraint_scale, + objective_scale); - rescale_info->con_bound_rescale = con_scale; - rescale_info->obj_vec_rescale = obj_scale; + rescale_info->con_bound_rescale = constraint_scale; + rescale_info->obj_vec_rescale = objective_scale; } rescale_info_t *rescale_problem( @@ -276,33 +276,33 @@ rescale_info_t *rescale_problem( { printf("[Precondition] Start\n"); - int n_vars = state->num_variables; - int n_cons = state->num_constraints; + int num_variables = state->num_variables; + int num_constraints = state->num_constraints; clock_t start_rescaling = clock(); rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); - CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons * sizeof(double))); - CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars * sizeof(double))); + CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, num_variables * sizeof(double))); fill_ones_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( - rescale_info->con_rescale, n_cons); + rescale_info->con_rescale, num_constraints); fill_ones_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( - rescale_info->var_rescale, n_vars); + rescale_info->var_rescale, num_variables); - double *E = NULL, *D = NULL, *invE = NULL, *invD = NULL; - CUDA_CHECK(cudaMalloc(&E, n_cons * sizeof(double))); - CUDA_CHECK(cudaMalloc(&D, n_vars * sizeof(double))); - CUDA_CHECK(cudaMalloc(&invE, n_cons * sizeof(double))); - CUDA_CHECK(cudaMalloc(&invD, n_vars * sizeof(double))); + double *constraint_rescaling = NULL, *variable_rescaling = NULL, *inverse_constraint_rescaling = NULL, *inverse_variable_rescaling = NULL; + CUDA_CHECK(cudaMalloc(&constraint_rescaling, num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&variable_rescaling, num_variables * sizeof(double))); + CUDA_CHECK(cudaMalloc(&inverse_constraint_rescaling, num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&inverse_variable_rescaling, num_variables * sizeof(double))); if (params->l_inf_ruiz_iterations > 0) { printf("[Precondition] Ruiz scaling (%d 
iterations)\n", params->l_inf_ruiz_iterations); - ruiz_rescaling(state, params->l_inf_ruiz_iterations, rescale_info, E, D, invE, invD); + ruiz_rescaling(state, params->l_inf_ruiz_iterations, rescale_info, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); } if (params->has_pock_chambolle_alpha) { printf("[Precondition] Pock-Chambolle scaling (alpha=%.4f)\n", params->pock_chambolle_alpha); - pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, E, D, invE, invD); + pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); } rescale_info->con_bound_rescale = 1.0; @@ -315,83 +315,83 @@ rescale_info_t *rescale_problem( rescale_info->rescaling_time_sec = (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; - CUDA_CHECK(cudaFree(E)); - CUDA_CHECK(cudaFree(D)); - CUDA_CHECK(cudaFree(invE)); - CUDA_CHECK(cudaFree(invD)); + CUDA_CHECK(cudaFree(constraint_rescaling)); + CUDA_CHECK(cudaFree(variable_rescaling)); + CUDA_CHECK(cudaFree(inverse_constraint_rescaling)); + CUDA_CHECK(cudaFree(inverse_variable_rescaling)); return rescale_info; } -__global__ void scale_variables_kernel(double *__restrict__ c, - double *__restrict__ var_lb, - double *__restrict__ var_ub, - double *__restrict__ var_lb_finite, - double *__restrict__ var_ub_finite, - double *__restrict__ primal_start, - const double *__restrict__ D, - const double *__restrict__ invD, - int n_vars) +__global__ void scale_variables_kernel(double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ variable_lower_bound_finite_val, + double *__restrict__ variable_upper_bound_finite_val, + double *__restrict__ initial_primal_solution, + const double *__restrict__ variable_rescaling, + const double *__restrict__ inverse_variable_rescaling, + int num_variables) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n_vars) + if (j >= num_variables) return; - double dj = D[j]; - double inv_dj = invD[j]; - c[j] *= inv_dj; - var_lb[j] *= dj; - var_ub[j] *= dj; - var_lb_finite[j] *= dj; - var_ub_finite[j] *= dj; - primal_start[j] *= dj; + double dj = variable_rescaling[j]; + double inv_dj = inverse_variable_rescaling[j]; + objective_vector[j] *= inv_dj; + variable_lower_bound[j] *= dj; + variable_upper_bound[j] *= dj; + variable_lower_bound_finite_val[j] *= dj; + variable_upper_bound_finite_val[j] *= dj; + initial_primal_solution[j] *= dj; } -__global__ void scale_constraints_kernel(double *__restrict__ con_lb, - double *__restrict__ con_ub, - double *__restrict__ con_lb_finite, - double *__restrict__ con_ub_finite, - double *__restrict__ dual_start, - const double *__restrict__ E, - const double *__restrict__ invE, - int n_cons) +__global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ constraint_lower_bound_finite_val, + double *__restrict__ constraint_upper_bound_finite_val, + double *__restrict__ initial_dual_solution, + const double *__restrict__ constraint_rescaling, + const double *__restrict__ inverse_constraint_rescaling, + int num_constraints) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n_cons) + if (i >= num_constraints) return; - double inv_ei = invE[i]; - double ei = E[i]; - con_lb[i] *= inv_ei; - con_ub[i] *= inv_ei; - con_lb_finite[i] *= inv_ei; 
- con_ub_finite[i] *= inv_ei; - dual_start[i] *= ei; + double inv_ei = inverse_constraint_rescaling[i]; + double ei = constraint_rescaling[i]; + constraint_lower_bound[i] *= inv_ei; + constraint_upper_bound[i] *= inv_ei; + constraint_lower_bound_finite_val[i] *= inv_ei; + constraint_upper_bound_finite_val[i] *= inv_ei; + initial_dual_solution[i] *= ei; } -__global__ void scale_csr_nnz_kernel(const int *__restrict__ row_ids, - const int *__restrict__ col_ind, - double *__restrict__ A_vals, - double *__restrict__ At_vals, - const int *__restrict__ A_to_At, - const double *__restrict__ invD, - const double *__restrict__ invE, +__global__ void scale_csr_nnz_kernel(const int *__restrict__ constraint_row_ind, + const int *__restrict__ constraint_col_ind, + double *__restrict__ constraint_matrix_val, + double *__restrict__ constraint_matrix_transpose_val, + const int *__restrict__ constraint_to_transpose_position, + const double *__restrict__ inverse_variable_rescaling, + const double *__restrict__ inverse_constraint_rescaling, int nnz) { for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { - int i = row_ids[k]; - int j = col_ind[k]; - double scale = invD[j] * invE[i]; - A_vals[k] *= scale; - At_vals[A_to_At[k]] *= scale; + int i = constraint_row_ind[k]; + int j = constraint_col_ind[k]; + double scale = inverse_variable_rescaling[j] * inverse_constraint_rescaling[i]; + constraint_matrix_val[k] *= scale; + constraint_matrix_transpose_val[constraint_to_transpose_position[k]] *= scale; } } __global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, - const double *__restrict__ vals, + const double *__restrict__ matrix_vals, int num_rows, - double *__restrict__ out_max) + double *__restrict__ row_absmax_values) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= num_rows) @@ -400,13 +400,13 @@ __global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, double m = 0.0; for (int k = s; k < e; ++k) { - double v = fabs(vals[k]); + double v = fabs(matrix_vals[k]); if (!isfinite(v)) v = 0.0; if (v > m) m = v; } - out_max[i] = m; + row_absmax_values[i] = m; } __device__ __forceinline__ double pow_fast(double v, double p) @@ -421,10 +421,10 @@ __device__ __forceinline__ double pow_fast(double v, double p) } __global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, - const double *__restrict__ vals, + const double *__restrict__ matrix_vals, int num_rows, double degree, - double *__restrict__ out_sum) + double *__restrict__ row_powsum_values) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= num_rows) @@ -433,33 +433,33 @@ __global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, double acc = 0.0; for (int k = s; k < e; ++k) { - double v = fabs(vals[k]); + double v = fabs(matrix_vals[k]); if (!isfinite(v)) v = 0.0; acc += pow_fast(v, degree); } - out_sum[i] = acc; + row_powsum_values[i] = acc; } -__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ x, - double *__restrict__ inv_x, - double *__restrict__ cum, - int n_vars) +__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors, + double *__restrict__ inverse_scaling_factors, + double *__restrict__ cumulative_rescaling, + int num_variables) { - for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n_vars; t += blockDim.x * gridDim.x) + for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < num_variables; t += blockDim.x * gridDim.x) { - double v = x[t]; + double v = scaling_factors[t]; 
double s = sqrt(fmax(v, SCALING_EPSILON)); - cum[t] *= s; - x[t] = s; - inv_x[t] = 1.0 / s; + cumulative_rescaling[t] *= s; + scaling_factors[t] = s; + inverse_scaling_factors[t] = 1.0 / s; } } __global__ void reduce_bound_norm_sq_kernel( - const double *__restrict__ L, - const double *__restrict__ U, - int n_cons, + const double *__restrict__ constraint_lower_bound, + const double *__restrict__ constraint_upper_bound, + int num_constraints, double *__restrict__ block_sums) { extern __shared__ double sdata[]; @@ -468,9 +468,9 @@ __global__ void reduce_bound_norm_sq_kernel( int stride = blockDim.x * gridDim.x; double acc = 0.0; - for (int i = global_tid; i < n_cons; i += stride) + for (int i = global_tid; i < num_constraints; i += stride) { - double Li = L[i], Ui = U[i]; + double Li = constraint_lower_bound[i], Ui = constraint_upper_bound[i]; bool fL = isfinite(Li), fU = isfinite(Ui); if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) @@ -502,50 +502,50 @@ __global__ void reduce_bound_norm_sq_kernel( } __global__ void scale_bounds_kernel( - double *__restrict__ con_lb, - double *__restrict__ con_ub, - double *__restrict__ con_lb_finite, - double *__restrict__ con_ub_finite, - double *__restrict__ dual_start, - int n_cons, - double con_scale, - double obj_scale) + double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ constraint_lower_bound_finite_val, + double *__restrict__ constraint_upper_bound_finite_val, + double *__restrict__ initial_dual_solution, + int num_constraints, + double constraint_scale, + double objective_scale) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n_cons) + if (i >= num_constraints) return; - con_lb[i] *= con_scale; - con_ub[i] *= con_scale; - con_lb_finite[i] *= con_scale; - con_ub_finite[i] *= con_scale; - dual_start[i] *= obj_scale; + constraint_lower_bound[i] *= constraint_scale; + constraint_upper_bound[i] *= constraint_scale; + constraint_lower_bound_finite_val[i] *= constraint_scale; + constraint_upper_bound_finite_val[i] *= constraint_scale; + initial_dual_solution[i] *= objective_scale; } __global__ void scale_objective_kernel( - double *__restrict__ c, - double *__restrict__ var_lb, - double *__restrict__ var_ub, - double *__restrict__ var_lb_finite, - double *__restrict__ var_ub_finite, - double *__restrict__ primal_start, - int n_vars, - double con_scale, - double obj_scale) + double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ variable_lower_bound_finite_val, + double *__restrict__ variable_upper_bound_finite_val, + double *__restrict__ initial_primal_solution, + int num_variables, + double constraint_scale, + double objective_scale) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n_vars) + if (j >= num_variables) return; - var_lb[j] *= con_scale; - var_ub[j] *= con_scale; - var_lb_finite[j] *= con_scale; - var_ub_finite[j] *= con_scale; - c[j] *= obj_scale; - primal_start[j] *= con_scale; + variable_lower_bound[j] *= constraint_scale; + variable_upper_bound[j] *= constraint_scale; + variable_lower_bound_finite_val[j] *= constraint_scale; + variable_upper_bound_finite_val[j] *= constraint_scale; + objective_vector[j] *= objective_scale; + initial_primal_solution[j] *= constraint_scale; } -__global__ void fill_ones_kernel(double *__restrict__ x, int n_vars) +__global__ void fill_ones_kernel(double *__restrict__ x, int num_variables) { int i = blockIdx.x * blockDim.x + 
threadIdx.x;
-    if (i < n_vars)
+    if (i < num_variables)
         x[i] = 1.0;
 }

From 5ec9302c37ff022bf7623d5321fb0081d72165d8 Mon Sep 17 00:00:00 2001
From: Bo Tang
Date: Thu, 27 Nov 2025 19:27:37 -0500
Subject: [PATCH 11/13] Refactor: move finite-bound computation

---
 src/preconditioner.cu | 32 ------------------------
 src/solver.cu         | 57 +++++++++++++++++++++++++++++++------------
 2 files changed, 42 insertions(+), 47 deletions(-)

diff --git a/src/preconditioner.cu b/src/preconditioner.cu
index 1f711cf..fe087fc 100644
--- a/src/preconditioner.cu
+++ b/src/preconditioner.cu
@@ -27,16 +27,12 @@ limitations under the License.
 __global__ void scale_variables_kernel(double *__restrict__ objective_vector,
                                        double *__restrict__ variable_lower_bound,
                                        double *__restrict__ variable_upper_bound,
-                                       double *__restrict__ variable_lower_bound_finite_val,
-                                       double *__restrict__ variable_upper_bound_finite_val,
                                        double *__restrict__ initial_primal_solution,
                                        const double *__restrict__ variable_rescaling,
                                        const double *__restrict__ inverse_variable_rescaling,
                                        int num_variables);
 __global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound,
                                          double *__restrict__ constraint_upper_bound,
-                                         double *__restrict__ constraint_lower_bound_finite_val,
-                                         double *__restrict__ constraint_upper_bound_finite_val,
                                          double *__restrict__ initial_dual_solution,
                                          const double *__restrict__ constraint_rescaling,
                                          const double *__restrict__ inverse_constraint_rescaling,
@@ -70,8 +66,6 @@ __global__ void reduce_bound_norm_sq_kernel(
 __global__ void scale_bounds_kernel(
     double *__restrict__ constraint_lower_bound,
     double *__restrict__ constraint_upper_bound,
-    double *__restrict__ constraint_lower_bound_finite_val,
-    double *__restrict__ constraint_upper_bound_finite_val,
     double *__restrict__ initial_dual_solution,
     int num_constraints,
     double constraint_scale,
@@ -80,8 +74,6 @@ __global__ void scale_objective_kernel(
     double *__restrict__ objective_vector,
     double *__restrict__ variable_lower_bound,
     double *__restrict__ variable_upper_bound,
-    double *__restrict__ variable_lower_bound_finite_val,
-    double *__restrict__ variable_upper_bound_finite_val,
     double *__restrict__ initial_primal_solution,
     int num_variables,
     double constraint_scale,
@@ -108,8 +100,6 @@ static void scale_problem(
         state->objective_vector,
         state->variable_lower_bound,
         state->variable_upper_bound,
-        state->variable_lower_bound_finite_val,
-        state->variable_upper_bound_finite_val,
         state->initial_primal_solution,
         variable_rescaling,
         inverse_variable_rescaling,
@@ -118,8 +108,6 @@ static void scale_problem(
     scale_constraints_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
         state->constraint_lower_bound,
         state->constraint_upper_bound,
-        state->constraint_lower_bound_finite_val,
-        state->constraint_upper_bound_finite_val,
         state->initial_dual_solution,
         constraint_rescaling,
         inverse_constraint_rescaling,
@@ -248,8 +236,6 @@ static void bound_objective_rescaling(
     scale_bounds_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
         state->constraint_lower_bound,
         state->constraint_upper_bound,
-        state->constraint_lower_bound_finite_val,
-        state->constraint_upper_bound_finite_val,
         state->initial_dual_solution,
         num_constraints,
         constraint_scale,
@@ -259,8 +245,6 @@ static void bound_objective_rescaling(
     scale_objective_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
         state->objective_vector,
         state->variable_lower_bound,
         state->variable_upper_bound,
-        state->variable_lower_bound_finite_val,
-        state->variable_upper_bound_finite_val,
         state->initial_primal_solution,
         num_variables,
         constraint_scale,
@@ -326,8 +310,6 @@ rescale_info_t 
*rescale_problem( __global__ void scale_variables_kernel(double *__restrict__ objective_vector, double *__restrict__ variable_lower_bound, double *__restrict__ variable_upper_bound, - double *__restrict__ variable_lower_bound_finite_val, - double *__restrict__ variable_upper_bound_finite_val, double *__restrict__ initial_primal_solution, const double *__restrict__ variable_rescaling, const double *__restrict__ inverse_variable_rescaling, @@ -341,15 +323,11 @@ __global__ void scale_variables_kernel(double *__restrict__ objective_vector, objective_vector[j] *= inv_dj; variable_lower_bound[j] *= dj; variable_upper_bound[j] *= dj; - variable_lower_bound_finite_val[j] *= dj; - variable_upper_bound_finite_val[j] *= dj; initial_primal_solution[j] *= dj; } __global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound, double *__restrict__ constraint_upper_bound, - double *__restrict__ constraint_lower_bound_finite_val, - double *__restrict__ constraint_upper_bound_finite_val, double *__restrict__ initial_dual_solution, const double *__restrict__ constraint_rescaling, const double *__restrict__ inverse_constraint_rescaling, @@ -362,8 +340,6 @@ __global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_b double ei = constraint_rescaling[i]; constraint_lower_bound[i] *= inv_ei; constraint_upper_bound[i] *= inv_ei; - constraint_lower_bound_finite_val[i] *= inv_ei; - constraint_upper_bound_finite_val[i] *= inv_ei; initial_dual_solution[i] *= ei; } @@ -504,8 +480,6 @@ __global__ void reduce_bound_norm_sq_kernel( __global__ void scale_bounds_kernel( double *__restrict__ constraint_lower_bound, double *__restrict__ constraint_upper_bound, - double *__restrict__ constraint_lower_bound_finite_val, - double *__restrict__ constraint_upper_bound_finite_val, double *__restrict__ initial_dual_solution, int num_constraints, double constraint_scale, @@ -516,8 +490,6 @@ __global__ void scale_bounds_kernel( return; constraint_lower_bound[i] *= constraint_scale; constraint_upper_bound[i] *= constraint_scale; - constraint_lower_bound_finite_val[i] *= constraint_scale; - constraint_upper_bound_finite_val[i] *= constraint_scale; initial_dual_solution[i] *= objective_scale; } @@ -525,8 +497,6 @@ __global__ void scale_objective_kernel( double *__restrict__ objective_vector, double *__restrict__ variable_lower_bound, double *__restrict__ variable_upper_bound, - double *__restrict__ variable_lower_bound_finite_val, - double *__restrict__ variable_upper_bound_finite_val, double *__restrict__ initial_primal_solution, int num_variables, double constraint_scale, @@ -537,8 +507,6 @@ __global__ void scale_objective_kernel( return; variable_lower_bound[j] *= constraint_scale; variable_upper_bound[j] *= constraint_scale; - variable_lower_bound_finite_val[j] *= constraint_scale; - variable_upper_bound_finite_val[j] *= constraint_scale; objective_vector[j] *= objective_scale; initial_primal_solution[j] *= constraint_scale; } diff --git a/src/solver.cu b/src/solver.cu index c01d5b8..a869666 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -36,6 +36,11 @@ __global__ void build_transpose_map(const int *__restrict__ A_row_ind, const int *__restrict__ At_col_ind, int nnz, int *__restrict__ A_to_At); +__global__ void fill_finite_bounds_kernel(const double *__restrict__ lower_bound, + const double *__restrict__ upper_bound, + double *__restrict__ lower_bound_finite_val, + double *__restrict__ upper_bound_finite_val, + int num_elements); __global__ void 
compute_next_pdhg_primal_solution_kernel(const double *current_primal, double *reflected_primal,
                                          const double *dual_product,
@@ -289,21 +294,6 @@ static pdhg_solver_state_t *initialize_solver_state(
     ALLOC_AND_COPY(state->constraint_lower_bound, original_problem->constraint_lower_bound, con_bytes);
     ALLOC_AND_COPY(state->constraint_upper_bound, original_problem->constraint_upper_bound, con_bytes);
 
-    double *temp_host = (double *)safe_malloc(fmax(var_bytes, con_bytes));
-    for (int i = 0; i < n_cons; ++i)
-        temp_host[i] = isfinite(original_problem->constraint_lower_bound[i]) ? original_problem->constraint_lower_bound[i] : 0.0;
-    ALLOC_AND_COPY(state->constraint_lower_bound_finite_val, temp_host, con_bytes);
-    for (int i = 0; i < n_cons; ++i)
-        temp_host[i] = isfinite(original_problem->constraint_upper_bound[i]) ? original_problem->constraint_upper_bound[i] : 0.0;
-    ALLOC_AND_COPY(state->constraint_upper_bound_finite_val, temp_host, con_bytes);
-    for (int i = 0; i < n_vars; ++i)
-        temp_host[i] = isfinite(original_problem->variable_lower_bound[i]) ? original_problem->variable_lower_bound[i] : 0.0;
-    ALLOC_AND_COPY(state->variable_lower_bound_finite_val, temp_host, var_bytes);
-    for (int i = 0; i < n_vars; ++i)
-        temp_host[i] = isfinite(original_problem->variable_upper_bound[i]) ? original_problem->variable_upper_bound[i] : 0.0;
-    ALLOC_AND_COPY(state->variable_upper_bound_finite_val, temp_host, var_bytes);
-    free(temp_host);
-
 #define ALLOC_ZERO(dest, bytes)           \
     CUDA_CHECK(cudaMalloc(&dest, bytes)); \
     CUDA_CHECK(cudaMemset(dest, 0, bytes));
@@ -351,6 +341,25 @@ static pdhg_solver_state_t *initialize_solver_state(
     rescale_info->var_rescale = NULL;
     rescale_info_free(rescale_info);
 
+    CUDA_CHECK(cudaMalloc(&state->constraint_lower_bound_finite_val, con_bytes));
+    CUDA_CHECK(cudaMalloc(&state->constraint_upper_bound_finite_val, con_bytes));
+    CUDA_CHECK(cudaMalloc(&state->variable_lower_bound_finite_val, var_bytes));
+    CUDA_CHECK(cudaMalloc(&state->variable_upper_bound_finite_val, var_bytes));
+
+    fill_finite_bounds_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        state->constraint_lower_bound,
+        state->constraint_upper_bound,
+        state->constraint_lower_bound_finite_val,
+        state->constraint_upper_bound_finite_val,
+        n_cons);
+
+    fill_finite_bounds_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+        state->variable_lower_bound,
+        state->variable_upper_bound,
+        state->variable_lower_bound_finite_val,
+        state->variable_upper_bound_finite_val,
+        n_vars);
+
     CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind));
     state->constraint_matrix->row_ind = NULL;
     CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind));
@@ -481,6 +490,24 @@ __global__ void build_transpose_map(
     A_to_At[k] = pos;
 }
 
+__global__ void fill_finite_bounds_kernel(
+    const double *__restrict__ lb,
+    const double *__restrict__ ub,
+    double *__restrict__ lb_finite,
+    double *__restrict__ ub_finite,
+    int num_elements)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= num_elements)
+        return;
+
+    double Li = lb[i];
+    double Ui = ub[i];
+
+    lb_finite[i] = isfinite(Li) ? Li : 0.0;
+    ub_finite[i] = isfinite(Ui) ? 
Ui : 0.0; +} + __global__ void compute_next_pdhg_primal_solution_kernel( const double *current_primal, double *reflected_primal, const double *dual_product, const double *objective, const double *var_lb, From 61e88bd02bd34d0c3d2da0d2500f7e4441d263e7 Mon Sep 17 00:00:00 2001 From: LucasBoTang Date: Fri, 28 Nov 2025 22:47:19 -0500 Subject: [PATCH 12/13] Numerical issue: back to old version --- src/preconditioner.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/preconditioner.cu b/src/preconditioner.cu index fe087fc..0151334 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -22,7 +22,7 @@ limitations under the License. #include #include -#define SCALING_EPSILON 1e-8 +#define SCALING_EPSILON 1e-12 __global__ void scale_variables_kernel(double *__restrict__ objective_vector, double *__restrict__ variable_lower_bound, @@ -425,7 +425,7 @@ __global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < num_variables; t += blockDim.x * gridDim.x) { double v = scaling_factors[t]; - double s = sqrt(fmax(v, SCALING_EPSILON)); + double s = (v < SCALING_EPSILON) ? 1.0 : sqrt(v); cumulative_rescaling[t] *= s; scaling_factors[t] = s; inverse_scaling_factors[t] = 1.0 / s; From 51685beaf9fc7a52a1ab9ca63e706eac1714c969 Mon Sep 17 00:00:00 2001 From: LucasBoTang Date: Fri, 28 Nov 2025 22:55:19 -0500 Subject: [PATCH 13/13] New ver: 0.1.5 --- CMakeLists.txt | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ccc7ac..7910540 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(cupdlpx LANGUAGES C CXX CUDA) set(CUPDLPX_VERSION_MAJOR 0) set(CUPDLPX_VERSION_MINOR 1) -set(CUPDLPX_VERSION_PATCH 4) +set(CUPDLPX_VERSION_PATCH 5) set(CUPDLPX_VERSION "${CUPDLPX_VERSION_MAJOR}.${CUPDLPX_VERSION_MINOR}.${CUPDLPX_VERSION_PATCH}") add_compile_definitions(CUPDLPX_VERSION="${CUPDLPX_VERSION}") diff --git a/pyproject.toml b/pyproject.toml index f399ca1..4a4626d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "cupdlpx" -version = "0.1.4" +version = "0.1.5" description = "Python bindings for cuPDLPx (GPU-accelerated first-order LP solver)" readme = "README.md" license = { text = "Apache-2.0" }
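
Note on the SCALING_EPSILON change in PATCH 12/13: the reverted clamp changes how empty or near-empty rows are scaled in clamp_sqrt_and_accum_kernel. With the clamped form sqrt(fmax(v, eps)) and eps = 1e-8, a row whose absolute maximum is 0.0 gets a scaling factor of 1e-4, so its inverse scaling factor blows up to 1e4; with the guarded form (v < eps) ? 1.0 : sqrt(v), such a row is simply left unscaled. A minimal host-side C sketch of the two behaviors follows (illustration only, not part of the patches above; the helper names are made up for this example):

    #include <math.h>
    #include <stdio.h>

    /* pre-revert behavior: clamp the row statistic to epsilon, then take the square root */
    static double scale_clamped(double v, double eps) { return sqrt(fmax(v, eps)); }

    /* post-revert behavior: leave degenerate rows unscaled */
    static double scale_guarded(double v, double eps) { return (v < eps) ? 1.0 : sqrt(v); }

    int main(void)
    {
        double row_absmax = 0.0; /* e.g. a constraint row with no nonzero coefficients */
        printf("clamped: s = %g, 1/s = %g\n",
               scale_clamped(row_absmax, 1e-8), 1.0 / scale_clamped(row_absmax, 1e-8));
        printf("guarded: s = %g, 1/s = %g\n",
               scale_guarded(row_absmax, 1e-12), 1.0 / scale_guarded(row_absmax, 1e-12));
        return 0;
    }

The clamped variant prints s = 0.0001 and 1/s = 10000 (a large, artificial rescaling of an empty row), while the guarded variant prints s = 1 and 1/s = 1, which is the behavior the revert restores.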