diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ccc7ac..7910540 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(cupdlpx LANGUAGES C CXX CUDA)
 
 set(CUPDLPX_VERSION_MAJOR 0)
 set(CUPDLPX_VERSION_MINOR 1)
-set(CUPDLPX_VERSION_PATCH 4)
+set(CUPDLPX_VERSION_PATCH 5)
 
 set(CUPDLPX_VERSION "${CUPDLPX_VERSION_MAJOR}.${CUPDLPX_VERSION_MINOR}.${CUPDLPX_VERSION_PATCH}")
 add_compile_definitions(CUPDLPX_VERSION="${CUPDLPX_VERSION}")
diff --git a/internal/internal_types.h b/internal/internal_types.h
index 1f42582..aa69005 100644
--- a/internal/internal_types.h
+++ b/internal/internal_types.h
@@ -28,7 +28,9 @@ typedef struct
 	int num_nonzeros;
 	int *row_ptr;
 	int *col_ind;
+	int *row_ind;
 	double *val;
+	int *transpose_map;
 } cu_sparse_matrix_csr_t;
 
 typedef struct
@@ -46,6 +48,7 @@ typedef struct
 	int num_blocks_primal;
 	int num_blocks_dual;
 	int num_blocks_primal_dual;
+	int num_blocks_nnz;
 	double objective_vector_norm;
 	double constraint_bound_norm;
 	double *constraint_lower_bound_finite_val;
@@ -129,7 +132,6 @@ typedef struct
 
 typedef struct
 {
-	lp_problem_t *scaled_problem;
 	double *con_rescale;
 	double *var_rescale;
 	double con_bound_rescale;
diff --git a/internal/preconditioner.h b/internal/preconditioner.h
index 8feb664..71a5c2b 100644
--- a/internal/preconditioner.h
+++ b/internal/preconditioner.h
@@ -25,7 +25,7 @@ extern "C"
 
     rescale_info_t *rescale_problem(
         const pdhg_parameters_t *params,
-        const lp_problem_t *original_problem);
+        pdhg_solver_state_t *state);
 
 #ifdef __cplusplus
 }
diff --git a/pyproject.toml b/pyproject.toml
index 0596175..4a4626d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "cupdlpx"
-version = "0.1.3"
+version = "0.1.5"
 description = "Python bindings for cuPDLPx (GPU-accelerated first-order LP solver)"
 readme = "README.md"
 license = { text = "Apache-2.0" }
@@ -25,7 +25,7 @@ wheel.packages = ["python/cupdlpx"]
 sdist.include = ["tests/**", "pyproject.toml", "README.md", "LICENSE"]
 
 [tool.scikit-build.cmake.define]
-CMAKE_CUDA_ARCHITECTURES = "75;80;86;89;90;90-virtual"
+CMAKE_CUDA_ARCHITECTURES = "all"
 CMAKE_CUDA_STANDARD = "17"
 CUPDLPX_BUILD_PYTHON = "ON"
 CUPDLPX_BUILD_STATIC_LIB = "ON"
diff --git a/src/cli.c b/src/cli.c
index b9588ad..b4253d6 100644
--- a/src/cli.c
+++ b/src/cli.c
@@ -123,6 +123,7 @@ void save_solver_summary(const cupdlpx_result_t *result, const char *output_dir,
     }
     fprintf(outfile, "Termination Reason: %s\n",
             termination_reason_tToString(result->termination_reason));
+    fprintf(outfile, "Precondition Time (sec): %e\n", result->rescaling_time_sec);
     fprintf(outfile, "Runtime (sec): %e\n", result->cumulative_time_sec);
     fprintf(outfile, "Iterations Count: %d\n", result->total_count);
     fprintf(outfile, "Primal Objective Value: %e\n",
diff --git a/src/preconditioner.c b/src/preconditioner.c
deleted file mode 100644
index 41b7542..0000000
--- a/src/preconditioner.c
+++ /dev/null
@@ -1,306 +0,0 @@
-/*
-Copyright 2025 Haihao Lu
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-#include "preconditioner.h"
-#include "utils.h"
-#include <math.h>
-#include <stdio.h>
-#include <string.h>
-#include <time.h>
-
-#define SCALING_EPSILON 1e-12
-
-static lp_problem_t *deepcopy_problem(const lp_problem_t *prob);
-static void scale_problem(lp_problem_t *problem, const double *con_rescale,
-                          const double *var_rescale);
-static void ruiz_rescaling(lp_problem_t *problem, int num_iters,
-                           double *cum_con_rescale, double *cum_var_rescale);
-static void pock_chambolle_rescaling(lp_problem_t *problem, double alpha,
-                                     double *cum_con_rescale,
-                                     double *cum_var_rescale);
-
-static lp_problem_t *deepcopy_problem(const lp_problem_t *prob)
-{
-    lp_problem_t *new_prob = (lp_problem_t *)safe_malloc(sizeof(lp_problem_t));
-
-    new_prob->num_variables = prob->num_variables;
-    new_prob->num_constraints = prob->num_constraints;
-    new_prob->constraint_matrix_num_nonzeros =
-        prob->constraint_matrix_num_nonzeros;
-    new_prob->objective_constant = prob->objective_constant;
-
-    size_t var_bytes = prob->num_variables * sizeof(double);
-    size_t con_bytes = prob->num_constraints * sizeof(double);
-    size_t nnz_bytes_val = prob->constraint_matrix_num_nonzeros * sizeof(double);
-    size_t nnz_bytes_col = prob->constraint_matrix_num_nonzeros * sizeof(int);
-    size_t row_ptr_bytes = (prob->num_constraints + 1) * sizeof(int);
-
-    new_prob->variable_lower_bound = safe_malloc(var_bytes);
-    new_prob->variable_upper_bound = safe_malloc(var_bytes);
-    new_prob->objective_vector = safe_malloc(var_bytes);
-    new_prob->constraint_lower_bound = safe_malloc(con_bytes);
-    new_prob->constraint_upper_bound = safe_malloc(con_bytes);
-    new_prob->constraint_matrix_row_pointers = safe_malloc(row_ptr_bytes);
-    new_prob->constraint_matrix_col_indices = safe_malloc(nnz_bytes_col);
-    new_prob->constraint_matrix_values = safe_malloc(nnz_bytes_val);
-
-    memcpy(new_prob->variable_lower_bound, prob->variable_lower_bound, var_bytes);
-    memcpy(new_prob->variable_upper_bound, prob->variable_upper_bound, var_bytes);
-    memcpy(new_prob->objective_vector, prob->objective_vector, var_bytes);
-    memcpy(new_prob->constraint_lower_bound, prob->constraint_lower_bound,
-           con_bytes);
-    memcpy(new_prob->constraint_upper_bound, prob->constraint_upper_bound,
-           con_bytes);
-    memcpy(new_prob->constraint_matrix_row_pointers,
-           prob->constraint_matrix_row_pointers, row_ptr_bytes);
-    memcpy(new_prob->constraint_matrix_col_indices,
-           prob->constraint_matrix_col_indices, nnz_bytes_col);
-    memcpy(new_prob->constraint_matrix_values, prob->constraint_matrix_values,
-           nnz_bytes_val);
-
-    if (prob->primal_start)
-    {
-        new_prob->primal_start = safe_malloc(var_bytes);
-        memcpy(new_prob->primal_start, prob->primal_start, var_bytes);
-    }
-    else
-    {
-        new_prob->primal_start = NULL;
-    }
-    if (prob->dual_start)
-    {
-        new_prob->dual_start = safe_malloc(con_bytes);
-        memcpy(new_prob->dual_start, prob->dual_start, con_bytes);
-    }
-    else
-    {
-        new_prob->dual_start = NULL;
-    }
-
-    return new_prob;
-}
-
-static void scale_problem(lp_problem_t *problem,
-                          const double *constraint_rescaling,
-                          const double *variable_rescaling)
-{
-    for (int i = 0; i < problem->num_variables; ++i)
-    {
-        problem->objective_vector[i] /= variable_rescaling[i];
-        problem->variable_upper_bound[i] *= variable_rescaling[i];
-        problem->variable_lower_bound[i] *= variable_rescaling[i];
-    }
-    for (int i = 0; i < problem->num_constraints; ++i)
-    {
-        problem->constraint_lower_bound[i] /= constraint_rescaling[i];
-        problem->constraint_upper_bound[i] /= constraint_rescaling[i];
-    }
-
-    for (int row = 0; row < problem->num_constraints; ++row)
-    {
-        for (int nz_idx = problem->constraint_matrix_row_pointers[row];
-             nz_idx < problem->constraint_matrix_row_pointers[row + 1]; ++nz_idx)
-        {
-            int col = problem->constraint_matrix_col_indices[nz_idx];
-            problem->constraint_matrix_values[nz_idx] /=
-                (constraint_rescaling[row] * variable_rescaling[col]);
-        }
-    }
-}
-
-static void ruiz_rescaling(lp_problem_t *problem, int num_iterations,
-                           double *cum_constraint_rescaling,
-                           double *cum_variable_rescaling)
-{
-    int num_cons = problem->num_constraints;
-    int num_vars = problem->num_variables;
-    double *con_rescale = safe_malloc(num_cons * sizeof(double));
-    double *var_rescale = safe_malloc(num_vars * sizeof(double));
-
-    for (int iter = 0; iter < num_iterations; ++iter)
-    {
-        for (int i = 0; i < num_vars; ++i)
-            var_rescale[i] = 0.0;
-        for (int i = 0; i < num_cons; ++i)
-            con_rescale[i] = 0.0;
-
-        for (int row = 0; row < num_cons; ++row)
-        {
-            for (int nz_idx = problem->constraint_matrix_row_pointers[row];
-                 nz_idx < problem->constraint_matrix_row_pointers[row + 1];
-                 ++nz_idx)
-            {
-                int col = problem->constraint_matrix_col_indices[nz_idx];
-                if (col < 0 || col >= num_vars)
-                {
-                    fprintf(stderr,
-                            "Error: Invalid column index %d at nz_idx %d for row %d. "
-                            "Must be in [0, %d).\n",
-                            col, nz_idx, row, num_vars);
-                }
-                double val = fabs(problem->constraint_matrix_values[nz_idx]);
-                if (val > var_rescale[col])
-                    var_rescale[col] = val;
-                if (val > con_rescale[row])
-                    con_rescale[row] = val;
-            }
-        }
-
-        for (int i = 0; i < num_vars; ++i)
-            var_rescale[i] =
-                (var_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(var_rescale[i]);
-        for (int i = 0; i < num_cons; ++i)
-            con_rescale[i] =
-                (con_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(con_rescale[i]);
-
-        scale_problem(problem, con_rescale, var_rescale);
-        for (int i = 0; i < num_vars; ++i)
-            cum_variable_rescaling[i] *= var_rescale[i];
-        for (int i = 0; i < num_cons; ++i)
-            cum_constraint_rescaling[i] *= con_rescale[i];
-    }
-    free(con_rescale);
-    free(var_rescale);
-}
-
-static void pock_chambolle_rescaling(lp_problem_t *problem, double alpha,
-                                     double *cum_constraint_rescaling,
-                                     double *cum_variable_rescaling)
-{
-    int num_cons = problem->num_constraints;
-    int num_vars = problem->num_variables;
-    double *con_rescale = safe_calloc(num_cons, sizeof(double));
-    double *var_rescale = safe_calloc(num_vars, sizeof(double));
-
-    for (int row = 0; row < num_cons; ++row)
-    {
-        for (int nz_idx = problem->constraint_matrix_row_pointers[row];
-             nz_idx < problem->constraint_matrix_row_pointers[row + 1]; ++nz_idx)
-        {
-            int col = problem->constraint_matrix_col_indices[nz_idx];
-            double val = fabs(problem->constraint_matrix_values[nz_idx]);
-            var_rescale[col] += pow(val, 2.0 - alpha);
-            con_rescale[row] += pow(val, alpha);
-        }
-    }
-
-    for (int i = 0; i < num_vars; ++i)
-        var_rescale[i] =
-            (var_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(var_rescale[i]);
-    for (int i = 0; i < num_cons; ++i)
-        con_rescale[i] =
-            (con_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(con_rescale[i]);
-
-    scale_problem(problem, con_rescale, var_rescale);
-    for (int i = 0; i < num_vars; ++i)
-        cum_variable_rescaling[i] *= var_rescale[i];
-    for (int i = 0; i < num_cons; ++i)
-        cum_constraint_rescaling[i] *= con_rescale[i];
-
-    free(con_rescale);
-    free(var_rescale);
-}
-
-rescale_info_t *rescale_problem(const pdhg_parameters_t *params,
-                                const lp_problem_t *original_problem)
-{
-    clock_t start_rescaling = clock();
-    rescale_info_t *rescale_info =
-        (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t));
-    rescale_info->scaled_problem = deepcopy_problem(original_problem);
-    if (rescale_info->scaled_problem == NULL)
-    {
-        fprintf(stderr,
-                "Failed to create a copy of the problem. Aborting rescale.\n");
-        return NULL;
-    }
-    int num_cons = original_problem->num_constraints;
-    int num_vars = original_problem->num_variables;
-
-    rescale_info->con_rescale = safe_malloc(num_cons * sizeof(double));
-    rescale_info->var_rescale = safe_malloc(num_vars * sizeof(double));
-    for (int i = 0; i < num_cons; ++i)
-        rescale_info->con_rescale[i] = 1.0;
-    for (int i = 0; i < num_vars; ++i)
-        rescale_info->var_rescale[i] = 1.0;
-    if (params->l_inf_ruiz_iterations > 0)
-    {
-        ruiz_rescaling(rescale_info->scaled_problem, params->l_inf_ruiz_iterations,
-                       rescale_info->con_rescale, rescale_info->var_rescale);
-    }
-    if (params->has_pock_chambolle_alpha)
-    {
-        pock_chambolle_rescaling(
-            rescale_info->scaled_problem, params->pock_chambolle_alpha,
-            rescale_info->con_rescale, rescale_info->var_rescale);
-    }
-    if (params->bound_objective_rescaling)
-    {
-        double bound_norm_sq = 0.0;
-        for (int i = 0; i < num_cons; ++i)
-        {
-            if (isfinite(rescale_info->scaled_problem->constraint_lower_bound[i]) &&
-                (rescale_info->scaled_problem->constraint_lower_bound[i] !=
-                 rescale_info->scaled_problem->constraint_upper_bound[i]))
-            {
-                bound_norm_sq +=
-                    rescale_info->scaled_problem->constraint_lower_bound[i] *
-                    rescale_info->scaled_problem->constraint_lower_bound[i];
-            }
-            if (isfinite(rescale_info->scaled_problem->constraint_upper_bound[i]))
-            {
-                bound_norm_sq +=
-                    rescale_info->scaled_problem->constraint_upper_bound[i] *
-                    rescale_info->scaled_problem->constraint_upper_bound[i];
-            }
-        }
-
-        double obj_norm_sq = 0.0;
-        for (int i = 0; i < num_vars; ++i)
-        {
-            obj_norm_sq += rescale_info->scaled_problem->objective_vector[i] *
-                           rescale_info->scaled_problem->objective_vector[i];
-        }
-
-        rescale_info->con_bound_rescale = 1.0 / (sqrt(bound_norm_sq) + 1.0);
-        rescale_info->obj_vec_rescale = 1.0 / (sqrt(obj_norm_sq) + 1.0);
-
-        for (int i = 0; i < num_cons; ++i)
-        {
-            rescale_info->scaled_problem->constraint_lower_bound[i] *=
-                rescale_info->con_bound_rescale;
-            rescale_info->scaled_problem->constraint_upper_bound[i] *=
-                rescale_info->con_bound_rescale;
-        }
-        for (int i = 0; i < num_vars; ++i)
-        {
-            rescale_info->scaled_problem->variable_lower_bound[i] *=
-                rescale_info->con_bound_rescale;
-            rescale_info->scaled_problem->variable_upper_bound[i] *=
-                rescale_info->con_bound_rescale;
-            rescale_info->scaled_problem->objective_vector[i] *=
-                rescale_info->obj_vec_rescale;
-        }
-    }
-    else
-    {
-        rescale_info->con_bound_rescale = 1.0;
-        rescale_info->obj_vec_rescale = 1.0;
-    }
-    rescale_info->rescaling_time_sec =
-        (double)(clock() - start_rescaling) / CLOCKS_PER_SEC;
-    return rescale_info;
-}
\ No newline at end of file
diff --git a/src/preconditioner.cu b/src/preconditioner.cu
new file mode 100644
index 0000000..0151334
--- /dev/null
+++ b/src/preconditioner.cu
@@ -0,0 +1,519 @@
+/*
+Copyright 2025 Haihao Lu
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "preconditioner.h"
+#include "utils.h"
+#include <time.h>
+#include <stdio.h>
+#include <math.h>
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+
+#define SCALING_EPSILON 1e-12
+
+__global__ void scale_variables_kernel(double *__restrict__ objective_vector,
+                                       double *__restrict__ variable_lower_bound,
+                                       double *__restrict__ variable_upper_bound,
+                                       double *__restrict__ initial_primal_solution,
+                                       const double *__restrict__ variable_rescaling,
+                                       const double *__restrict__ inverse_variable_rescaling,
+                                       int num_variables);
+__global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound,
+                                         double *__restrict__ constraint_upper_bound,
+                                         double *__restrict__ initial_dual_solution,
+                                         const double *__restrict__ constraint_rescaling,
+                                         const double *__restrict__ inverse_constraint_rescaling,
+                                         int num_constraints);
+__global__ void scale_csr_nnz_kernel(const int *__restrict__ constraint_row_ind,
+                                     const int *__restrict__ constraint_col_ind,
+                                     double *__restrict__ constraint_matrix_val,
+                                     double *__restrict__ constraint_matrix_transpose_val,
+                                     const int *__restrict__ constraint_to_transpose_position,
+                                     const double *__restrict__ inverse_variable_rescaling,
+                                     const double *__restrict__ inverse_constraint_rescaling,
+                                     int nnz);
+__global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr,
+                                              const double *__restrict__ matrix_vals,
+                                              int num_rows,
+                                              double *__restrict__ row_absmax_values);
+__global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr,
+                                              const double *__restrict__ matrix_vals,
+                                              int num_rows,
+                                              double degree,
+                                              double *__restrict__ row_powsum_values);
+__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors,
+                                            double *__restrict__ inverse_scaling_factors,
+                                            double *__restrict__ cumulative_rescaling,
+                                            int num_variables);
+__global__ void reduce_bound_norm_sq_kernel(
+    const double *__restrict__ constraint_lower_bound,
+    const double *__restrict__ constraint_upper_bound,
+    int num_constraints,
+    double *__restrict__ block_sums);
+__global__ void scale_bounds_kernel(
+    double *__restrict__ constraint_lower_bound,
+    double *__restrict__ constraint_upper_bound,
+    double *__restrict__ initial_dual_solution,
+    int num_constraints,
+    double constraint_scale,
+    double objective_scale);
+__global__ void scale_objective_kernel(
+    double *__restrict__ objective_vector,
+    double *__restrict__ variable_lower_bound,
+    double *__restrict__ variable_upper_bound,
+    double *__restrict__ initial_primal_solution,
+    int num_variables,
+    double constraint_scale,
+    double objective_scale);
+__global__ void fill_ones_kernel(double *__restrict__ x, int num_variables);
+static void scale_problem(pdhg_solver_state_t *state, double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling);
+static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info,
+                           double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling);
+static void pock_chambolle_rescaling(pdhg_solver_state_t *state, const double alpha, rescale_info_t *rescale_info,
+                                     double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling);
+static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info);
+
+static void scale_problem(
+    pdhg_solver_state_t *state,
+    double *constraint_rescaling,
+    double *variable_rescaling,
+    double *inverse_constraint_rescaling,
+    double *inverse_variable_rescaling)
+{
+    int num_variables = state->num_variables;
+    int num_constraints = state->num_constraints;
+
+    scale_variables_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+        state->objective_vector,
+        state->variable_lower_bound,
+        state->variable_upper_bound,
+        state->initial_primal_solution,
+        variable_rescaling,
+        inverse_variable_rescaling,
+        num_variables);
+
+    scale_constraints_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        state->constraint_lower_bound,
+        state->constraint_upper_bound,
+        state->initial_dual_solution,
+        constraint_rescaling,
+        inverse_constraint_rescaling,
+        num_constraints);
+
+    scale_csr_nnz_kernel<<<state->num_blocks_nnz, THREADS_PER_BLOCK>>>(
+        state->constraint_matrix->row_ind,
+        state->constraint_matrix->col_ind,
+        state->constraint_matrix->val,
+        state->constraint_matrix_t->val,
+        state->constraint_matrix->transpose_map,
+        inverse_variable_rescaling,
+        inverse_constraint_rescaling,
+        state->constraint_matrix->num_nonzeros);
+}
+
+static void ruiz_rescaling(
+    pdhg_solver_state_t *state,
+    int num_iterations,
+    rescale_info_t *rescale_info,
+    double *constraint_rescaling,
+    double *variable_rescaling,
+    double *inverse_constraint_rescaling,
+    double *inverse_variable_rescaling)
+{
+    const int num_constraints = state->num_constraints;
+    const int num_variables = state->num_variables;
+
+    for (int iter = 0; iter < num_iterations; ++iter)
+    {
+        compute_csr_row_absmax_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+            state->constraint_matrix->row_ptr,
+            state->constraint_matrix->val,
+            num_constraints,
+            constraint_rescaling);
+        clamp_sqrt_and_accum_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+            constraint_rescaling,
+            inverse_constraint_rescaling,
+            rescale_info->con_rescale,
+            num_constraints);
+
+        compute_csr_row_absmax_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+            state->constraint_matrix_t->row_ptr,
+            state->constraint_matrix_t->val,
+            num_variables,
+            variable_rescaling);
+        clamp_sqrt_and_accum_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+            variable_rescaling,
+            inverse_variable_rescaling,
+            rescale_info->var_rescale,
+            num_variables);
+
+        scale_problem(state, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling);
+    }
+}
+
+static void pock_chambolle_rescaling(
+    pdhg_solver_state_t *state,
+    const double alpha,
+    rescale_info_t *rescale_info,
+    double *constraint_rescaling,
+    double *variable_rescaling,
+    double *inverse_constraint_rescaling,
+    double *inverse_variable_rescaling)
+{
+    const int num_constraints = state->num_constraints;
+    const int num_variables = state->num_variables;
+
+    compute_csr_row_powsum_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        state->constraint_matrix->row_ptr,
+        state->constraint_matrix->val,
+        num_constraints,
+        alpha,
+        constraint_rescaling);
+    clamp_sqrt_and_accum_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        constraint_rescaling,
+        inverse_constraint_rescaling,
+        rescale_info->con_rescale,
+        num_constraints);
+
+    compute_csr_row_powsum_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+        state->constraint_matrix_t->row_ptr,
+        state->constraint_matrix_t->val,
+        num_variables,
+        2.0 - alpha,
+        variable_rescaling);
+    clamp_sqrt_and_accum_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+        variable_rescaling,
+        inverse_variable_rescaling,
+        rescale_info->var_rescale,
+        num_variables);
+
+    scale_problem(state, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling);
+}
+
+static void bound_objective_rescaling(
+    pdhg_solver_state_t *state,
+    rescale_info_t *rescale_info)
+{
+    const int num_constraints = state->num_constraints;
+    const int num_variables = state->num_variables;
+
+    double *bnd_norm_sq_d = NULL;
+    CUDA_CHECK(cudaMalloc(&bnd_norm_sq_d, sizeof(double)));
+    CUDA_CHECK(cudaMemset(bnd_norm_sq_d, 0, sizeof(double)));
+    reduce_bound_norm_sq_kernel<<<1, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(double)>>>(
+        state->constraint_lower_bound,
+        state->constraint_upper_bound,
+        num_constraints,
+        bnd_norm_sq_d);
+
+    double bnd_norm_sq_h = 0.0;
+    CUDA_CHECK(cudaMemcpy(&bnd_norm_sq_h, bnd_norm_sq_d, sizeof(double), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaFree(bnd_norm_sq_d));
+    double bnd_norm = sqrt(bnd_norm_sq_h);
+
+    double obj_norm = 0.0;
+    CUBLAS_CHECK(cublasDnrm2(state->blas_handle,
+                             state->num_variables,
+                             state->objective_vector, 1,
+                             &obj_norm));
+
+    double constraint_scale = 1.0 / (bnd_norm + 1.0);
+    double objective_scale = 1.0 / (obj_norm + 1.0);
+
+    scale_bounds_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        state->constraint_lower_bound,
+        state->constraint_upper_bound,
+        state->initial_dual_solution,
+        num_constraints,
+        constraint_scale,
+        objective_scale);
+
+    scale_objective_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+        state->objective_vector,
+        state->variable_lower_bound,
+        state->variable_upper_bound,
+        state->initial_primal_solution,
+        num_variables,
+        constraint_scale,
+        objective_scale);
+
+    rescale_info->con_bound_rescale = constraint_scale;
+    rescale_info->obj_vec_rescale = objective_scale;
+}
+
+rescale_info_t *rescale_problem(
+    const pdhg_parameters_t *params,
+    pdhg_solver_state_t *state)
+{
+    printf("[Precondition] Start\n");
+
+    int num_variables = state->num_variables;
+    int num_constraints = state->num_constraints;
+
+    clock_t start_rescaling = clock();
+    rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t));
+    CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, num_constraints * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, num_variables * sizeof(double)));
+    fill_ones_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        rescale_info->con_rescale, num_constraints);
+    fill_ones_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+        rescale_info->var_rescale, num_variables);
+
+    double *constraint_rescaling = NULL, *variable_rescaling = NULL, *inverse_constraint_rescaling = NULL, *inverse_variable_rescaling = NULL;
+    CUDA_CHECK(cudaMalloc(&constraint_rescaling, num_constraints * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&variable_rescaling, num_variables * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&inverse_constraint_rescaling, num_constraints * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&inverse_variable_rescaling, num_variables * sizeof(double)));
+
+    if (params->l_inf_ruiz_iterations > 0)
+    {
+        printf("[Precondition] Ruiz scaling (%d iterations)\n", params->l_inf_ruiz_iterations);
+        ruiz_rescaling(state, params->l_inf_ruiz_iterations, rescale_info, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling);
+    }
+    if (params->has_pock_chambolle_alpha)
+    {
+        printf("[Precondition] Pock-Chambolle scaling (alpha=%.4f)\n", params->pock_chambolle_alpha);
+        pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling);
+    }
+
+    rescale_info->con_bound_rescale = 1.0;
+    rescale_info->obj_vec_rescale = 1.0;
+    if (params->bound_objective_rescaling)
+    {
+        printf("[Precondition] Bound-objective scaling\n");
+        bound_objective_rescaling(state, rescale_info);
+    }
+
+    rescale_info->rescaling_time_sec = (double)(clock() - start_rescaling) / CLOCKS_PER_SEC;
+
+    CUDA_CHECK(cudaFree(constraint_rescaling));
+    CUDA_CHECK(cudaFree(variable_rescaling));
+    CUDA_CHECK(cudaFree(inverse_constraint_rescaling));
+    CUDA_CHECK(cudaFree(inverse_variable_rescaling));
+
+    return rescale_info;
+}
+
+__global__ void scale_variables_kernel(double *__restrict__ objective_vector,
+                                       double *__restrict__ variable_lower_bound,
+                                       double *__restrict__ variable_upper_bound,
+                                       double *__restrict__ initial_primal_solution,
+                                       const double *__restrict__ variable_rescaling,
+                                       const double *__restrict__ inverse_variable_rescaling,
+                                       int num_variables)
+{
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= num_variables)
+        return;
+    double dj = variable_rescaling[j];
+    double inv_dj = inverse_variable_rescaling[j];
+    objective_vector[j] *= inv_dj;
+    variable_lower_bound[j] *= dj;
+    variable_upper_bound[j] *= dj;
+    initial_primal_solution[j] *= dj;
+}
+
+__global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound,
+                                         double *__restrict__ constraint_upper_bound,
+                                         double *__restrict__ initial_dual_solution,
+                                         const double *__restrict__ constraint_rescaling,
+                                         const double *__restrict__ inverse_constraint_rescaling,
+                                         int num_constraints)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= num_constraints)
+        return;
+    double inv_ei = inverse_constraint_rescaling[i];
+    double ei = constraint_rescaling[i];
+    constraint_lower_bound[i] *= inv_ei;
+    constraint_upper_bound[i] *= inv_ei;
+    initial_dual_solution[i] *= ei;
+}
+
+__global__ void scale_csr_nnz_kernel(const int *__restrict__ constraint_row_ind,
+                                     const int *__restrict__ constraint_col_ind,
+                                     double *__restrict__ constraint_matrix_val,
+                                     double *__restrict__ constraint_matrix_transpose_val,
+                                     const int *__restrict__ constraint_to_transpose_position,
+                                     const double *__restrict__ inverse_variable_rescaling,
+                                     const double *__restrict__ inverse_constraint_rescaling,
+                                     int nnz)
+{
+    for (int k = blockIdx.x * blockDim.x + threadIdx.x;
+         k < nnz;
+         k += gridDim.x * blockDim.x)
+    {
+        int i = constraint_row_ind[k];
+        int j = constraint_col_ind[k];
+        double scale = inverse_variable_rescaling[j] * inverse_constraint_rescaling[i];
+        constraint_matrix_val[k] *= scale;
+        constraint_matrix_transpose_val[constraint_to_transpose_position[k]] *= scale;
+    }
+}
+
+__global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr,
+                                              const double *__restrict__ matrix_vals,
+                                              int num_rows,
+                                              double *__restrict__ row_absmax_values)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= num_rows)
+        return;
+    int s = row_ptr[i], e = row_ptr[i + 1];
+    double m = 0.0;
+    for (int k = s; k < e; ++k)
+    {
+        double v = fabs(matrix_vals[k]);
+        if (!isfinite(v))
+            v = 0.0;
+        if (v > m)
+            m = v;
+    }
+    row_absmax_values[i] = m;
+}
+
+__device__ __forceinline__ double pow_fast(double v, double p)
+{
+    if (p == 2.0)
+        return v * v;
+    if (p == 1.0)
+        return v;
+    if (p == 0.5)
+        return sqrt(v);
+    return pow(v, p);
+}
+
+__global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr,
+                                              const double *__restrict__ matrix_vals,
+                                              int num_rows,
+                                              double degree,
+                                              double *__restrict__ row_powsum_values)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= num_rows)
+        return;
+    int s = row_ptr[i], e = row_ptr[i + 1];
+    double acc = 0.0;
+    for (int k = s; k < e; ++k)
+    {
+        double v = fabs(matrix_vals[k]);
+        if (!isfinite(v))
+            v = 0.0;
+        acc += pow_fast(v, degree);
+    }
+    row_powsum_values[i] = acc;
+}
+
+__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors,
+                                            double *__restrict__ inverse_scaling_factors,
+                                            double *__restrict__ cumulative_rescaling,
+                                            int num_variables)
+{
+    for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < num_variables; t += blockDim.x * gridDim.x)
+    {
+        double v = scaling_factors[t];
+        double s = (v < SCALING_EPSILON) ? 1.0 : sqrt(v);
+        cumulative_rescaling[t] *= s;
+        scaling_factors[t] = s;
+        inverse_scaling_factors[t] = 1.0 / s;
+    }
+}
+
+__global__ void reduce_bound_norm_sq_kernel(
+    const double *__restrict__ constraint_lower_bound,
+    const double *__restrict__ constraint_upper_bound,
+    int num_constraints,
+    double *__restrict__ block_sums)
+{
+    extern __shared__ double sdata[];
+    int tid = threadIdx.x;
+    int global_tid = blockIdx.x * blockDim.x + tid;
+    int stride = blockDim.x * gridDim.x;
+
+    double acc = 0.0;
+    for (int i = global_tid; i < num_constraints; i += stride)
+    {
+        double Li = constraint_lower_bound[i], Ui = constraint_upper_bound[i];
+        bool fL = isfinite(Li), fU = isfinite(Ui);
+
+        if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON))
+        {
+            acc += Li * Li;
+        }
+        if (fU)
+        {
+            acc += Ui * Ui;
+        }
+    }
+
+    sdata[tid] = acc;
+    __syncthreads();
+
+    for (int s = blockDim.x / 2; s > 0; s >>= 1)
+    {
+        if (tid < s)
+        {
+            sdata[tid] += sdata[tid + s];
+        }
+        __syncthreads();
+    }
+
+    if (tid == 0)
+    {
+        block_sums[blockIdx.x] = sdata[0];
+    }
+}
+
+__global__ void scale_bounds_kernel(
+    double *__restrict__ constraint_lower_bound,
+    double *__restrict__ constraint_upper_bound,
+    double *__restrict__ initial_dual_solution,
+    int num_constraints,
+    double constraint_scale,
+    double objective_scale)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= num_constraints)
+        return;
+    constraint_lower_bound[i] *= constraint_scale;
+    constraint_upper_bound[i] *= constraint_scale;
+    initial_dual_solution[i] *= objective_scale;
+}
+
+__global__ void scale_objective_kernel(
+    double *__restrict__ objective_vector,
+    double *__restrict__ variable_lower_bound,
+    double *__restrict__ variable_upper_bound,
+    double *__restrict__ initial_primal_solution,
+    int num_variables,
+    double constraint_scale,
+    double objective_scale)
+{
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= num_variables)
+        return;
+    variable_lower_bound[j] *= constraint_scale;
+    variable_upper_bound[j] *= constraint_scale;
+    objective_vector[j] *= objective_scale;
+    initial_primal_solution[j] *= constraint_scale;
+}
+
+__global__ void fill_ones_kernel(double *__restrict__ x, int num_variables)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < num_variables)
+        x[i] = 1.0;
+}
diff --git a/src/solver.cu b/src/solver.cu
index a9e5f2b..8a3cdea 100644
--- a/src/solver.cu
+++ b/src/solver.cu
@@ -27,28 +27,63 @@ limitations under the License.
 #include <stdio.h>
 #include <time.h>
 
-__global__ void compute_next_pdhg_primal_solution_kernel(
-    const double *current_primal, double *reflected_primal,
-    const double *dual_product, const double *objective, const double *var_lb,
-    const double *var_ub, int n, double step_size);
-__global__ void compute_next_pdhg_primal_solution_major_kernel(
-    const double *current_primal, double *pdhg_primal, double *reflected_primal,
-    const double *dual_product, const double *objective, const double *var_lb,
-    const double *var_ub, int n, double step_size, double *dual_slack);
-__global__ void compute_next_pdhg_dual_solution_kernel(
-    const double *current_dual, double *reflected_dual,
-    const double *primal_product, const double *const_lb,
-    const double *const_ub, int n, double step_size);
-__global__ void compute_next_pdhg_dual_solution_major_kernel(
-    const double *current_dual, double *pdhg_dual, double *reflected_dual,
-    const double *primal_product, const double *const_lb,
-    const double *const_ub, int n, double step_size);
-__global__ void
-halpern_update_kernel(const double *initial_primal, double *current_primal,
-                      const double *reflected_primal,
-                      const double *initial_dual, double *current_dual,
-                      const double *reflected_dual, int n_vars, int n_cons,
-                      double weight, double reflection_coeff);
+__global__ void build_row_ind(const int *__restrict__ row_ptr,
+                              int num_rows,
+                              int *__restrict__ row_ind);
+__global__ void build_transpose_map(const int *__restrict__ A_row_ind,
+                                    const int *__restrict__ A_col_ind,
+                                    const int *__restrict__ At_row_ptr,
+                                    const int *__restrict__ At_col_ind,
+                                    int nnz,
+                                    int *__restrict__ A_to_At);
+__global__ void fill_finite_bounds_kernel(const double *__restrict__ lower_bound,
+                                          const double *__restrict__ upper_bound,
+                                          double *__restrict__ lower_bound_finite_val,
+                                          double *__restrict__ upper_bound_finite_val,
+                                          int num_elements);
+__global__ void compute_next_pdhg_primal_solution_kernel(const double *current_primal,
+                                                         double *reflected_primal,
+                                                         const double *dual_product,
+                                                         const double *objective,
+                                                         const double *var_lb,
+                                                         const double *var_ub,
+                                                         int n,
+                                                         double step_size);
+__global__ void compute_next_pdhg_primal_solution_major_kernel(const double *current_primal,
+                                                               double *pdhg_primal,
+                                                               double *reflected_primal,
+                                                               const double *dual_product,
+                                                               const double *objective,
+                                                               const double *var_lb,
+                                                               const double *var_ub,
+                                                               int n,
+                                                               double step_size,
+                                                               double *dual_slack);
+__global__ void compute_next_pdhg_dual_solution_kernel(const double *current_dual,
+                                                       double *reflected_dual,
+                                                       const double *primal_product,
+                                                       const double *const_lb,
+                                                       const double *const_ub,
+                                                       int n,
+                                                       double step_size);
+__global__ void compute_next_pdhg_dual_solution_major_kernel(const double *current_dual,
+                                                             double *pdhg_dual,
+                                                             double *reflected_dual,
+                                                             const double *primal_product,
+                                                             const double *const_lb,
+                                                             const double *const_ub,
+                                                             int n,
+                                                             double step_size);
+__global__ void halpern_update_kernel(const double *initial_primal,
+                                      double *current_primal,
+                                      const double *reflected_primal,
+                                      const double *initial_dual,
+                                      double *current_dual,
+                                      const double *reflected_dual,
+                                      int n_vars,
+                                      int n_cons,
+                                      double weight,
+                                      double reflection_coeff);
 __global__ void rescale_solution_kernel(double *primal_solution,
                                         double *dual_solution,
                                         const double *variable_rescaling,
@@ -56,29 +91,27 @@ __global__ void rescale_solution_kernel(double *primal_solution,
                                         const double objective_vector_rescaling,
                                         const double constraint_bound_rescaling,
                                         int n_vars, int n_cons);
-__global__ void compute_delta_solution_kernel(
-    const double *initial_primal, const double *pdhg_primal,
-    double *delta_primal, const double *initial_dual, const double *pdhg_dual,
-    double *delta_dual, int n_vars, int n_cons);
+__global__ void compute_delta_solution_kernel(const double *initial_primal,
+                                              const double *pdhg_primal,
+                                              double *delta_primal,
+                                              const double *initial_dual,
+                                              const double *pdhg_dual,
+                                              double *delta_dual,
+                                              int n_vars,
+                                              int n_cons);
 static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state);
 static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state);
-static void halpern_update(pdhg_solver_state_t *state,
-                           double reflection_coefficient);
+static void halpern_update(pdhg_solver_state_t *state, double reflection_coefficient);
 static void rescale_solution(pdhg_solver_state_t *state);
 static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state);
-static void perform_restart(pdhg_solver_state_t *state,
-                            const pdhg_parameters_t *params);
-static void
-initialize_step_size_and_primal_weight(pdhg_solver_state_t *state,
-                                       const pdhg_parameters_t *params);
-static pdhg_solver_state_t *
-initialize_solver_state(const lp_problem_t *original_problem,
-                        const rescale_info_t *rescale_info);
+static void perform_restart(pdhg_solver_state_t *state, const pdhg_parameters_t *params);
+static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, const pdhg_parameters_t *params);
+static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *original_problem, const pdhg_parameters_t *params);
 static void compute_fixed_point_error(pdhg_solver_state_t *state);
 void pdhg_solver_state_free(pdhg_solver_state_t *state);
 void rescale_info_free(rescale_info_t *info);
 
-static void perform_primal_restart(pdhg_solver_state_t *state);    
+static void perform_primal_restart(pdhg_solver_state_t *state);
 static void perform_dual_restart(pdhg_solver_state_t *state);
 void primal_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state);
 void dual_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state);
@@ -92,16 +125,12 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state(
 static pdhg_solver_state_t *initialize_dual_feas_polish_state(
     const pdhg_solver_state_t *original_state);
 
-
 cupdlpx_result_t *optimize(const pdhg_parameters_t *params,
                            const lp_problem_t *original_problem)
 {
     print_initial_info(params, original_problem);
-    rescale_info_t *rescale_info = rescale_problem(params, original_problem);
-    pdhg_solver_state_t *state =
-        initialize_solver_state(original_problem, rescale_info);
+    pdhg_solver_state_t *state = initialize_solver_state(original_problem, params);
 
-    rescale_info_free(rescale_info);
     initialize_step_size_and_primal_weight(state, params);
     clock_t start_time = clock();
     bool do_restart = false;
@@ -157,8 +186,8 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params,
 
     pdhg_final_log(state, params->verbose, state->termination_reason);
 
-    if (params->feasibility_polishing && 
-        state->termination_reason != TERMINATION_REASON_DUAL_INFEASIBLE && 
+    if (params->feasibility_polishing &&
+        state->termination_reason != TERMINATION_REASON_DUAL_INFEASIBLE &&
         state->termination_reason != TERMINATION_REASON_PRIMAL_INFEASIBLE)
     {
         feasibility_polish(params, state);
@@ -169,15 +198,15 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params,
     return results;
 }
 
-static pdhg_solver_state_t *
-initialize_solver_state(const lp_problem_t *original_problem,
-                        const rescale_info_t *rescale_info)
+static pdhg_solver_state_t *initialize_solver_state(
+    const lp_problem_t *original_problem,
+    const pdhg_parameters_t *params)
 {
-    pdhg_solver_state_t *state =
-        (pdhg_solver_state_t *)safe_calloc(1, sizeof(pdhg_solver_state_t));
+    pdhg_solver_state_t *state = (pdhg_solver_state_t *)safe_calloc(1, sizeof(pdhg_solver_state_t));
 
     int n_vars = original_problem->num_variables;
     int n_cons = original_problem->num_constraints;
+    int nnz = original_problem->constraint_matrix_num_nonzeros;
     size_t var_bytes = n_vars * sizeof(double);
     size_t con_bytes = n_cons * sizeof(double);
 
@@ -185,99 +214,85 @@ initialize_solver_state(const lp_problem_t *original_problem,
     state->num_constraints = n_cons;
     state->objective_constant = original_problem->objective_constant;
 
-    state->constraint_matrix =
-        (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t));
-    state->constraint_matrix_t =
-        (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t));
+    state->constraint_matrix = (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t));
+    state->constraint_matrix_t = (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t));
 
     state->constraint_matrix->num_rows = n_cons;
     state->constraint_matrix->num_cols = n_vars;
-    state->constraint_matrix->num_nonzeros =
-        original_problem->constraint_matrix_num_nonzeros;
+    state->constraint_matrix->num_nonzeros = original_problem->constraint_matrix_num_nonzeros;
 
     state->constraint_matrix_t->num_rows = n_vars;
     state->constraint_matrix_t->num_cols = n_cons;
-    state->constraint_matrix_t->num_nonzeros =
-        original_problem->constraint_matrix_num_nonzeros;
+    state->constraint_matrix_t->num_nonzeros = original_problem->constraint_matrix_num_nonzeros;
 
     state->termination_reason = TERMINATION_REASON_UNSPECIFIED;
 
-    state->rescaling_time_sec = rescale_info->rescaling_time_sec;
+    state->num_blocks_primal = (state->num_variables + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    state->num_blocks_dual = (state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    state->num_blocks_primal_dual = (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    state->num_blocks_nnz = (state->constraint_matrix->num_nonzeros + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+
+    CUSPARSE_CHECK(cusparseCreate(&state->sparse_handle));
+    CUBLAS_CHECK(cublasCreate(&state->blas_handle));
+    CUBLAS_CHECK(cublasSetPointerMode(state->blas_handle, CUBLAS_POINTER_MODE_HOST));
 
 #define ALLOC_AND_COPY(dest, src, bytes)  \
     CUDA_CHECK(cudaMalloc(&dest, bytes)); \
     CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyHostToDevice));
 
-    ALLOC_AND_COPY(state->constraint_matrix->row_ptr,
-                   rescale_info->scaled_problem->constraint_matrix_row_pointers,
-                   (n_cons + 1) * sizeof(int));
-    ALLOC_AND_COPY(state->constraint_matrix->col_ind,
-                   rescale_info->scaled_problem->constraint_matrix_col_indices,
-                   rescale_info->scaled_problem->constraint_matrix_num_nonzeros *
-                       sizeof(int));
-    ALLOC_AND_COPY(state->constraint_matrix->val,
-                   rescale_info->scaled_problem->constraint_matrix_values,
-                   rescale_info->scaled_problem->constraint_matrix_num_nonzeros *
-                       sizeof(double));
-
-    CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr,
-                          (n_vars + 1) * sizeof(int)));
-    CUDA_CHECK(
-        cudaMalloc(&state->constraint_matrix_t->col_ind,
-                   rescale_info->scaled_problem->constraint_matrix_num_nonzeros *
-                       sizeof(int)));
-    CUDA_CHECK(
-        cudaMalloc(&state->constraint_matrix_t->val,
-                   rescale_info->scaled_problem->constraint_matrix_num_nonzeros *
-                       sizeof(double)));
+    ALLOC_AND_COPY(state->constraint_matrix->row_ptr, original_problem->constraint_matrix_row_pointers, (n_cons + 1) * sizeof(int));
+    ALLOC_AND_COPY(state->constraint_matrix->col_ind, original_problem->constraint_matrix_col_indices, original_problem->constraint_matrix_num_nonzeros * sizeof(int));
+    ALLOC_AND_COPY(state->constraint_matrix->val, original_problem->constraint_matrix_values, original_problem->constraint_matrix_num_nonzeros * sizeof(double));
 
-    CUSPARSE_CHECK(cusparseCreate(&state->sparse_handle));
-    CUBLAS_CHECK(cublasCreate(&state->blas_handle));
-    CUBLAS_CHECK(
-        cublasSetPointerMode(state->blas_handle, CUBLAS_POINTER_MODE_HOST));
+    CUDA_CHECK(cudaMalloc(&state->constraint_matrix->row_ind, nnz * sizeof(int)));
+    build_row_ind<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        state->constraint_matrix->row_ptr, n_cons, state->constraint_matrix->row_ind);
+    CUDA_CHECK(cudaGetLastError());
+
+    CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, (n_vars + 1) * sizeof(int)));
+    CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind, original_problem->constraint_matrix_num_nonzeros * sizeof(int)));
+    CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->val, original_problem->constraint_matrix_num_nonzeros * sizeof(double)));
 
     size_t buffer_size = 0;
     void *buffer = nullptr;
     CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize(
-        state->sparse_handle, state->constraint_matrix->num_rows,
-        state->constraint_matrix->num_cols,
-        state->constraint_matrix->num_nonzeros, state->constraint_matrix->val,
-        state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind,
-        state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr,
-        state->constraint_matrix_t->col_ind, CUDA_R_64F, CUSPARSE_ACTION_NUMERIC,
-        CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size));
+        state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros,
+        state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind,
+        state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind,
+        CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
+        CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size));
     CUDA_CHECK(cudaMalloc(&buffer, buffer_size));
 
     CUSPARSE_CHECK(cusparseCsr2cscEx2(
-        state->sparse_handle, state->constraint_matrix->num_rows,
-        state->constraint_matrix->num_cols,
-        state->constraint_matrix->num_nonzeros, state->constraint_matrix->val,
-        state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind,
-        state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr,
-        state->constraint_matrix_t->col_ind, CUDA_R_64F, CUSPARSE_ACTION_NUMERIC,
-        CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer));
+        state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros,
+        state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind,
+        state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind,
+        CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
+        CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer));
 
     CUDA_CHECK(cudaFree(buffer));
 
-    ALLOC_AND_COPY(state->variable_lower_bound,
-                   rescale_info->scaled_problem->variable_lower_bound, var_bytes);
-    ALLOC_AND_COPY(state->variable_upper_bound,
-                   rescale_info->scaled_problem->variable_upper_bound, var_bytes);
-    ALLOC_AND_COPY(state->objective_vector,
-                   rescale_info->scaled_problem->objective_vector, var_bytes);
-    ALLOC_AND_COPY(state->constraint_lower_bound,
-                   rescale_info->scaled_problem->constraint_lower_bound,
-                   con_bytes);
-    ALLOC_AND_COPY(state->constraint_upper_bound,
-                   rescale_info->scaled_problem->constraint_upper_bound,
-                   con_bytes);
-    ALLOC_AND_COPY(state->constraint_rescaling, rescale_info->con_rescale,
-                   con_bytes);
-    ALLOC_AND_COPY(state->variable_rescaling, rescale_info->var_rescale,
-                   var_bytes);
+    CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ind, nnz * sizeof(int)));
+    build_row_ind<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+        state->constraint_matrix_t->row_ptr, n_vars, state->constraint_matrix_t->row_ind);
+    CUDA_CHECK(cudaGetLastError());
 
-    state->constraint_bound_rescaling = rescale_info->con_bound_rescale;
-    state->objective_vector_rescaling = rescale_info->obj_vec_rescale;
+    CUDA_CHECK(cudaMalloc(&state->constraint_matrix->transpose_map, nnz * sizeof(int)));
+    state->constraint_matrix_t->transpose_map = NULL;
+    build_transpose_map<<<state->num_blocks_nnz, THREADS_PER_BLOCK>>>(
+        state->constraint_matrix->row_ind,
+        state->constraint_matrix->col_ind,
+        state->constraint_matrix_t->row_ptr,
+        state->constraint_matrix_t->col_ind,
+        nnz,
+        state->constraint_matrix->transpose_map);
+    CUDA_CHECK(cudaGetLastError());
+
+    ALLOC_AND_COPY(state->variable_lower_bound, original_problem->variable_lower_bound, var_bytes);
+    ALLOC_AND_COPY(state->variable_upper_bound, original_problem->variable_upper_bound, var_bytes);
+    ALLOC_AND_COPY(state->objective_vector, original_problem->objective_vector, var_bytes);
+    ALLOC_AND_COPY(state->constraint_lower_bound, original_problem->constraint_lower_bound, con_bytes);
+    ALLOC_AND_COPY(state->constraint_upper_bound, original_problem->constraint_upper_bound, con_bytes);
 
 #define ALLOC_ZERO(dest, bytes)           \
     CUDA_CHECK(cudaMalloc(&dest, bytes)); \
@@ -303,91 +318,77 @@ initialize_solver_state(const lp_problem_t *original_problem,
 
     if (original_problem->primal_start)
     {
-        double *rescaled = (double *)safe_malloc(var_bytes);
-        for (int i = 0; i < n_vars; ++i)
-            rescaled[i] = original_problem->primal_start[i] *
-                          rescale_info->var_rescale[i] *
-                          rescale_info->con_bound_rescale;
-        CUDA_CHECK(cudaMemcpy(state->initial_primal_solution, rescaled, var_bytes,
-                              cudaMemcpyHostToDevice));
-        free(rescaled);
+        CUDA_CHECK(cudaMemcpy(state->initial_primal_solution,
+                              original_problem->primal_start,
+                              var_bytes, cudaMemcpyHostToDevice));
     }
     if (original_problem->dual_start)
     {
-        double *rescaled = (double *)safe_malloc(con_bytes);
-        for (int i = 0; i < n_cons; ++i)
-            rescaled[i] = original_problem->dual_start[i] *
-                          rescale_info->con_rescale[i] *
-                          rescale_info->obj_vec_rescale;
-        CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, rescaled, con_bytes,
-                              cudaMemcpyHostToDevice));
-        free(rescaled);
+        CUDA_CHECK(cudaMemcpy(state->initial_dual_solution,
+                              original_problem->dual_start,
+                              con_bytes, cudaMemcpyHostToDevice));
     }
 
-    double *temp_host = (double *)safe_malloc(fmax(var_bytes, con_bytes));
-    for (int i = 0; i < n_cons; ++i)
-        temp_host[i] =
-            isfinite(rescale_info->scaled_problem->constraint_lower_bound[i])
-                ? rescale_info->scaled_problem->constraint_lower_bound[i]
-                : 0.0;
-    ALLOC_AND_COPY(state->constraint_lower_bound_finite_val, temp_host,
-                   con_bytes);
-    for (int i = 0; i < n_cons; ++i)
-        temp_host[i] =
-            isfinite(rescale_info->scaled_problem->constraint_upper_bound[i])
-                ? rescale_info->scaled_problem->constraint_upper_bound[i]
-                : 0.0;
-    ALLOC_AND_COPY(state->constraint_upper_bound_finite_val, temp_host,
-                   con_bytes);
-    for (int i = 0; i < n_vars; ++i)
-        temp_host[i] =
-            isfinite(rescale_info->scaled_problem->variable_lower_bound[i])
-                ? rescale_info->scaled_problem->variable_lower_bound[i]
-                : 0.0;
-    ALLOC_AND_COPY(state->variable_lower_bound_finite_val, temp_host, var_bytes);
-    for (int i = 0; i < n_vars; ++i)
-        temp_host[i] =
-            isfinite(rescale_info->scaled_problem->variable_upper_bound[i])
-                ? rescale_info->scaled_problem->variable_upper_bound[i]
-                : 0.0;
-    ALLOC_AND_COPY(state->variable_upper_bound_finite_val, temp_host, var_bytes);
-    free(temp_host);
+    rescale_info_t *rescale_info = rescale_problem(params, state);
 
-    double sum_of_squares = 0.0;
+    state->constraint_rescaling = rescale_info->con_rescale;
+    state->variable_rescaling = rescale_info->var_rescale;
+    state->constraint_bound_rescaling = rescale_info->con_bound_rescale;
+    state->objective_vector_rescaling = rescale_info->obj_vec_rescale;
+    state->rescaling_time_sec = rescale_info->rescaling_time_sec;
+
+    rescale_info->con_rescale = NULL;
+    rescale_info->var_rescale = NULL;
+    rescale_info_free(rescale_info);
+
+    CUDA_CHECK(cudaMalloc(&state->constraint_lower_bound_finite_val, con_bytes));
+    CUDA_CHECK(cudaMalloc(&state->constraint_upper_bound_finite_val, con_bytes));
+    CUDA_CHECK(cudaMalloc(&state->variable_lower_bound_finite_val, var_bytes));
+    CUDA_CHECK(cudaMalloc(&state->variable_upper_bound_finite_val, var_bytes));
+
+    fill_finite_bounds_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        state->constraint_lower_bound,
+        state->constraint_upper_bound,
+        state->constraint_lower_bound_finite_val,
+        state->constraint_upper_bound_finite_val,
+        n_cons);
+
+    fill_finite_bounds_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+        state->variable_lower_bound,
+        state->variable_upper_bound,
+        state->variable_lower_bound_finite_val,
+        state->variable_upper_bound_finite_val,
+        n_vars);
+
+    CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind));
+    state->constraint_matrix->row_ind = NULL;
+    CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind));
+    state->constraint_matrix_t->row_ind = NULL;
+    CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_map));
+    state->constraint_matrix->transpose_map = NULL;
 
+    double sum_of_squares = 0.0;
     for (int i = 0; i < n_vars; ++i)
     {
-        sum_of_squares += original_problem->objective_vector[i] *
-                          original_problem->objective_vector[i];
+        sum_of_squares += original_problem->objective_vector[i] * original_problem->objective_vector[i];
     }
     state->objective_vector_norm = sqrt(sum_of_squares);
 
     sum_of_squares = 0.0;
-
     for (int i = 0; i < n_cons; ++i)
     {
         double lower = original_problem->constraint_lower_bound[i];
         double upper = original_problem->constraint_upper_bound[i];
-
         if (isfinite(lower) && (lower != upper))
         {
             sum_of_squares += lower * lower;
         }
-
         if (isfinite(upper))
         {
             sum_of_squares += upper * upper;
         }
     }
-
     state->constraint_bound_norm = sqrt(sum_of_squares);
-    state->num_blocks_primal =
-        (state->num_variables + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    state->num_blocks_dual =
-        (state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    state->num_blocks_primal_dual =
-        (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) /
-        THREADS_PER_BLOCK;
 
     state->best_primal_dual_residual_gap = INFINITY;
     state->last_trial_fixed_point_error = INFINITY;
@@ -397,82 +398,116 @@ initialize_solver_state(const lp_problem_t *original_problem,
     size_t primal_spmv_buffer_size;
     size_t dual_spmv_buffer_size;
 
-    CUSPARSE_CHECK(cusparseCreateCsr(
-        &state->matA, state->num_constraints, state->num_variables,
-        state->constraint_matrix->num_nonzeros, state->constraint_matrix->row_ptr,
-        state->constraint_matrix->col_ind, state->constraint_matrix->val,
-        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
-        CUDA_R_64F));
-
+    CUSPARSE_CHECK(cusparseCreateCsr(&state->matA, state->num_constraints, state->num_variables, state->constraint_matrix->num_nonzeros, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, state->constraint_matrix->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
     CUDA_CHECK(cudaGetLastError());
 
-    CUSPARSE_CHECK(cusparseCreateCsr(
-        &state->matAt, state->num_variables, state->num_constraints,
-        state->constraint_matrix_t->num_nonzeros,
-        state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind,
-        state->constraint_matrix_t->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-        CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
+    CUSPARSE_CHECK(cusparseCreateCsr(&state->matAt, state->num_variables, state->num_constraints, state->constraint_matrix_t->num_nonzeros, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, state->constraint_matrix_t->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
     CUDA_CHECK(cudaGetLastError());
 
-    CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_sol,
-                                       state->num_variables,
-                                       state->pdhg_primal_solution, CUDA_R_64F));
-    CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_sol,
-                                       state->num_constraints,
-                                       state->pdhg_dual_solution, CUDA_R_64F));
-    CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod,
-                                       state->num_constraints,
-                                       state->primal_product, CUDA_R_64F));
-    CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod,
-                                       state->num_variables, state->dual_product,
-                                       CUDA_R_64F));
-    CUSPARSE_CHECK(cusparseSpMV_bufferSize(
-        state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE,
-        state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod,
-        CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size));
+    CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_sol, state->num_variables, state->pdhg_primal_solution, CUDA_R_64F));
+    CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_sol, state->num_constraints, state->pdhg_dual_solution, CUDA_R_64F));
+    CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F));
+    CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F));
+    CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size));
 
-    CUSPARSE_CHECK(cusparseSpMV_bufferSize(
-        state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE,
-        state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod,
-        CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size));
+    CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size));
     CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size));
-    CUSPARSE_CHECK(cusparseSpMV_preprocess(
-        state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE,
-        state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod,
-        CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer));
+    CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                           &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod,
+                                           CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer));
 
     CUDA_CHECK(cudaMalloc(&state->dual_spmv_buffer, dual_spmv_buffer_size));
-    CUSPARSE_CHECK(cusparseSpMV_preprocess(
-        state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE,
-        state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod,
-        CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer));
+    CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                           &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod,
+                                           CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer));
 
-    CUDA_CHECK(
-        cudaMalloc(&state->ones_primal_d, state->num_variables * sizeof(double)));
-    CUDA_CHECK(
-        cudaMalloc(&state->ones_dual_d, state->num_constraints * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&state->ones_primal_d, state->num_variables * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&state->ones_dual_d, state->num_constraints * sizeof(double)));
 
-    double *ones_primal_h =
-        (double *)safe_malloc(state->num_variables * sizeof(double));
+    double *ones_primal_h = (double *)safe_malloc(state->num_variables * sizeof(double));
     for (int i = 0; i < state->num_variables; ++i)
         ones_primal_h[i] = 1.0;
-    CUDA_CHECK(cudaMemcpy(state->ones_primal_d, ones_primal_h,
-                          state->num_variables * sizeof(double),
-                          cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(state->ones_primal_d, ones_primal_h, state->num_variables * sizeof(double), cudaMemcpyHostToDevice));
     free(ones_primal_h);
 
-    double *ones_dual_h =
-        (double *)safe_malloc(state->num_constraints * sizeof(double));
+    double *ones_dual_h = (double *)safe_malloc(state->num_constraints * sizeof(double));
     for (int i = 0; i < state->num_constraints; ++i)
         ones_dual_h[i] = 1.0;
-    CUDA_CHECK(cudaMemcpy(state->ones_dual_d, ones_dual_h,
-                          state->num_constraints * sizeof(double),
-                          cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(state->ones_dual_d, ones_dual_h, state->num_constraints * sizeof(double), cudaMemcpyHostToDevice));
     free(ones_dual_h);
 
     return state;
 }
 
+__global__ void build_row_ind(const int *__restrict__ row_ptr,
+                              int num_rows,
+                              int *__restrict__ row_ind)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= num_rows)
+        return;
+
+    int s = row_ptr[i];
+    int e = row_ptr[i + 1];
+
+    for (int k = s; k < e; ++k)
+    {
+        row_ind[k] = i;
+    }
+}
+
+__global__ void build_transpose_map(
+    const int *__restrict__ A_row_ind,
+    const int *__restrict__ A_col_ind,
+    const int *__restrict__ At_row_ptr,
+    const int *__restrict__ At_col_ind,
+    int nnz,
+    int *__restrict__ A_to_At)
+{
+    int k = blockIdx.x * blockDim.x + threadIdx.x;
+    if (k >= nnz)
+        return;
+
+    int i = A_row_ind[k];
+    int j = A_col_ind[k];
+
+    int start = At_row_ptr[j];
+    int end = At_row_ptr[j + 1];
+
+    int pos = -1;
+    for (int idx = start; idx < end; ++idx)
+    {
+        if (At_col_ind[idx] == i)
+        {
+            pos = idx;
+            break;
+        }
+    }
+
+    if (pos < 0)
+        return;
+
+    A_to_At[k] = pos;
+}
+
+__global__ void fill_finite_bounds_kernel(
+    const double *__restrict__ lb,
+    const double *__restrict__ ub,
+    double *__restrict__ lb_finite,
+    double *__restrict__ ub_finite,
+    int num_elements)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= num_elements)
+        return;
+
+    double Li = lb[i];
+    double Ui = ub[i];
+
+    lb_finite[i] = isfinite(Li) ? Li : 0.0;
+    ub_finite[i] = isfinite(Ui) ? Ui : 0.0;
+}
+
 __global__ void compute_next_pdhg_primal_solution_kernel(
     const double *current_primal, double *reflected_primal,
     const double *dual_product, const double *objective, const double *var_lb,
@@ -754,9 +789,8 @@ static void perform_restart(pdhg_solver_state_t *state,
     state->last_trial_fixed_point_error = INFINITY;
 }
 
-static void
-initialize_step_size_and_primal_weight(pdhg_solver_state_t *state,
-                                       const pdhg_parameters_t *params)
+static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state,
+                                                   const pdhg_parameters_t *params)
 {
     double max_sv = estimate_maximum_singular_value(
         state->sparse_handle, state->blas_handle, state->constraint_matrix,
@@ -823,6 +857,27 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state)
         return;
     }
 
+    if (state->matA)
+        CUSPARSE_CHECK(cusparseDestroySpMat(state->matA));
+    if (state->matAt)
+        CUSPARSE_CHECK(cusparseDestroySpMat(state->matAt));
+    if (state->vec_primal_sol)
+        CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_sol));
+    if (state->vec_dual_sol)
+        CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_sol));
+    if (state->vec_primal_prod)
+        CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_prod));
+    if (state->vec_dual_prod)
+        CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_prod));
+    if (state->primal_spmv_buffer)
+        CUDA_CHECK(cudaFree(state->primal_spmv_buffer));
+    if (state->dual_spmv_buffer)
+        CUDA_CHECK(cudaFree(state->dual_spmv_buffer));
+    if (state->sparse_handle)
+        CUSPARSE_CHECK(cusparseDestroy(state->sparse_handle));
+    if (state->blas_handle)
+        CUBLAS_CHECK(cublasDestroy(state->blas_handle));
+
     if (state->variable_lower_bound)
         CUDA_CHECK(cudaFree(state->variable_lower_bound));
     if (state->variable_upper_bound)
@@ -833,14 +888,22 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state)
         CUDA_CHECK(cudaFree(state->constraint_matrix->row_ptr));
     if (state->constraint_matrix->col_ind)
         CUDA_CHECK(cudaFree(state->constraint_matrix->col_ind));
+    if (state->constraint_matrix->row_ind)
+        CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind));
     if (state->constraint_matrix->val)
         CUDA_CHECK(cudaFree(state->constraint_matrix->val));
+    if (state->constraint_matrix->transpose_map)
+        CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_map));
     if (state->constraint_matrix_t->row_ptr)
         CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ptr));
     if (state->constraint_matrix_t->col_ind)
         CUDA_CHECK(cudaFree(state->constraint_matrix_t->col_ind));
+    if (state->constraint_matrix_t->row_ind)
+        CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind));
     if (state->constraint_matrix_t->val)
         CUDA_CHECK(cudaFree(state->constraint_matrix_t->val));
+    if (state->constraint_matrix_t->transpose_map)
+        CUDA_CHECK(cudaFree(state->constraint_matrix_t->transpose_map));
     if (state->constraint_lower_bound)
         CUDA_CHECK(cudaFree(state->constraint_lower_bound));
     if (state->constraint_upper_bound)
@@ -894,6 +957,11 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state)
     if (state->ones_dual_d)
         CUDA_CHECK(cudaFree(state->ones_dual_d));
 
+    if (state->constraint_matrix)
+        free(state->constraint_matrix);
+    if (state->constraint_matrix_t)
+        free(state->constraint_matrix_t);
+
     free(state);
 }
 
@@ -904,9 +972,8 @@ void rescale_info_free(rescale_info_t *info)
         return;
     }
 
-    lp_problem_free(info->scaled_problem);
-    free(info->con_rescale);
-    free(info->var_rescale);
+    CUDA_CHECK(cudaFree(info->con_rescale));
+    CUDA_CHECK(cudaFree(info->var_rescale));
 
     free(info);
 }
@@ -979,14 +1046,14 @@ void set_default_parameters(pdhg_parameters_t *params)
     params->restart_params.i_smooth = 0.3;
 }
 
-//Feasibility Polishing
+// Feasibility Polishing
 void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state)
 {
     clock_t start_time = clock();
     if (state->relative_primal_residual < params->termination_criteria.eps_feas_polish_relative &&
         state->relative_dual_residual < params->termination_criteria.eps_feas_polish_relative)
     {
-        
+
         printf("Skipping feasibility polishing as the solution is already sufficiently feasible.\n");
         return;
     }
@@ -1000,7 +1067,7 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st
         original_primal_weight = (state->objective_vector_norm + 1.0) / (state->constraint_bound_norm + 1.0);
     }
 
-    //PRIMAL FEASIBILITY POLISHING
+    // PRIMAL FEASIBILITY POLISHING
     pdhg_solver_state_t *primal_state = initialize_primal_feas_polish_state(state);
     primal_state->primal_weight = original_primal_weight;
     primal_state->best_primal_weight = original_primal_weight;
@@ -1016,8 +1083,8 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st
         state->primal_objective_value = primal_state->primal_objective_value;
     }
     state->feasibility_iteration += primal_state->total_count - 1;
-    
-    //DUAL FEASIBILITY POLISHING
+
+    // DUAL FEASIBILITY POLISHING
     pdhg_solver_state_t *dual_state = initialize_dual_feas_polish_state(state);
     dual_state->primal_weight = original_primal_weight;
     dual_state->best_primal_weight = original_primal_weight;
@@ -1036,7 +1103,7 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st
 
     state->objective_gap = fabs(state->primal_objective_value - state->dual_objective_value);
     state->relative_objective_gap = state->objective_gap / (1.0 + fabs(state->primal_objective_value) + fabs(state->dual_objective_value));
-    
+
     // FINAL LOGGING
     pdhg_feas_polish_final_log(primal_state, dual_state, params->verbose);
     primal_feas_polish_state_free(primal_state);
@@ -1150,22 +1217,22 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state(
     CUDA_CHECK(cudaMalloc(&dest, bytes)); \
     CUDA_CHECK(cudaMemset(dest, 0, bytes));
 
-    //RESET PROBLEM TO FEASIBILITY PROBLEM
+    // RESET PROBLEM TO FEASIBILITY PROBLEM
     ALLOC_ZERO(primal_state->objective_vector, num_var * sizeof(double));
     primal_state->objective_constant = 0.0;
 
-#define ALLOC_AND_COPY_DEV(dest, src, bytes)  \
-    CUDA_CHECK(cudaMalloc(&dest, bytes)); \
+#define ALLOC_AND_COPY_DEV(dest, src, bytes) \
+    CUDA_CHECK(cudaMalloc(&dest, bytes));    \
     CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice));
 
-    //ALLOCATE AND COPY SOLUTION VECTORS
+    // ALLOCATE AND COPY SOLUTION VECTORS
     ALLOC_AND_COPY_DEV(primal_state->initial_primal_solution, original_state->initial_primal_solution, num_var * sizeof(double));
     ALLOC_AND_COPY_DEV(primal_state->current_primal_solution, original_state->current_primal_solution, num_var * sizeof(double));
     ALLOC_AND_COPY_DEV(primal_state->pdhg_primal_solution, original_state->pdhg_primal_solution, num_var * sizeof(double));
     ALLOC_AND_COPY_DEV(primal_state->reflected_primal_solution, original_state->reflected_primal_solution, num_var * sizeof(double));
     ALLOC_AND_COPY_DEV(primal_state->primal_product, original_state->primal_product, num_cons * sizeof(double));
 
-    //ALLOC ZERO FOR OTHERS
+    // ALLOC ZERO FOR OTHERS
     ALLOC_ZERO(primal_state->initial_dual_solution, num_cons * sizeof(double));
     ALLOC_ZERO(primal_state->current_dual_solution, num_cons * sizeof(double));
     ALLOC_ZERO(primal_state->pdhg_dual_solution, num_cons * sizeof(double));
@@ -1179,7 +1246,7 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state(
     ALLOC_ZERO(primal_state->delta_primal_solution, num_var * sizeof(double));
     ALLOC_ZERO(primal_state->delta_dual_solution, num_cons * sizeof(double));
 
-    //RESET SCALAR
+    // RESET SCALAR
     primal_state->primal_weight_error_sum = 0.0;
     primal_state->primal_weight_last_error = 0.0;
     primal_state->best_primal_weight = 0.0;
@@ -1195,7 +1262,7 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state(
     primal_state->cumulative_time_sec = 0.0;
     primal_state->best_primal_dual_residual_gap = INFINITY;
 
-    //IGNORE DUAL RESIDUAL AND OBJECTIVE GAP
+    // IGNORE DUAL RESIDUAL AND OBJECTIVE GAP
     primal_state->relative_dual_residual = 0.0;
     primal_state->absolute_dual_residual = 0.0;
     primal_state->relative_objective_gap = 0.0;
@@ -1206,13 +1273,15 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state(
 
 void primal_feas_polish_state_free(pdhg_solver_state_t *state)
 {
-    #define SAFE_CUDA_FREE(p)   \
-    if ((p) != NULL) {          \
-        CUDA_CHECK(cudaFree(p));\
-        (p) = NULL;             \
-    }                           \
-    
-    if (!state) return;
+#define SAFE_CUDA_FREE(p)        \
+    if ((p) != NULL)             \
+    {                            \
+        CUDA_CHECK(cudaFree(p)); \
+        (p) = NULL;              \
+    }
+
+    if (!state)
+        return;
     SAFE_CUDA_FREE(state->objective_vector);
     SAFE_CUDA_FREE(state->initial_primal_solution);
     SAFE_CUDA_FREE(state->current_primal_solution);
@@ -1225,7 +1294,7 @@ void primal_feas_polish_state_free(pdhg_solver_state_t *state)
     SAFE_CUDA_FREE(state->pdhg_dual_solution);
     SAFE_CUDA_FREE(state->reflected_dual_solution);
     SAFE_CUDA_FREE(state->primal_product);
-    
+
     SAFE_CUDA_FREE(state->primal_slack);
     SAFE_CUDA_FREE(state->dual_slack);
     SAFE_CUDA_FREE(state->primal_residual);
@@ -1245,7 +1314,6 @@ __global__ void zero_finite_value_vectors_kernel(
             vec[idx] = 0.0;
     }
 }
-    
 
 static pdhg_solver_state_t *initialize_dual_feas_polish_state(
     const pdhg_solver_state_t *original_state)
@@ -1255,18 +1323,18 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state(
     int num_var = original_state->num_variables;
     int num_cons = original_state->num_constraints;
 
-    #define ALLOC_AND_COPY_DEV(dest, src, bytes)  \
-        CUDA_CHECK(cudaMalloc(&dest, bytes)); \
-        CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice));
-
-    //RESET PROBLEM TO DUAL FEASIBILITY PROBLEM
-    #define SET_FINITE_TO_ZERO(vec, n) \
-        { \
-            int threads = 256; \
-            int blocks = (n + threads - 1) / threads; \
-            zero_finite_value_vectors_kernel<<<blocks, threads>>>(vec, n); \
-            CUDA_CHECK(cudaDeviceSynchronize()); \
-        }
+#define ALLOC_AND_COPY_DEV(dest, src, bytes) \
+    CUDA_CHECK(cudaMalloc(&dest, bytes));    \
+    CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice));
+
+// RESET PROBLEM TO DUAL FEASIBILITY PROBLEM
+#define SET_FINITE_TO_ZERO(vec, n)                                     \
+    {                                                                  \
+        int threads = 256;                                             \
+        int blocks = (n + threads - 1) / threads;                      \
+        zero_finite_value_vectors_kernel<<<blocks, threads>>>(vec, n); \
+        CUDA_CHECK(cudaDeviceSynchronize());                           \
+    }
 
     ALLOC_AND_COPY_DEV(dual_state->constraint_lower_bound, original_state->constraint_lower_bound, num_cons * sizeof(double));
     ALLOC_AND_COPY_DEV(dual_state->constraint_upper_bound, original_state->constraint_upper_bound, num_cons * sizeof(double));
@@ -1278,16 +1346,16 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state(
     SET_FINITE_TO_ZERO(dual_state->variable_lower_bound, num_var);
     SET_FINITE_TO_ZERO(dual_state->variable_upper_bound, num_var);
 
-    #define ALLOC_ZERO(dest, bytes)           \
-        CUDA_CHECK(cudaMalloc(&dest, bytes)); \
-        CUDA_CHECK(cudaMemset(dest, 0, bytes));
+#define ALLOC_ZERO(dest, bytes)           \
+    CUDA_CHECK(cudaMalloc(&dest, bytes)); \
+    CUDA_CHECK(cudaMemset(dest, 0, bytes));
 
     ALLOC_ZERO(dual_state->constraint_lower_bound_finite_val, num_cons * sizeof(double));
     ALLOC_ZERO(dual_state->constraint_upper_bound_finite_val, num_cons * sizeof(double));
     ALLOC_ZERO(dual_state->variable_lower_bound_finite_val, num_var * sizeof(double));
     ALLOC_ZERO(dual_state->variable_upper_bound_finite_val, num_var * sizeof(double));
 
-    //ALLOCATE AND COPY SOLUTION VECTORS
+    // ALLOCATE AND COPY SOLUTION VECTORS
     ALLOC_AND_COPY_DEV(dual_state->initial_dual_solution, original_state->initial_dual_solution, num_cons * sizeof(double));
     ALLOC_AND_COPY_DEV(dual_state->current_dual_solution, original_state->current_dual_solution, num_cons * sizeof(double));
     ALLOC_AND_COPY_DEV(dual_state->pdhg_dual_solution, original_state->pdhg_dual_solution, num_cons * sizeof(double));
@@ -1295,7 +1363,7 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state(
     ALLOC_AND_COPY_DEV(dual_state->dual_product, original_state->dual_product, num_var * sizeof(double));
     ALLOC_AND_COPY_DEV(dual_state->dual_slack, original_state->dual_slack, num_var * sizeof(double));
 
-    //ALLOC ZERO FOR OTHERS
+    // ALLOC ZERO FOR OTHERS
     ALLOC_ZERO(dual_state->initial_primal_solution, num_var * sizeof(double));
     ALLOC_ZERO(dual_state->current_primal_solution, num_var * sizeof(double));
     ALLOC_ZERO(dual_state->pdhg_primal_solution, num_var * sizeof(double));
@@ -1307,7 +1375,7 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state(
     ALLOC_ZERO(dual_state->delta_primal_solution, num_var * sizeof(double));
     ALLOC_ZERO(dual_state->delta_dual_solution, num_cons * sizeof(double));
 
-    //RESET SCALAR
+    // RESET SCALAR
     dual_state->primal_weight_error_sum = 0.0;
     dual_state->primal_weight_last_error = 0.0;
     dual_state->best_primal_weight = 0.0;
@@ -1323,7 +1391,7 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state(
     dual_state->cumulative_time_sec = 0.0;
     dual_state->best_primal_dual_residual_gap = INFINITY;
 
-    //IGNORE PRIMAL RESIDUAL AND OBJECTIVE GAP
+    // IGNORE PRIMAL RESIDUAL AND OBJECTIVE GAP
     dual_state->relative_primal_residual = 0.0;
     dual_state->absolute_primal_residual = 0.0;
     dual_state->relative_objective_gap = 0.0;
@@ -1333,13 +1401,15 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state(
 
 void dual_feas_polish_state_free(pdhg_solver_state_t *state)
 {
-    #define SAFE_CUDA_FREE(p)   \
-    if ((p) != NULL) {          \
-        CUDA_CHECK(cudaFree(p));\
-        (p) = NULL;             \
-    }                           \
-    
-    if (!state) return;
+#define SAFE_CUDA_FREE(p)        \
+    if ((p) != NULL)             \
+    {                            \
+        CUDA_CHECK(cudaFree(p)); \
+        (p) = NULL;              \
+    }
+
+    if (!state)
+        return;
     SAFE_CUDA_FREE(state->constraint_lower_bound);
     SAFE_CUDA_FREE(state->constraint_upper_bound);
     SAFE_CUDA_FREE(state->variable_lower_bound);
@@ -1360,7 +1430,7 @@ void dual_feas_polish_state_free(pdhg_solver_state_t *state)
     SAFE_CUDA_FREE(state->pdhg_dual_solution);
     SAFE_CUDA_FREE(state->reflected_dual_solution);
     SAFE_CUDA_FREE(state->primal_product);
-    
+
     SAFE_CUDA_FREE(state->primal_slack);
     SAFE_CUDA_FREE(state->dual_slack);
     SAFE_CUDA_FREE(state->primal_residual);
@@ -1439,5 +1509,3 @@ static void compute_dual_fixed_point_error(pdhg_solver_state_t *state)
                                    &dual_norm));
     state->fixed_point_error = dual_norm * dual_norm / state->primal_weight;
 }
-
-
diff --git a/src/utils.cu b/src/utils.cu
index 172e731..5864c56 100644
--- a/src/utils.cu
+++ b/src/utils.cu
@@ -373,13 +373,14 @@ void pdhg_final_log(const pdhg_solver_state_t *state, bool verbose,
                "--------------------\n");
     }
     printf("Solution Summary\n");
-    printf("  Status        : %s\n", termination_reason_to_string(reason));
-    printf("  Iterations    : %d\n", state->total_count - 1);
-    printf("  Solve time    : %.3g sec\n", state->cumulative_time_sec);
-    printf("  Primal obj    : %.10g\n", state->primal_objective_value);
-    printf("  Dual obj      : %.10g\n", state->dual_objective_value);
-    printf("  Primal infeas : %.3e\n", state->relative_primal_residual);
-    printf("  Dual infeas   : %.3e\n", state->relative_dual_residual);
+    printf("  Status            : %s\n", termination_reason_to_string(reason));
+    printf("  Precondition time : %.5g sec\n", state->rescaling_time_sec);
+    printf("  Iterations        : %d\n", state->total_count - 1);
+    printf("  Solve time        : %.3g sec\n", state->cumulative_time_sec);
+    printf("  Primal obj        : %.10g\n", state->primal_objective_value);
+    printf("  Dual obj          : %.10g\n", state->dual_objective_value);
+    printf("  Primal infeas     : %.3e\n", state->relative_primal_residual);
+    printf("  Dual infeas       : %.3e\n", state->relative_dual_residual);
 }
 
 void display_iteration_stats(const pdhg_solver_state_t *state, bool verbose)
@@ -1013,7 +1014,7 @@ void print_initial_feas_polish_info(bool is_primal_polish, const pdhg_parameters
     printf("---------------------------------------------------------------------------------------\n");
     if (is_primal_polish) printf("%s %s |  %s  | %s | %s \n",  "  iter", "  time ", "pr obj", " abs pr res ", " rel pr res ");
     // else printf("%s %s | %s | %s \n",  "  iter", "  time ", " abs du res ", " rel du res ");
-    else printf("%s %s |  %s  | %s | %s \n", "  iter", "  time ", "du obj", " abs du res ", " rel du res ");  
+    else printf("%s %s |  %s  | %s | %s \n", "  iter", "  time ", "du obj", " abs du res ", " rel du res ");
     printf("---------------------------------------------------------------------------------------\n");
 }
 
@@ -1137,7 +1138,7 @@ void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_so
     CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer));
 
     compute_dual_feas_polish_residual_kerenl<<<state->num_blocks_primal_dual, THREADS_PER_BLOCK>>>(
-        state->dual_residual, 
+        state->dual_residual,
         state->pdhg_dual_solution,
         state->dual_product,
         state->dual_slack, state->objective_vector,