From f6052920346bf266403e0c0295f553c3ccf76d26 Mon Sep 17 00:00:00 2001
From: AaronDC60
Date: Fri, 27 Jun 2025 17:35:57 +0200
Subject: [PATCH 1/4] Add option to change the number of datapoints

---
 include/data/dataset.h                 |  9 +++++++++
 mcmpy/include/py_dataset.h             |  3 +++
 mcmpy/src/py_dataset.cpp               |  4 +++-
 src/data/complexity.cpp                |  2 +-
 src/data/dataset.cpp                   | 10 ++++++++++
 src/data/evidence.cpp                  |  9 +++++----
 src/data/likelihood.cpp                |  5 +++--
 src/search/mcm_search/annealing.cpp    | 13 +++++++------
 src/search/mcm_search/div_and_conq.cpp | 14 +++++++-------
 src/search/mcm_search/greedy.cpp       | 13 +++++++------
 src/utilities/partition.cpp            | 26 --------------------------
 11 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/include/data/dataset.h b/include/data/dataset.h
index 2c7a21c..653460a 100644
--- a/include/data/dataset.h
+++ b/include/data/dataset.h
@@ -31,6 +31,14 @@ class Data {
      */
     Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>& _dataset, int n_var, int n_states, int n_samples);

+    /**
+     * Change the number of assumed datapoints in the dataset.
+     * This can be changed to perform an analysis of the dataset under the assumption that it is larger or smaller.
+     *
+     * @param n_datapoints The new number of assumed datapoints in the dataset.
+     */
+    void set_N_assumed(int n_datapoints);
+
     /**
      * Calculate the entropy of the dataset.
      *
@@ -128,6 +136,7 @@ class Data {
     int N; // Number of datapoints
     int N_unique; // Number of different datapoints
     int n_ints; // Number of 128bit integers necessary to represent the data
+    int N_assumed; // The assumed number of datapoints in the dataset

     std::vector<__uint128_t> pow_q; // Vector containing the first n powers of q used to speed up the calculation of the evidence
 };
diff --git a/mcmpy/include/py_dataset.h b/mcmpy/include/py_dataset.h
index c4ca24f..f138340 100644
--- a/mcmpy/include/py_dataset.h
+++ b/mcmpy/include/py_dataset.h
@@ -65,9 +65,12 @@ class PyData {

     int get_n() {return this->data.n;};
     int get_N() {return this->data.N;};
+    int get_N_assumed() {return this->data.N_assumed;};
     int get_N_unique() {return this->data.N_unique;};
     int get_q() {return this->data.q;};

+    void set_N_assumed(int n_datapoints) {this->data.set_N_assumed(n_datapoints);};
+
     Data data;
 };
diff --git a/mcmpy/src/py_dataset.cpp b/mcmpy/src/py_dataset.cpp
index de74447..8ecaf11 100644
--- a/mcmpy/src/py_dataset.cpp
+++ b/mcmpy/src/py_dataset.cpp
@@ -187,8 +187,10 @@ void bind_data_class(py::module &m) {
         .def("entropy", &PyData::entropy, py::arg("base") = -1)
         .def("entropy_of_spin_operator", &PyData::entropy_of_spin_op, py::arg("spin_op"))
+
         .def_property_readonly("n", &PyData::get_n)
         .def_property_readonly("q", &PyData::get_q)
         .def_property_readonly("N", &PyData::get_N)
-        .def_property_readonly("N_unique", &PyData::get_N_unique);
+        .def_property_readonly("N_unique", &PyData::get_N_unique)
+        .def_property("N_assumed", &PyData::get_N_assumed, &PyData::set_N_assumed);
 }
diff --git a/src/data/complexity.cpp b/src/data/complexity.cpp
index 26aad50..5562997 100644
--- a/src/data/complexity.cpp
+++ b/src/data/complexity.cpp
@@ -22,7 +22,7 @@ double Data::calc_param_complexity_icc(__uint128_t component){
     // Determine the size of the component
     int r = bit_count(component);
     __uint128_t n_param = this->pow_q[r] - 1;
-    return (n_param / 2.) * log(this->N / (2 * M_PI));
+    return (n_param / 2.) * log(this->N_assumed / (2 * M_PI));
 }

 double Data::calc_param_complexity(std::vector<__uint128_t>& partition){
diff --git a/src/data/dataset.cpp b/src/data/dataset.cpp
index 36158fb..b95dbb9 100644
--- a/src/data/dataset.cpp
+++ b/src/data/dataset.cpp
@@ -17,6 +17,7 @@ Data::Data(const std::string& filename, int n_var, int n_states){
     this->n_ints = ceil(log2(q));
     // Process the dataset
     this->N = processing(filename, n_var, n_ints, n_states, dataset);
+    this->N_assumed = this->N;
     this->N_unique = dataset.size();

     // Precompute the powers of q to speed up the calculation of the evidence (q^r)
@@ -34,6 +35,7 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
     this->q = n_states;
     this->dataset = _dataset;
     this->N = n_samples;
+    this->N_assumed = this->N;
     this->N_unique = _dataset.size();

     // Calculate the number of integers necessary to represent the data
@@ -48,6 +50,14 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
     }
 }

+void Data::set_N_assumed(int n_datapoints){
+    // Check if the given number of datapoints is valid
+    if (n_datapoints < 1){
+        throw std::invalid_argument("The number of datapoints should be a positive number.");
+    }
+    this->N_assumed = n_datapoints;
+}
+
 double Data::entropy(int base){
     // Default base is q
     if (base == -1){
diff --git a/src/data/evidence.cpp b/src/data/evidence.cpp
index 73905d5..1d6aef4 100644
--- a/src/data/evidence.cpp
+++ b/src/data/evidence.cpp
@@ -5,21 +5,22 @@ double Data::calc_log_ev_icc(__uint128_t component){
     double log_evidence = 0;
     // Determine the size of the component
     int r = bit_count(component);
+    double alpha = this->N_assumed / this->N;
     // Contributions from the datapoint frequencies
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin();
     while (count_iter != counts.end()){
-        log_evidence += (lgamma(count_iter->second + 0.5) - 0.5 * log(M_PI));
+        log_evidence += (lgamma(alpha * count_iter->second + 0.5) - 0.5 * log(M_PI));
         ++count_iter;
     }
     // Calculate prefactor
     if (r > 25){
         // Approximate for large components because lgamma overflows
-        log_evidence -= (r * log(this->q) * this->N);
+        log_evidence -= (r * log(this->q) * this->N_assumed);
     }
     else{
-        log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N + pow_q[r]/2.);
+        log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N_assumed + pow_q[r]/2.);
     }

     return log_evidence;
@@ -37,7 +38,7 @@ double Data::calc_log_ev(std::vector<__uint128_t>& partition){
         }
     }
     // Contribution from variables not in the model
-    log_ev -= this->N * (this->n - r) * log(this->q);
+    log_ev -= this->N_assumed * (this->n - r) * log(this->q);

     return log_ev;
 }
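For review, the quantity the patched calc_log_ev_icc computes, written out (this is a reading of the diff above, not wording taken from the repository): with N the actual number of datapoints, N' = N_assumed, alpha = N'/N, r the number of variables in the component, and k_s the observed count of component state s,

    \log E_{\mathrm{icc}} = \sum_{s}\Big[\ln\Gamma\big(\alpha k_s + \tfrac{1}{2}\big) - \tfrac{1}{2}\ln\pi\Big] + \ln\Gamma\Big(\frac{q^r}{2}\Big) - \ln\Gamma\Big(N' + \frac{q^r}{2}\Big),

where for r > 25 the last two terms are replaced by the overflow-safe approximation -r N' \ln q. Setting N' = N recovers the unpatched expression.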
diff --git a/src/data/likelihood.cpp b/src/data/likelihood.cpp
index e61d62c..e5b5e33 100644
--- a/src/data/likelihood.cpp
+++ b/src/data/likelihood.cpp
@@ -4,13 +4,14 @@ double Data::calc_log_likelihood_icc(__uint128_t component){
     double log_likelihood = 0;
     double N_datapoints = this->N;
+    double alpha = this->N_assumed / this->N;
     // Determine the size of the component
     int r = bit_count(component);
     // Get the datapoint frequencies
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin();
     while (count_iter != counts.end()){
-        log_likelihood += count_iter->second * log(count_iter->second / N_datapoints);
+        log_likelihood += alpha * count_iter->second * log(count_iter->second / N_datapoints);
         ++count_iter;
     }
     return log_likelihood;
@@ -28,7 +29,7 @@ double Data::calc_log_likelihood(std::vector<__uint128_t>& partition){
         }
     }
     // Contribution from variables not in the model
-    log_likelihood -= this->N * (this->n - r) * log(this->q);
+    log_likelihood -= this->N_assumed * (this->n - r) * log(this->q);

     return log_likelihood;
 }
\ No newline at end of file
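The rescaled log-likelihood above has a similarly compact form (again a reading of the diff; note that the empirical frequencies k_s/N are deliberately left unscaled):

    \log \mathcal{L}_{\mathrm{icc}} = \alpha \sum_{s} k_s \ln\frac{k_s}{N} = N' \sum_{s} \hat{p}_s \ln \hat{p}_s, \qquad \hat{p}_s = \frac{k_s}{N},

i.e. the maximum log-likelihood that N' datapoints would attain if they reproduced the observed frequencies exactly.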
diff --git a/src/search/mcm_search/annealing.cpp b/src/search/mcm_search/annealing.cpp
index 80e4ba7..ddced3a 100644
--- a/src/search/mcm_search/annealing.cpp
+++ b/src/search/mcm_search/annealing.cpp
@@ -51,15 +51,16 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
+        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, mcm_tmp, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, mcm_tmp, data.N_assumed, data.q);
         *this->output_file << "\n";

         *this->output_file << "Start annealing \n";
@@ -114,7 +115,7 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n

                 // Write to output file
                 if (this->output_file){
-                    *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N * log(this->data->q)) << "\n";
+                    *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_assumed * log(this->data->q)) << "\n";
                 }
             }
             else{steps_since_improve++;}
@@ -161,10 +162,10 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
diff --git a/src/search/mcm_search/div_and_conq.cpp b/src/search/mcm_search/div_and_conq.cpp
index 96966b8..5e5e441 100644
--- a/src/search/mcm_search/div_and_conq.cpp
+++ b/src/search/mcm_search/div_and_conq.cpp
@@ -50,15 +50,16 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
+        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";

         *this->output_file << "Start dividing \n";
@@ -83,10 +84,10 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
@@ -174,8 +175,7 @@ int MCMSearch::division(int move_from, int move_to){
     this->log_evidence_trajectory.push_back(this->mcm_out.log_ev);

     if (this->output_file){
-        std::cout << "Should only print with output file";
-        *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N * log(this->data->q)) << "\n";
+        *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_assumed * log(this->data->q)) << "\n";
         *this->output_file << "\t Component " << move_from << " : \t" + int_to_string(this->mcm_out.partition[move_from], this->mcm_out.n) << "\n";
         *this->output_file << "\t Component " << move_to << " : \t" + int_to_string(this->mcm_out.partition[move_to], this->mcm_out.n) << "\n\n";
     }
diff --git a/src/search/mcm_search/greedy.cpp b/src/search/mcm_search/greedy.cpp
index 6acb93f..2378003 100644
--- a/src/search/mcm_search/greedy.cpp
+++ b/src/search/mcm_search/greedy.cpp
@@ -49,15 +49,16 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
+        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";
     }
@@ -76,10 +77,10 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
@@ -150,7 +151,7 @@ void MCMSearch::hierarchical_merging(){

         // Write to the output file
         if (this->output_file){
-            *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N * log(this->data->q)) << "\n";
+            *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_assumed * log(this->data->q)) << "\n";
         }
     }
 }
diff --git a/src/utilities/partition.cpp b/src/utilities/partition.cpp
index 99e5569..675a29f 100644
--- a/src/utilities/partition.cpp
+++ b/src/utilities/partition.cpp
@@ -1,32 +1,6 @@
 #include "utilities/partition.h"
 #include "utilities/miscellaneous.h"

-/*
-std::vector<int> generate_random_partition(int n){
-    // Check if the number of variables is a valid number
-    if (n > 128){
-        throw std::domain_error("The maximum system size is 128 variables.");
-    }
-    if (n < 0){
-        throw std::domain_error("The number of variables should be a positive number.");
-    }
-
-    std::vector<int> growth_string(n, 0);
-
-    int index;
-    int max_index = 2;
-    for (int i = 1; i < n; i++){
-        index = rand()/(RAND_MAX/max_index);
-        growth_string[i] = index;
-
-        if (index == (max_index-1)){
-            max_index += 1;
-        }
-    }
-
-    return growth_string;
-}*/
-
 std::vector<__uint128_t> generate_random_partition(int n){
     // Check if the number of variables is a valid number
     if (n > 128){
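With patch 1 applied, the intended usage from C++ looks roughly like the sketch below. This is not part of the patch: the file name is a placeholder, the singleton partition is one arbitrary choice, and it assumes the members and calculation methods used here are publicly accessible, as the Python bindings suggest.

    #include "data/dataset.h"

    int main(){
        // 10 binary variables; "sample.dat" is a hypothetical input file
        Data data("sample.dat", 10, 2);

        // Independent model: one singleton component per variable, each
        // component encoded as a bitmask (bit i set = variable i included)
        std::vector<__uint128_t> partition;
        for (int i = 0; i < 10; i++){
            partition.push_back(((__uint128_t) 1) << i);
        }

        double log_ev = data.calc_log_ev(partition);

        // Redo the analysis as if the dataset were ten times larger
        data.set_N_assumed(10 * data.N);
        double log_ev_scaled = data.calc_log_ev(partition);
        return 0;
    }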
best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_assumed * log(this->data->q)) << "\n"; } } } diff --git a/src/utilities/partition.cpp b/src/utilities/partition.cpp index 99e5569..675a29f 100644 --- a/src/utilities/partition.cpp +++ b/src/utilities/partition.cpp @@ -1,32 +1,6 @@ #include "utilities/partition.h" #include "utilities/miscellaneous.h" -/* -std::vector generate_random_partition(int n){ - // Check if the number of variables is a valid number - if (n > 128){ - throw std::domain_error("The maximum system size is 128 variables."); - } - if (n < 0){ - throw std::domain_error("The number of variables should be a positive number."); - } - - std::vector growth_string(n, 0); - - int index; - int max_index = 2; - for (int i = 1; i < n; i++){ - index = rand()/(RAND_MAX/max_index); - growth_string[i] = index; - - if (index == (max_index-1)){ - max_index += 1; - } - } - - return growth_string; -}*/ - std::vector<__uint128_t> generate_random_partition(int n){ // Check if the number of variables is a valid number if (n > 128){ From c96a71a881d6953ed62383f0faaf50bbbe86b8c5 Mon Sep 17 00:00:00 2001 From: AaronDC60 Date: Fri, 27 Jun 2025 17:52:04 +0200 Subject: [PATCH 2/4] Update documentation --- docs/api/data.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/api/data.rst b/docs/api/data.rst index 85bbaee..28c8f2e 100644 --- a/docs/api/data.rst +++ b/docs/api/data.rst @@ -240,6 +240,13 @@ to infer either the best minimally complex (MCM) or the optimal basis representa The actual number of datapoints in the dataset (read-only). + .. py:attribute:: N_assumed + :type: int + + The assumed number of datapoints in the dataset. + + This attribute can be changed to perform an analysis of the dataset under the assumption that it is either larger or smaller. + .. py:attribute:: N_unique :type: int From 49b08534280c0cc4247bdf48bc2f32040d31ae27 Mon Sep 17 00:00:00 2001 From: AaronDC60 Date: Fri, 27 Jun 2025 19:05:54 +0200 Subject: [PATCH 3/4] Change variable name --- docs/api/data.rst | 6 +++--- include/data/dataset.h | 10 +++++----- mcmpy/include/py_dataset.h | 4 ++-- mcmpy/src/py_dataset.cpp | 2 +- src/data/complexity.cpp | 2 +- src/data/dataset.cpp | 8 ++++---- src/data/evidence.cpp | 8 ++++---- src/data/likelihood.cpp | 4 ++-- src/search/mcm_search/annealing.cpp | 14 +++++++------- src/search/mcm_search/div_and_conq.cpp | 14 +++++++------- src/search/mcm_search/greedy.cpp | 14 +++++++------- 11 files changed, 43 insertions(+), 43 deletions(-) diff --git a/docs/api/data.rst b/docs/api/data.rst index 28c8f2e..2630757 100644 --- a/docs/api/data.rst +++ b/docs/api/data.rst @@ -240,12 +240,12 @@ to infer either the best minimally complex (MCM) or the optimal basis representa The actual number of datapoints in the dataset (read-only). - .. py:attribute:: N_assumed + .. py:attribute:: N_synthetic :type: int - The assumed number of datapoints in the dataset. + The synthetic number of datapoints in the dataset. - This attribute can be changed to perform an analysis of the dataset under the assumption that it is either larger or smaller. + This attribute can be changed to perform an analysis of the dataset as if it is either larger or smaller. .. 
From 49b08534280c0cc4247bdf48bc2f32040d31ae27 Mon Sep 17 00:00:00 2001
From: AaronDC60
Date: Fri, 27 Jun 2025 19:05:54 +0200
Subject: [PATCH 3/4] Change variable name

---
 docs/api/data.rst                      |  6 +++---
 include/data/dataset.h                 | 10 +++++-----
 mcmpy/include/py_dataset.h             |  4 ++--
 mcmpy/src/py_dataset.cpp               |  2 +-
 src/data/complexity.cpp                |  2 +-
 src/data/dataset.cpp                   |  8 ++++----
 src/data/evidence.cpp                  |  8 ++++----
 src/data/likelihood.cpp                |  4 ++--
 src/search/mcm_search/annealing.cpp    | 14 +++++++-------
 src/search/mcm_search/div_and_conq.cpp | 14 +++++++-------
 src/search/mcm_search/greedy.cpp       | 14 +++++++-------
 11 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/docs/api/data.rst b/docs/api/data.rst
index 28c8f2e..2630757 100644
--- a/docs/api/data.rst
+++ b/docs/api/data.rst
@@ -240,12 +240,12 @@ to infer either the best minimally complex (MCM) or the optimal basis representa

         The actual number of datapoints in the dataset (read-only).

-    .. py:attribute:: N_assumed
+    .. py:attribute:: N_synthetic
         :type: int

-        The assumed number of datapoints in the dataset.
+        The synthetic number of datapoints in the dataset.

-        This attribute can be changed to perform an analysis of the dataset under the assumption that it is either larger or smaller.
+        This attribute can be changed to perform an analysis of the dataset as if it is either larger or smaller.

     .. py:attribute:: N_unique
         :type: int
diff --git a/include/data/dataset.h b/include/data/dataset.h
index 653460a..c8cd02b 100644
--- a/include/data/dataset.h
+++ b/include/data/dataset.h
@@ -32,12 +32,12 @@ Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>& _dataset, int n_var, int n_states, int n_samples);

     /**
-     * Change the number of assumed datapoints in the dataset.
-     * This can be changed to perform an analysis of the dataset under the assumption that it is larger or smaller.
+     * Change the value for the number of datapoints in the dataset that is used for analysis.
+     * This can be changed to perform an analysis of the dataset as if it is larger or smaller.
      *
-     * @param n_datapoints The new number of assumed datapoints in the dataset.
+     * @param n_datapoints The synthetic number of datapoints in the dataset.
      */
-    void set_N_assumed(int n_datapoints);
+    void set_N_synthetic(int n_datapoints);

     /**
      * Calculate the entropy of the dataset.
@@ -136,7 +136,7 @@ class Data {
     int N; // Number of datapoints
     int N_unique; // Number of different datapoints
     int n_ints; // Number of 128bit integers necessary to represent the data
-    int N_assumed; // The assumed number of datapoints in the dataset
+    int N_synthetic; // The synthetic number of datapoints in the dataset

     std::vector<__uint128_t> pow_q; // Vector containing the first n powers of q used to speed up the calculation of the evidence
 };
diff --git a/mcmpy/include/py_dataset.h b/mcmpy/include/py_dataset.h
index f138340..e3a5648 100644
--- a/mcmpy/include/py_dataset.h
+++ b/mcmpy/include/py_dataset.h
@@ -65,11 +65,11 @@ class PyData {

     int get_n() {return this->data.n;};
     int get_N() {return this->data.N;};
-    int get_N_assumed() {return this->data.N_assumed;};
+    int get_N_synthetic() {return this->data.N_synthetic;};
     int get_N_unique() {return this->data.N_unique;};
     int get_q() {return this->data.q;};

-    void set_N_assumed(int n_datapoints) {this->data.set_N_assumed(n_datapoints);};
+    void set_N_synthetic(int n_datapoints) {this->data.set_N_synthetic(n_datapoints);};

     Data data;
 };
diff --git a/mcmpy/src/py_dataset.cpp b/mcmpy/src/py_dataset.cpp
index 8ecaf11..ade3dd7 100644
--- a/mcmpy/src/py_dataset.cpp
+++ b/mcmpy/src/py_dataset.cpp
@@ -192,5 +192,5 @@ void bind_data_class(py::module &m) {
         .def_property_readonly("q", &PyData::get_q)
         .def_property_readonly("N", &PyData::get_N)
         .def_property_readonly("N_unique", &PyData::get_N_unique)
-        .def_property("N_assumed", &PyData::get_N_assumed, &PyData::set_N_assumed);
+        .def_property("N_synthetic", &PyData::get_N_synthetic, &PyData::set_N_synthetic);
 }
diff --git a/src/data/complexity.cpp b/src/data/complexity.cpp
index 5562997..dbbf3f0 100644
--- a/src/data/complexity.cpp
+++ b/src/data/complexity.cpp
@@ -22,7 +22,7 @@ double Data::calc_param_complexity_icc(__uint128_t component){
     // Determine the size of the component
     int r = bit_count(component);
     __uint128_t n_param = this->pow_q[r] - 1;
-    return (n_param / 2.) * log(this->N_assumed / (2 * M_PI));
+    return (n_param / 2.) * log(this->N_synthetic / (2 * M_PI));
 }

 double Data::calc_param_complexity(std::vector<__uint128_t>& partition){
diff --git a/src/data/dataset.cpp b/src/data/dataset.cpp
index b95dbb9..37d35f4 100644
--- a/src/data/dataset.cpp
+++ b/src/data/dataset.cpp
@@ -17,7 +17,7 @@ Data::Data(const std::string& filename, int n_var, int n_states){
     this->n_ints = ceil(log2(q));
     // Process the dataset
     this->N = processing(filename, n_var, n_ints, n_states, dataset);
-    this->N_assumed = this->N;
+    this->N_synthetic = this->N;
     this->N_unique = dataset.size();

     // Precompute the powers of q to speed up the calculation of the evidence (q^r)
@@ -35,7 +35,7 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
     this->q = n_states;
     this->dataset = _dataset;
     this->N = n_samples;
-    this->N_assumed = this->N;
+    this->N_synthetic = this->N;
     this->N_unique = _dataset.size();

     // Calculate the number of integers necessary to represent the data
@@ -50,12 +50,12 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
     }
 }

-void Data::set_N_assumed(int n_datapoints){
+void Data::set_N_synthetic(int n_datapoints){
     // Check if the given number of datapoints is valid
     if (n_datapoints < 1){
         throw std::invalid_argument("The number of datapoints should be a positive number.");
     }
-    this->N_assumed = n_datapoints;
+    this->N_synthetic = n_datapoints;
 }

 double Data::entropy(int base){
diff --git a/src/data/evidence.cpp b/src/data/evidence.cpp
index 1d6aef4..2f16a65 100644
--- a/src/data/evidence.cpp
+++ b/src/data/evidence.cpp
@@ -5,7 +5,7 @@ double Data::calc_log_ev_icc(__uint128_t component){
     double log_evidence = 0;
     // Determine the size of the component
     int r = bit_count(component);
-    double alpha = this->N_assumed / this->N;
+    double alpha = this->N_synthetic / this->N;
     // Contributions from the datapoint frequencies
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin();
@@ -17,10 +17,10 @@ double Data::calc_log_ev_icc(__uint128_t component){
     // Calculate prefactor
     if (r > 25){
         // Approximate for large components because lgamma overflows
-        log_evidence -= (r * log(this->q) * this->N_assumed);
+        log_evidence -= (r * log(this->q) * this->N_synthetic);
     }
     else{
-        log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N_assumed + pow_q[r]/2.);
+        log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N_synthetic + pow_q[r]/2.);
     }

     return log_evidence;
@@ -38,7 +38,7 @@ double Data::calc_log_ev(std::vector<__uint128_t>& partition){
         }
     }
     // Contribution from variables not in the model
-    log_ev -= this->N_assumed * (this->n - r) * log(this->q);
+    log_ev -= this->N_synthetic * (this->n - r) * log(this->q);

     return log_ev;
 }
diff --git a/src/data/likelihood.cpp b/src/data/likelihood.cpp
index e5b5e33..d1bfafb 100644
--- a/src/data/likelihood.cpp
+++ b/src/data/likelihood.cpp
@@ -4,7 +4,7 @@ double Data::calc_log_likelihood_icc(__uint128_t component){
     double log_likelihood = 0;
     double N_datapoints = this->N;
-    double alpha = this->N_assumed / this->N;
+    double alpha = this->N_synthetic / this->N;
     // Determine the size of the component
     int r = bit_count(component);
     // Get the datapoint frequencies
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
@@ -29,7 +29,7 @@ double Data::calc_log_likelihood(std::vector<__uint128_t>& partition){
         }
     }
     // Contribution from variables not in the model
-    log_likelihood -= this->N_assumed * (this->n - r) * log(this->q);
+    log_likelihood -= this->N_synthetic * (this->n - r) * log(this->q);

     return log_likelihood;
 }
\ No newline at end of file
diff --git a/src/search/mcm_search/annealing.cpp b/src/search/mcm_search/annealing.cpp
index ddced3a..26b669d 100644
--- a/src/search/mcm_search/annealing.cpp
+++ b/src/search/mcm_search/annealing.cpp
@@ -51,16 +51,16 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
-        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
+        *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, mcm_tmp, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, mcm_tmp, data.N_synthetic, data.q);
         *this->output_file << "\n";

         *this->output_file << "Start annealing \n";
@@ -115,7 +115,7 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n

                 // Write to output file
                 if (this->output_file){
-                    *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_assumed * log(this->data->q)) << "\n";
+                    *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n";
                 }
             }
             else{steps_since_improve++;}
@@ -162,10 +162,10 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
diff --git a/src/search/mcm_search/div_and_conq.cpp b/src/search/mcm_search/div_and_conq.cpp
index 5e5e441..361f987 100644
--- a/src/search/mcm_search/div_and_conq.cpp
+++ b/src/search/mcm_search/div_and_conq.cpp
@@ -50,16 +50,16 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
-        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
+        *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";

         *this->output_file << "Start dividing \n";
@@ -84,10 +84,10 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
@@ -175,7 +175,7 @@ int MCMSearch::division(int move_from, int move_to){
     this->log_evidence_trajectory.push_back(this->mcm_out.log_ev);

     if (this->output_file){
-        *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_assumed * log(this->data->q)) << "\n";
+        *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n";
         *this->output_file << "\t Component " << move_from << " : \t" + int_to_string(this->mcm_out.partition[move_from], this->mcm_out.n) << "\n";
         *this->output_file << "\t Component " << move_to << " : \t" + int_to_string(this->mcm_out.partition[move_to], this->mcm_out.n) << "\n\n";
     }
diff --git a/src/search/mcm_search/greedy.cpp b/src/search/mcm_search/greedy.cpp
index 2378003..a176b94 100644
--- a/src/search/mcm_search/greedy.cpp
+++ b/src/search/mcm_search/greedy.cpp
@@ -49,16 +49,16 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
-        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
+        *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";
     }
@@ -77,10 +77,10 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
@@ -151,7 +151,7 @@ void MCMSearch::hierarchical_merging(){

         // Write to the output file
         if (this->output_file){
-            *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_assumed * log(this->data->q)) << "\n";
+            *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_synthetic * log(this->data->q)) << "\n";
         }
     }
 }
log(data.q)) << " q-its/datapoint \n"; - *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n"; + *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; this->output_file.reset(); } diff --git a/src/search/mcm_search/div_and_conq.cpp b/src/search/mcm_search/div_and_conq.cpp index 5e5e441..361f987 100644 --- a/src/search/mcm_search/div_and_conq.cpp +++ b/src/search/mcm_search/div_and_conq.cpp @@ -50,16 +50,16 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na *this->output_file << "Number of variables: " << data.n << "\n"; *this->output_file << "Number of states per variable: " << data.q << "\n"; *this->output_file << "Number of datapoints: " << data.N << "\n"; - *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n"; + *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n"; *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n"; *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n"; *this->output_file << "Initial partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; *this->output_file << "Start dividing \n"; @@ -84,10 +84,10 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na *this->output_file << "\nFinal partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n"; - *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n"; + *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; this->output_file.reset(); } @@ -175,7 +175,7 @@ int MCMSearch::division(int move_from, int move_to){ this->log_evidence_trajectory.push_back(this->mcm_out.log_ev); if (this->output_file){ - 
*this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_assumed * log(this->data->q)) << "\n"; + *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n"; *this->output_file << "\t Component " << move_from << " : \t" + int_to_string(this->mcm_out.partition[move_from], this->mcm_out.n) << "\n"; *this->output_file << "\t Component " << move_to << " : \t" + int_to_string(this->mcm_out.partition[move_to], this->mcm_out.n) << "\n\n"; } diff --git a/src/search/mcm_search/greedy.cpp b/src/search/mcm_search/greedy.cpp index 2378003..a176b94 100644 --- a/src/search/mcm_search/greedy.cpp +++ b/src/search/mcm_search/greedy.cpp @@ -49,16 +49,16 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){ *this->output_file << "Number of variables: " << data.n << "\n"; *this->output_file << "Number of states per variable: " << data.q << "\n"; *this->output_file << "Number of datapoints: " << data.N << "\n"; - *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n"; + *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n"; *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n"; *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n"; *this->output_file << "Initial partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; } @@ -77,10 +77,10 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){ *this->output_file << "\nFinal partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n"; - *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n"; + *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; this->output_file.reset(); } @@ -151,7 +151,7 @@ void MCMSearch::hierarchical_merging(){ // Write to the output file if (this->output_file){ - *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_assumed * log(this->data->q)) << "\n"; + *this->output_file << "Merging components " << best_i 
<< " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_synthetic * log(this->data->q)) << "\n"; } } } From 7397dd90fe25f4035aba6569725b8f65c8daad4b Mon Sep 17 00:00:00 2001 From: AaronDC60 Date: Fri, 27 Jun 2025 19:18:01 +0200 Subject: [PATCH 4/4] Fix calculation of alpha --- src/data/evidence.cpp | 3 ++- src/data/likelihood.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/data/evidence.cpp b/src/data/evidence.cpp index 2f16a65..87201c7 100644 --- a/src/data/evidence.cpp +++ b/src/data/evidence.cpp @@ -5,7 +5,8 @@ double Data::calc_log_ev_icc(__uint128_t component){ double log_evidence = 0; // Determine the size of the component int r = bit_count(component); - double alpha = this->N_synthetic / this->N; + double N_syn = this->N_synthetic; + double alpha = N_syn / this->N; // Contributions from the datapoint frequencies std::unordered_map, unsigned int, HashVector128> counts = build_histogram(*this, component); std::unordered_map, unsigned int, HashVector128>::iterator count_iter = counts.begin(); diff --git a/src/data/likelihood.cpp b/src/data/likelihood.cpp index d1bfafb..fa0187e 100644 --- a/src/data/likelihood.cpp +++ b/src/data/likelihood.cpp @@ -4,7 +4,7 @@ double Data::calc_log_likelihood_icc(__uint128_t component){ double log_likelihood = 0; double N_datapoints = this->N; - double alpha = this->N_synthetic / this->N; + double alpha = this->N_synthetic / N_datapoints; // Determine the size of the component int r = bit_count(component); // Get the datapoint frequencies