Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/api/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,13 @@ to infer either the best minimally complex (MCM) or the optimal basis representa

The actual number of datapoints in the dataset (read-only).

.. py:attribute:: N_synthetic
:type: int

The synthetic number of datapoints in the dataset.

This attribute can be changed to perform an analysis of the dataset as if it were either larger or smaller than the actual number of datapoints.

.. py:attribute:: N_unique
:type: int

Expand Down
9 changes: 9 additions & 0 deletions include/data/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ class Data {
*/
Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>& _dataset, int n_var, int n_states, int n_samples);

/**
* Set the synthetic number of datapoints that is used when analysing the dataset.
* This value can be adjusted to perform an analysis of the dataset as if it were larger or smaller than it actually is.
*
* @param n_datapoints The synthetic number of datapoints in the dataset.
*/
void set_N_synthetic(int n_datapoints);

/**
* Calculate the entropy of the dataset.
*
Expand Down Expand Up @@ -128,6 +136,7 @@ class Data {
int N; // Number of datapoints
int N_unique; // Number of different datapoints
int n_ints; // Number of 128bit integers necessary to represent the data
int N_synthetic; // The synthetic number of datapoints in the dataset

std::vector<__uint128_t> pow_q; // Vector containing the first n powers of q used to speed up the calculation of the evidence
};
3 changes: 3 additions & 0 deletions mcmpy/include/py_dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,12 @@ class PyData {

int get_n() {return this->data.n;};
int get_N() {return this->data.N;};
int get_N_synthetic() {return this->data.N_synthetic;};
int get_N_unique() {return this->data.N_unique;};
int get_q() {return this->data.q;};

void set_N_synthetic(int n_datapoints) {this->data.set_N_synthetic(n_datapoints);};

Data data;
};

Expand Down
4 changes: 3 additions & 1 deletion mcmpy/src/py_dataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,10 @@ void bind_data_class(py::module &m) {

.def("entropy", &PyData::entropy, py::arg("base") = -1)
.def("entropy_of_spin_operator", &PyData::entropy_of_spin_op, py::arg("spin_op"))

.def_property_readonly("n", &PyData::get_n)
.def_property_readonly("q", &PyData::get_q)
.def_property_readonly("N", &PyData::get_N)
.def_property_readonly("N_unique", &PyData::get_N_unique);
.def_property_readonly("N_unique", &PyData::get_N_unique)
.def_property("N_synthetic", &PyData::get_N_synthetic, &PyData::set_N_synthetic);
}
2 changes: 1 addition & 1 deletion src/data/complexity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ double Data::calc_param_complexity_icc(__uint128_t component){
// Determine the size of the component
int r = bit_count(component);
__uint128_t n_param = this->pow_q[r] - 1;
return (n_param / 2.) * log(this->N / (2 * M_PI));
return (n_param / 2.) * log(this->N_synthetic / (2 * M_PI));
}

double Data::calc_param_complexity(std::vector<__uint128_t>& partition){
Expand Down
10 changes: 10 additions & 0 deletions src/data/dataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Data::Data(const std::string& filename, int n_var, int n_states){
this->n_ints = ceil(log2(q));
// Process the dataset
this->N = processing(filename, n_var, n_ints, n_states, dataset);
this->N_synthetic = this->N;
this->N_unique = dataset.size();

// Precompute the powers of q to speed up the calculation of the evidence (q^r)
Expand All @@ -34,6 +35,7 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
this->q = n_states;
this->dataset = _dataset;
this->N = n_samples;
this->N_synthetic = this->N;
this->N_unique = _dataset.size();

// Calculate the number of integers necessary to represent the data
Expand All @@ -48,6 +50,14 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
}
}

void Data::set_N_synthetic(int n_datapoints){
    // Reject non-positive values: an analysis needs at least one datapoint.
    if (n_datapoints <= 0){
        throw std::invalid_argument("The number of datapoints should be a positive number.");
    }
    this->N_synthetic = n_datapoints;
}

double Data::entropy(int base){
// Default base is q
if (base == -1){
Expand Down
10 changes: 6 additions & 4 deletions src/data/evidence.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,23 @@ double Data::calc_log_ev_icc(__uint128_t component){
double log_evidence = 0;
// Determine the size of the component
int r = bit_count(component);
double N_syn = this->N_synthetic;
double alpha = N_syn / this->N;
// Contributions from the datapoint frequencies
std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin();
while (count_iter != counts.end()){
log_evidence += (lgamma(count_iter->second + 0.5) - 0.5 * log(M_PI));
log_evidence += (lgamma(alpha * count_iter->second + 0.5) - 0.5 * log(M_PI));
++count_iter;
}

// Calculate prefactor
if (r > 25){
// Approximate for large components because lgamma overflows
log_evidence -= (r * log(this->q) * this->N);
log_evidence -= (r * log(this->q) * this->N_synthetic);
}
else{
log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N + pow_q[r]/2.);
log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N_synthetic + pow_q[r]/2.);
}

return log_evidence;
Expand All @@ -37,7 +39,7 @@ double Data::calc_log_ev(std::vector<__uint128_t>& partition){
}
}
// Contribution from variables not in the model
log_ev -= this->N * (this->n - r) * log(this->q);
log_ev -= this->N_synthetic * (this->n - r) * log(this->q);

return log_ev;
}
5 changes: 3 additions & 2 deletions src/data/likelihood.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
double Data::calc_log_likelihood_icc(__uint128_t component){
double log_likelihood = 0;
double N_datapoints = this->N;
double alpha = this->N_synthetic / N_datapoints;
// Determine the size of the component
int r = bit_count(component);
// Get the datapoint frequencies
std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin();
while (count_iter != counts.end()){
log_likelihood += count_iter->second * log(count_iter->second / N_datapoints);
log_likelihood += alpha * count_iter->second * log(count_iter->second / N_datapoints);
++count_iter;
}
return log_likelihood;
Expand All @@ -28,7 +29,7 @@ double Data::calc_log_likelihood(std::vector<__uint128_t>& partition){
}
}
// Contribution from variables not in the model
log_likelihood -= this->N * (this->n - r) * log(this->q);
log_likelihood -= this->N_synthetic * (this->n - r) * log(this->q);

return log_likelihood;
}
13 changes: 7 additions & 6 deletions src/search/mcm_search/annealing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,16 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
*this->output_file << "Number of variables: " << data.n << "\n";
*this->output_file << "Number of states per variable: " << data.q << "\n";
*this->output_file << "Number of datapoints: " << data.N << "\n";
*this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
*this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
*this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

*this->output_file << "Initial partition \n";
*this->output_file << "----------------- \n\n";

*this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
*this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

print_partition_details_to_file(*this->output_file, mcm_tmp, data.N, data.q);
print_partition_details_to_file(*this->output_file, mcm_tmp, data.N_synthetic, data.q);
*this->output_file << "\n";

*this->output_file << "Start annealing \n";
Expand Down Expand Up @@ -114,7 +115,7 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n

// Write to output file
if (this->output_file){
*this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N * log(this->data->q)) << "\n";
*this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n";
}
}
else{steps_since_improve++;}
Expand Down Expand Up @@ -161,10 +162,10 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
*this->output_file << "\nFinal partition \n";
*this->output_file << "----------------- \n\n";

*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
*this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
*this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
*this->output_file << "\n";
this->output_file.reset();
}
Expand Down
14 changes: 7 additions & 7 deletions src/search/mcm_search/div_and_conq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,16 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
*this->output_file << "Number of variables: " << data.n << "\n";
*this->output_file << "Number of states per variable: " << data.q << "\n";
*this->output_file << "Number of datapoints: " << data.N << "\n";
*this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
*this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
*this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

*this->output_file << "Initial partition \n";
*this->output_file << "----------------- \n\n";

*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
*this->output_file << "\n";

*this->output_file << "Start dividing \n";
Expand All @@ -83,10 +84,10 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
*this->output_file << "\nFinal partition \n";
*this->output_file << "----------------- \n\n";

*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
*this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
*this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
*this->output_file << "\n";
this->output_file.reset();
}
Expand Down Expand Up @@ -174,8 +175,7 @@ int MCMSearch::division(int move_from, int move_to){
this->log_evidence_trajectory.push_back(this->mcm_out.log_ev);

if (this->output_file){
std::cout << "Should only print with output file";
*this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N * log(this->data->q)) << "\n";
*this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n";
*this->output_file << "\t Component " << move_from << " : \t" + int_to_string(this->mcm_out.partition[move_from], this->mcm_out.n) << "\n";
*this->output_file << "\t Component " << move_to << " : \t" + int_to_string(this->mcm_out.partition[move_to], this->mcm_out.n) << "\n\n";
}
Expand Down
13 changes: 7 additions & 6 deletions src/search/mcm_search/greedy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,16 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
*this->output_file << "Number of variables: " << data.n << "\n";
*this->output_file << "Number of states per variable: " << data.q << "\n";
*this->output_file << "Number of datapoints: " << data.N << "\n";
*this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
*this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
*this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

*this->output_file << "Initial partition \n";
*this->output_file << "----------------- \n\n";

*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
*this->output_file << "\n";
}

Expand All @@ -76,10 +77,10 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
*this->output_file << "\nFinal partition \n";
*this->output_file << "----------------- \n\n";

*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
*this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
*this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
*this->output_file << "\n";
this->output_file.reset();
}
Expand Down Expand Up @@ -150,7 +151,7 @@ void MCMSearch::hierarchical_merging(){

// Write to the output file
if (this->output_file){
*this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N * log(this->data->q)) << "\n";
*this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_synthetic * log(this->data->q)) << "\n";
}
}
}
Expand Down
26 changes: 0 additions & 26 deletions src/utilities/partition.cpp
Original file line number Diff line number Diff line change
@@ -1,32 +1,6 @@
#include "utilities/partition.h"
#include "utilities/miscellaneous.h"

/*
std::vector<int> generate_random_partition(int n){
// Check if the number of variables is a valid number
if (n > 128){
throw std::domain_error("The maximum system size is 128 variables.");
}
if (n < 0){
throw std::domain_error("The number of variables should be a positive number.");
}

std::vector<int> growth_string(n, 0);

int index;
int max_index = 2;
for (int i = 1; i < n; i++){
index = rand()/(RAND_MAX/max_index);
growth_string[i] = index;

if (index == (max_index-1)){
max_index += 1;
}
}

return growth_string;
}*/

std::vector<__uint128_t> generate_random_partition(int n){
// Check if the number of variables is a valid number
if (n > 128){
Expand Down
Loading