From f6052920346bf266403e0c0295f553c3ccf76d26 Mon Sep 17 00:00:00 2001
From: AaronDC60
Date: Fri, 27 Jun 2025 17:35:57 +0200
Subject: [PATCH 1/4] Add option to change the number of datapoints

---
 include/data/dataset.h                 |  9 +++++++++
 mcmpy/include/py_dataset.h             |  3 +++
 mcmpy/src/py_dataset.cpp               |  4 +++-
 src/data/complexity.cpp                |  2 +-
 src/data/dataset.cpp                   | 10 ++++++++++
 src/data/evidence.cpp                  |  9 +++++----
 src/data/likelihood.cpp                |  5 +++--
 src/search/mcm_search/annealing.cpp    | 13 +++++++------
 src/search/mcm_search/div_and_conq.cpp | 14 +++++++-------
 src/search/mcm_search/greedy.cpp       | 13 +++++++------
 src/utilities/partition.cpp            | 26 --------------------------
 11 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/include/data/dataset.h b/include/data/dataset.h
index 2c7a21c..653460a 100644
--- a/include/data/dataset.h
+++ b/include/data/dataset.h
@@ -31,6 +31,14 @@ class Data {
      */
     Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>& _dataset, int n_var, int n_states, int n_samples);

+    /**
+     * Change the number of assumed datapoints in the dataset.
+     * This can be changed to perform an analysis of the dataset under the assumption that it is larger or smaller.
+     *
+     * @param n_datapoints The new number of assumed datapoints in the dataset.
+     */
+    void set_N_assumed(int n_datapoints);
+
     /**
      * Calculate the entropy of the dataset.
      *
@@ -128,6 +136,7 @@ class Data {
     int N; // Number of datapoints
     int N_unique; // Number of different datapoints
     int n_ints; // Number of 128bit integers necessary to represent the data
+    int N_assumed; // The assumed number of datapoints in the dataset

     std::vector<__uint128_t> pow_q; // Vector containing the first n powers of q used to speed up the calculation of the evidence
 };
diff --git a/mcmpy/include/py_dataset.h b/mcmpy/include/py_dataset.h
index c4ca24f..f138340 100644
--- a/mcmpy/include/py_dataset.h
+++ b/mcmpy/include/py_dataset.h
@@ -65,9 +65,12 @@ class PyData {

     int get_n() {return this->data.n;};
     int get_N() {return this->data.N;};
+    int get_N_assumed() {return this->data.N_assumed;};
     int get_N_unique() {return this->data.N_unique;};
     int get_q() {return this->data.q;};

+    void set_N_assumed(int n_datapoints) {this->data.set_N_assumed(n_datapoints);};
+
     Data data;
 };
diff --git a/mcmpy/src/py_dataset.cpp b/mcmpy/src/py_dataset.cpp
index de74447..8ecaf11 100644
--- a/mcmpy/src/py_dataset.cpp
+++ b/mcmpy/src/py_dataset.cpp
@@ -187,8 +187,10 @@ void bind_data_class(py::module &m) {
         .def("entropy", &PyData::entropy, py::arg("base") = -1)
         .def("entropy_of_spin_operator", &PyData::entropy_of_spin_op, py::arg("spin_op"))
+
         .def_property_readonly("n", &PyData::get_n)
         .def_property_readonly("q", &PyData::get_q)
         .def_property_readonly("N", &PyData::get_N)
-        .def_property_readonly("N_unique", &PyData::get_N_unique);
+        .def_property_readonly("N_unique", &PyData::get_N_unique)
+        .def_property("N_assumed", &PyData::get_N_assumed, &PyData::set_N_assumed);
 }
diff --git a/src/data/complexity.cpp b/src/data/complexity.cpp
index 26aad50..5562997 100644
--- a/src/data/complexity.cpp
+++ b/src/data/complexity.cpp
@@ -22,7 +22,7 @@ double Data::calc_param_complexity_icc(__uint128_t component){
     // Determine the size of the component
     int r = bit_count(component);
     __uint128_t n_param = this->pow_q[r] - 1;
-    return (n_param / 2.) * log(this->N / (2 * M_PI));
+    return (n_param / 2.) * log(this->N_assumed / (2 * M_PI));
 }

 double Data::calc_param_complexity(std::vector<__uint128_t>& partition){
diff --git a/src/data/dataset.cpp b/src/data/dataset.cpp
index 36158fb..b95dbb9 100644
--- a/src/data/dataset.cpp
+++ b/src/data/dataset.cpp
@@ -17,6 +17,7 @@ Data::Data(const std::string& filename, int n_var, int n_states){
     this->n_ints = ceil(log2(q));
     // Process the dataset
     this->N = processing(filename, n_var, n_ints, n_states, dataset);
+    this->N_assumed = this->N;
     this->N_unique = dataset.size();

     // Precompute the powers of q to speed up the calculation of the evidence (q^r)
@@ -34,6 +35,7 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
     this->q = n_states;
     this->dataset = _dataset;
     this->N = n_samples;
+    this->N_assumed = this->N;
     this->N_unique = _dataset.size();

     // Calculate the number of integers necessary to represent the data
@@ -48,6 +50,14 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
     }
 }

+void Data::set_N_assumed(int n_datapoints){
+    // Check if the given number of datapoints is valid
+    if (n_datapoints < 1){
+        throw std::invalid_argument("The number of datapoints should be a positive number.");
+    }
+    this->N_assumed = n_datapoints;
+}
+
 double Data::entropy(int base){
     // Default base is q
     if (base == -1){
diff --git a/src/data/evidence.cpp b/src/data/evidence.cpp
index 73905d5..1d6aef4 100644
--- a/src/data/evidence.cpp
+++ b/src/data/evidence.cpp
@@ -5,21 +5,22 @@ double Data::calc_log_ev_icc(__uint128_t component){
     double log_evidence = 0;
     // Determine the size of the component
     int r = bit_count(component);
+    double alpha = this->N_assumed / this->N;
     // Contributions from the datapoint frequencies
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin();
     while (count_iter != counts.end()){
-        log_evidence += (lgamma(count_iter->second + 0.5) - 0.5 * log(M_PI));
+        log_evidence += (lgamma(alpha * count_iter->second + 0.5) - 0.5 * log(M_PI));
         ++count_iter;
     }
     // Calculate prefactor
     if (r > 25){
         // Approximate for large components because lgamma overflows
-        log_evidence -= (r * log(this->q) * this->N);
+        log_evidence -= (r * log(this->q) * this->N_assumed);
     }
     else{
-        log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N + pow_q[r]/2.);
+        log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N_assumed + pow_q[r]/2.);
     }

     return log_evidence;
@@ -37,7 +38,7 @@ double Data::calc_log_ev(std::vector<__uint128_t>& partition){
         }
     }
     // Contribution from variables not in the model
-    log_ev -= this->N * (this->n - r) * log(this->q);
+    log_ev -= this->N_assumed * (this->n - r) * log(this->q);

     return log_ev;
 }
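For review, the quantity the patched calc_log_ev_icc computes, written out (this is a reading of the diff above, not wording taken from the repository): with N the actual number of datapoints, N' = N_assumed, alpha = N'/N, r the number of variables in the component, and k_s the observed count of component state s,

    \log E_{\mathrm{icc}} = \sum_{s}\Big[\ln\Gamma\big(\alpha k_s + \tfrac{1}{2}\big) - \tfrac{1}{2}\ln\pi\Big] + \ln\Gamma\Big(\frac{q^r}{2}\Big) - \ln\Gamma\Big(N' + \frac{q^r}{2}\Big),

where for r > 25 the last two terms are replaced by the overflow-safe approximation -r N' \ln q. Setting N' = N recovers the unpatched expression.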
diff --git a/src/data/likelihood.cpp b/src/data/likelihood.cpp
index e61d62c..e5b5e33 100644
--- a/src/data/likelihood.cpp
+++ b/src/data/likelihood.cpp
@@ -4,13 +4,14 @@ double Data::calc_log_likelihood_icc(__uint128_t component){
     double log_likelihood = 0;
     double N_datapoints = this->N;
+    double alpha = this->N_assumed / this->N;
     // Determine the size of the component
     int r = bit_count(component);
     // Get the datapoint frequencies
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin();
     while (count_iter != counts.end()){
-        log_likelihood += count_iter->second * log(count_iter->second / N_datapoints);
+        log_likelihood += alpha * count_iter->second * log(count_iter->second / N_datapoints);
         ++count_iter;
     }
     return log_likelihood;
@@ -28,7 +29,7 @@ double Data::calc_log_likelihood(std::vector<__uint128_t>& partition){
         }
     }
     // Contribution from variables not in the model
-    log_likelihood -= this->N * (this->n - r) * log(this->q);
+    log_likelihood -= this->N_assumed * (this->n - r) * log(this->q);

     return log_likelihood;
 }
\ No newline at end of file
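The rescaled log-likelihood above has a similarly compact form (again a reading of the diff; note that the empirical frequencies k_s/N are deliberately left unscaled):

    \log \mathcal{L}_{\mathrm{icc}} = \alpha \sum_{s} k_s \ln\frac{k_s}{N} = N' \sum_{s} \hat{p}_s \ln \hat{p}_s, \qquad \hat{p}_s = \frac{k_s}{N},

i.e. the maximum log-likelihood that N' datapoints would attain if they reproduced the observed frequencies exactly.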
diff --git a/src/search/mcm_search/annealing.cpp b/src/search/mcm_search/annealing.cpp
index 80e4ba7..ddced3a 100644
--- a/src/search/mcm_search/annealing.cpp
+++ b/src/search/mcm_search/annealing.cpp
@@ -51,15 +51,16 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
+        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, mcm_tmp, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, mcm_tmp, data.N_assumed, data.q);
         *this->output_file << "\n";

         *this->output_file << "Start annealing \n";
@@ -114,7 +115,7 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n

                 // Write to output file
                 if (this->output_file){
-                    *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N * log(this->data->q)) << "\n";
+                    *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_assumed * log(this->data->q)) << "\n";
                 }
             }
             else{steps_since_improve++;}
@@ -161,10 +162,10 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
diff --git a/src/search/mcm_search/div_and_conq.cpp b/src/search/mcm_search/div_and_conq.cpp
index 96966b8..5e5e441 100644
--- a/src/search/mcm_search/div_and_conq.cpp
+++ b/src/search/mcm_search/div_and_conq.cpp
@@ -50,15 +50,16 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
+        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";

         *this->output_file << "Start dividing \n";
@@ -83,10 +84,10 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
@@ -174,8 +175,7 @@ int MCMSearch::division(int move_from, int move_to){
     this->log_evidence_trajectory.push_back(this->mcm_out.log_ev);

     if (this->output_file){
-        std::cout << "Should only print with output file";
-        *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N * log(this->data->q)) << "\n";
+        *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_assumed * log(this->data->q)) << "\n";
         *this->output_file << "\t Component " << move_from << " : \t" + int_to_string(this->mcm_out.partition[move_from], this->mcm_out.n) << "\n";
         *this->output_file << "\t Component " << move_to << " : \t" + int_to_string(this->mcm_out.partition[move_to], this->mcm_out.n) << "\n\n";
     }
diff --git a/src/search/mcm_search/greedy.cpp b/src/search/mcm_search/greedy.cpp
index 6acb93f..2378003 100644
--- a/src/search/mcm_search/greedy.cpp
+++ b/src/search/mcm_search/greedy.cpp
@@ -49,15 +49,16 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
+        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";
     }
@@ -76,10 +77,10 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
@@ -150,7 +151,7 @@ void MCMSearch::hierarchical_merging(){

         // Write to the output file
         if (this->output_file){
-            *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N * log(this->data->q)) << "\n";
+            *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_assumed * log(this->data->q)) << "\n";
         }
     }
 }
diff --git a/src/utilities/partition.cpp b/src/utilities/partition.cpp
index 99e5569..675a29f 100644
--- a/src/utilities/partition.cpp
+++ b/src/utilities/partition.cpp
@@ -1,32 +1,6 @@
 #include "utilities/partition.h"
 #include "utilities/miscellaneous.h"

-/*
-std::vector<int> generate_random_partition(int n){
-    // Check if the number of variables is a valid number
-    if (n > 128){
-        throw std::domain_error("The maximum system size is 128 variables.");
-    }
-    if (n < 0){
-        throw std::domain_error("The number of variables should be a positive number.");
-    }
-
-    std::vector<int> growth_string(n, 0);
-
-    int index;
-    int max_index = 2;
-    for (int i = 1; i < n; i++){
-        index = rand()/(RAND_MAX/max_index);
-        growth_string[i] = index;
-
-        if (index == (max_index-1)){
-            max_index += 1;
-        }
-    }
-
-    return growth_string;
-}*/
-
 std::vector<__uint128_t> generate_random_partition(int n){
     // Check if the number of variables is a valid number
     if (n > 128){
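With patch 1 applied, the intended usage from C++ looks roughly like the sketch below. This is not part of the patch: the file name is a placeholder, the singleton partition is one arbitrary choice, and it assumes the members and calculation methods used here are publicly accessible, as the Python bindings suggest.

    #include "data/dataset.h"

    int main(){
        // 10 binary variables; "sample.dat" is a hypothetical input file
        Data data("sample.dat", 10, 2);

        // Independent model: one singleton component per variable, each
        // component encoded as a bitmask (bit i set = variable i included)
        std::vector<__uint128_t> partition;
        for (int i = 0; i < 10; i++){
            partition.push_back(((__uint128_t) 1) << i);
        }

        double log_ev = data.calc_log_ev(partition);

        // Redo the analysis as if the dataset were ten times larger
        data.set_N_assumed(10 * data.N);
        double log_ev_scaled = data.calc_log_ev(partition);
        return 0;
    }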
best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_assumed * log(this->data->q)) << "\n"; } } } diff --git a/src/utilities/partition.cpp b/src/utilities/partition.cpp index 99e5569..675a29f 100644 --- a/src/utilities/partition.cpp +++ b/src/utilities/partition.cpp @@ -1,32 +1,6 @@ #include "utilities/partition.h" #include "utilities/miscellaneous.h" -/* -std::vector generate_random_partition(int n){ - // Check if the number of variables is a valid number - if (n > 128){ - throw std::domain_error("The maximum system size is 128 variables."); - } - if (n < 0){ - throw std::domain_error("The number of variables should be a positive number."); - } - - std::vector growth_string(n, 0); - - int index; - int max_index = 2; - for (int i = 1; i < n; i++){ - index = rand()/(RAND_MAX/max_index); - growth_string[i] = index; - - if (index == (max_index-1)){ - max_index += 1; - } - } - - return growth_string; -}*/ - std::vector<__uint128_t> generate_random_partition(int n){ // Check if the number of variables is a valid number if (n > 128){ From c96a71a881d6953ed62383f0faaf50bbbe86b8c5 Mon Sep 17 00:00:00 2001 From: AaronDC60 Date: Fri, 27 Jun 2025 17:52:04 +0200 Subject: [PATCH 2/4] Update documentation --- docs/api/data.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/api/data.rst b/docs/api/data.rst index 85bbaee..28c8f2e 100644 --- a/docs/api/data.rst +++ b/docs/api/data.rst @@ -240,6 +240,13 @@ to infer either the best minimally complex (MCM) or the optimal basis representa The actual number of datapoints in the dataset (read-only). + .. py:attribute:: N_assumed + :type: int + + The assumed number of datapoints in the dataset. + + This attribute can be changed to perform an analysis of the dataset under the assumption that it is either larger or smaller. + .. py:attribute:: N_unique :type: int From 49b08534280c0cc4247bdf48bc2f32040d31ae27 Mon Sep 17 00:00:00 2001 From: AaronDC60 Date: Fri, 27 Jun 2025 19:05:54 +0200 Subject: [PATCH 3/4] Change variable name --- docs/api/data.rst | 6 +++--- include/data/dataset.h | 10 +++++----- mcmpy/include/py_dataset.h | 4 ++-- mcmpy/src/py_dataset.cpp | 2 +- src/data/complexity.cpp | 2 +- src/data/dataset.cpp | 8 ++++---- src/data/evidence.cpp | 8 ++++---- src/data/likelihood.cpp | 4 ++-- src/search/mcm_search/annealing.cpp | 14 +++++++------- src/search/mcm_search/div_and_conq.cpp | 14 +++++++------- src/search/mcm_search/greedy.cpp | 14 +++++++------- 11 files changed, 43 insertions(+), 43 deletions(-) diff --git a/docs/api/data.rst b/docs/api/data.rst index 28c8f2e..2630757 100644 --- a/docs/api/data.rst +++ b/docs/api/data.rst @@ -240,12 +240,12 @@ to infer either the best minimally complex (MCM) or the optimal basis representa The actual number of datapoints in the dataset (read-only). - .. py:attribute:: N_assumed + .. py:attribute:: N_synthetic :type: int - The assumed number of datapoints in the dataset. + The synthetic number of datapoints in the dataset. - This attribute can be changed to perform an analysis of the dataset under the assumption that it is either larger or smaller. + This attribute can be changed to perform an analysis of the dataset as if it is either larger or smaller. .. 
From 49b08534280c0cc4247bdf48bc2f32040d31ae27 Mon Sep 17 00:00:00 2001
From: AaronDC60
Date: Fri, 27 Jun 2025 19:05:54 +0200
Subject: [PATCH 3/4] Change variable name

---
 docs/api/data.rst                      |  6 +++---
 include/data/dataset.h                 | 10 +++++-----
 mcmpy/include/py_dataset.h             |  4 ++--
 mcmpy/src/py_dataset.cpp               |  2 +-
 src/data/complexity.cpp                |  2 +-
 src/data/dataset.cpp                   |  8 ++++----
 src/data/evidence.cpp                  |  8 ++++----
 src/data/likelihood.cpp                |  4 ++--
 src/search/mcm_search/annealing.cpp    | 14 +++++++-------
 src/search/mcm_search/div_and_conq.cpp | 14 +++++++-------
 src/search/mcm_search/greedy.cpp       | 14 +++++++-------
 11 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/docs/api/data.rst b/docs/api/data.rst
index 28c8f2e..2630757 100644
--- a/docs/api/data.rst
+++ b/docs/api/data.rst
@@ -240,12 +240,12 @@ to infer either the best minimally complex (MCM) or the optimal basis representa

         The actual number of datapoints in the dataset (read-only).

-    .. py:attribute:: N_assumed
+    .. py:attribute:: N_synthetic
         :type: int

-        The assumed number of datapoints in the dataset.
+        The synthetic number of datapoints in the dataset.

-        This attribute can be changed to perform an analysis of the dataset under the assumption that it is either larger or smaller.
+        This attribute can be changed to perform an analysis of the dataset as if it is either larger or smaller.

     .. py:attribute:: N_unique
         :type: int
diff --git a/include/data/dataset.h b/include/data/dataset.h
index 653460a..c8cd02b 100644
--- a/include/data/dataset.h
+++ b/include/data/dataset.h
@@ -32,12 +32,12 @@ Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>& _dataset, int n_var, int n_states, int n_samples);

     /**
-     * Change the number of assumed datapoints in the dataset.
-     * This can be changed to perform an analysis of the dataset under the assumption that it is larger or smaller.
+     * Change the value for the number of datapoints in the dataset that is used for analysis.
+     * This can be changed to perform an analysis of the dataset as if it is larger or smaller.
      *
-     * @param n_datapoints The new number of assumed datapoints in the dataset.
+     * @param n_datapoints The synthetic number of datapoints in the dataset.
      */
-    void set_N_assumed(int n_datapoints);
+    void set_N_synthetic(int n_datapoints);

     /**
      * Calculate the entropy of the dataset.
@@ -136,7 +136,7 @@ class Data {
     int N; // Number of datapoints
     int N_unique; // Number of different datapoints
     int n_ints; // Number of 128bit integers necessary to represent the data
-    int N_assumed; // The assumed number of datapoints in the dataset
+    int N_synthetic; // The synthetic number of datapoints in the dataset

     std::vector<__uint128_t> pow_q; // Vector containing the first n powers of q used to speed up the calculation of the evidence
 };
diff --git a/mcmpy/include/py_dataset.h b/mcmpy/include/py_dataset.h
index f138340..e3a5648 100644
--- a/mcmpy/include/py_dataset.h
+++ b/mcmpy/include/py_dataset.h
@@ -65,11 +65,11 @@ class PyData {

     int get_n() {return this->data.n;};
     int get_N() {return this->data.N;};
-    int get_N_assumed() {return this->data.N_assumed;};
+    int get_N_synthetic() {return this->data.N_synthetic;};
     int get_N_unique() {return this->data.N_unique;};
     int get_q() {return this->data.q;};

-    void set_N_assumed(int n_datapoints) {this->data.set_N_assumed(n_datapoints);};
+    void set_N_synthetic(int n_datapoints) {this->data.set_N_synthetic(n_datapoints);};

     Data data;
 };
diff --git a/mcmpy/src/py_dataset.cpp b/mcmpy/src/py_dataset.cpp
index 8ecaf11..ade3dd7 100644
--- a/mcmpy/src/py_dataset.cpp
+++ b/mcmpy/src/py_dataset.cpp
@@ -192,5 +192,5 @@ void bind_data_class(py::module &m) {
         .def_property_readonly("q", &PyData::get_q)
         .def_property_readonly("N", &PyData::get_N)
         .def_property_readonly("N_unique", &PyData::get_N_unique)
-        .def_property("N_assumed", &PyData::get_N_assumed, &PyData::set_N_assumed);
+        .def_property("N_synthetic", &PyData::get_N_synthetic, &PyData::set_N_synthetic);
 }
diff --git a/src/data/complexity.cpp b/src/data/complexity.cpp
index 5562997..dbbf3f0 100644
--- a/src/data/complexity.cpp
+++ b/src/data/complexity.cpp
@@ -22,7 +22,7 @@ double Data::calc_param_complexity_icc(__uint128_t component){
     // Determine the size of the component
     int r = bit_count(component);
     __uint128_t n_param = this->pow_q[r] - 1;
-    return (n_param / 2.) * log(this->N_assumed / (2 * M_PI));
+    return (n_param / 2.) * log(this->N_synthetic / (2 * M_PI));
 }

 double Data::calc_param_complexity(std::vector<__uint128_t>& partition){
diff --git a/src/data/dataset.cpp b/src/data/dataset.cpp
index b95dbb9..37d35f4 100644
--- a/src/data/dataset.cpp
+++ b/src/data/dataset.cpp
@@ -17,7 +17,7 @@ Data::Data(const std::string& filename, int n_var, int n_states){
     this->n_ints = ceil(log2(q));
     // Process the dataset
     this->N = processing(filename, n_var, n_ints, n_states, dataset);
-    this->N_assumed = this->N;
+    this->N_synthetic = this->N;
     this->N_unique = dataset.size();

     // Precompute the powers of q to speed up the calculation of the evidence (q^r)
@@ -35,7 +35,7 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
     this->q = n_states;
     this->dataset = _dataset;
     this->N = n_samples;
-    this->N_assumed = this->N;
+    this->N_synthetic = this->N;
     this->N_unique = _dataset.size();

     // Calculate the number of integers necessary to represent the data
@@ -50,12 +50,12 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
     }
 }

-void Data::set_N_assumed(int n_datapoints){
+void Data::set_N_synthetic(int n_datapoints){
     // Check if the given number of datapoints is valid
     if (n_datapoints < 1){
         throw std::invalid_argument("The number of datapoints should be a positive number.");
     }
-    this->N_assumed = n_datapoints;
+    this->N_synthetic = n_datapoints;
 }

 double Data::entropy(int base){
diff --git a/src/data/evidence.cpp b/src/data/evidence.cpp
index 1d6aef4..2f16a65 100644
--- a/src/data/evidence.cpp
+++ b/src/data/evidence.cpp
@@ -5,7 +5,7 @@ double Data::calc_log_ev_icc(__uint128_t component){
     double log_evidence = 0;
     // Determine the size of the component
     int r = bit_count(component);
-    double alpha = this->N_assumed / this->N;
+    double alpha = this->N_synthetic / this->N;
     // Contributions from the datapoint frequencies
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin();
@@ -17,10 +17,10 @@ double Data::calc_log_ev_icc(__uint128_t component){
     // Calculate prefactor
     if (r > 25){
         // Approximate for large components because lgamma overflows
-        log_evidence -= (r * log(this->q) * this->N_assumed);
+        log_evidence -= (r * log(this->q) * this->N_synthetic);
     }
     else{
-        log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N_assumed + pow_q[r]/2.);
+        log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N_synthetic + pow_q[r]/2.);
     }

     return log_evidence;
@@ -38,7 +38,7 @@ double Data::calc_log_ev(std::vector<__uint128_t>& partition){
         }
     }
     // Contribution from variables not in the model
-    log_ev -= this->N_assumed * (this->n - r) * log(this->q);
+    log_ev -= this->N_synthetic * (this->n - r) * log(this->q);

     return log_ev;
 }
diff --git a/src/data/likelihood.cpp b/src/data/likelihood.cpp
index e5b5e33..d1bfafb 100644
--- a/src/data/likelihood.cpp
+++ b/src/data/likelihood.cpp
@@ -4,7 +4,7 @@ double Data::calc_log_likelihood_icc(__uint128_t component){
     double log_likelihood = 0;
     double N_datapoints = this->N;
-    double alpha = this->N_assumed / this->N;
+    double alpha = this->N_synthetic / this->N;
     // Determine the size of the component
     int r = bit_count(component);
     // Get the datapoint frequencies
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
@@ -29,7 +29,7 @@ double Data::calc_log_likelihood(std::vector<__uint128_t>& partition){
         }
     }
     // Contribution from variables not in the model
-    log_likelihood -= this->N_assumed * (this->n - r) * log(this->q);
+    log_likelihood -= this->N_synthetic * (this->n - r) * log(this->q);

     return log_likelihood;
 }
\ No newline at end of file
diff --git a/src/search/mcm_search/annealing.cpp b/src/search/mcm_search/annealing.cpp
index ddced3a..26b669d 100644
--- a/src/search/mcm_search/annealing.cpp
+++ b/src/search/mcm_search/annealing.cpp
@@ -51,16 +51,16 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
-        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
+        *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, mcm_tmp, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, mcm_tmp, data.N_synthetic, data.q);
         *this->output_file << "\n";

         *this->output_file << "Start annealing \n";
@@ -115,7 +115,7 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n

                 // Write to output file
                 if (this->output_file){
-                    *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_assumed * log(this->data->q)) << "\n";
+                    *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n";
                 }
             }
             else{steps_since_improve++;}
@@ -162,10 +162,10 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
diff --git a/src/search/mcm_search/div_and_conq.cpp b/src/search/mcm_search/div_and_conq.cpp
index 5e5e441..361f987 100644
--- a/src/search/mcm_search/div_and_conq.cpp
+++ b/src/search/mcm_search/div_and_conq.cpp
@@ -50,16 +50,16 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
-        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
+        *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";

         *this->output_file << "Start dividing \n";
@@ -84,10 +84,10 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
@@ -175,7 +175,7 @@ int MCMSearch::division(int move_from, int move_to){
     this->log_evidence_trajectory.push_back(this->mcm_out.log_ev);

     if (this->output_file){
-        *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_assumed * log(this->data->q)) << "\n";
+        *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n";
         *this->output_file << "\t Component " << move_from << " : \t" + int_to_string(this->mcm_out.partition[move_from], this->mcm_out.n) << "\n";
         *this->output_file << "\t Component " << move_to << " : \t" + int_to_string(this->mcm_out.partition[move_to], this->mcm_out.n) << "\n\n";
     }
diff --git a/src/search/mcm_search/greedy.cpp b/src/search/mcm_search/greedy.cpp
index 2378003..a176b94 100644
--- a/src/search/mcm_search/greedy.cpp
+++ b/src/search/mcm_search/greedy.cpp
@@ -49,16 +49,16 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
-        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
+        *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";
     }
@@ -77,10 +77,10 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
@@ -151,7 +151,7 @@ void MCMSearch::hierarchical_merging(){

         // Write to the output file
         if (this->output_file){
-            *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_assumed * log(this->data->q)) << "\n";
+            *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_synthetic * log(this->data->q)) << "\n";
         }
     }
 }
log(data.q)) << " q-its/datapoint \n"; - *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n"; + *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; this->output_file.reset(); } diff --git a/src/search/mcm_search/div_and_conq.cpp b/src/search/mcm_search/div_and_conq.cpp index 5e5e441..361f987 100644 --- a/src/search/mcm_search/div_and_conq.cpp +++ b/src/search/mcm_search/div_and_conq.cpp @@ -50,16 +50,16 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na *this->output_file << "Number of variables: " << data.n << "\n"; *this->output_file << "Number of states per variable: " << data.q << "\n"; *this->output_file << "Number of datapoints: " << data.N << "\n"; - *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n"; + *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n"; *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n"; *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n"; *this->output_file << "Initial partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; *this->output_file << "Start dividing \n"; @@ -84,10 +84,10 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na *this->output_file << "\nFinal partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n"; - *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n"; + *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; this->output_file.reset(); } @@ -175,7 +175,7 @@ int MCMSearch::division(int move_from, int move_to){ this->log_evidence_trajectory.push_back(this->mcm_out.log_ev); if (this->output_file){ - 
*this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_assumed * log(this->data->q)) << "\n"; + *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n"; *this->output_file << "\t Component " << move_from << " : \t" + int_to_string(this->mcm_out.partition[move_from], this->mcm_out.n) << "\n"; *this->output_file << "\t Component " << move_to << " : \t" + int_to_string(this->mcm_out.partition[move_to], this->mcm_out.n) << "\n\n"; } diff --git a/src/search/mcm_search/greedy.cpp b/src/search/mcm_search/greedy.cpp index 2378003..a176b94 100644 --- a/src/search/mcm_search/greedy.cpp +++ b/src/search/mcm_search/greedy.cpp @@ -49,16 +49,16 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){ *this->output_file << "Number of variables: " << data.n << "\n"; *this->output_file << "Number of states per variable: " << data.q << "\n"; *this->output_file << "Number of datapoints: " << data.N << "\n"; - *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n"; + *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n"; *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n"; *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n"; *this->output_file << "Initial partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; } @@ -77,10 +77,10 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){ *this->output_file << "\nFinal partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n"; - *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n"; + *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; this->output_file.reset(); } @@ -151,7 +151,7 @@ void MCMSearch::hierarchical_merging(){ // Write to the output file if (this->output_file){ - *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_assumed * log(this->data->q)) << "\n"; + *this->output_file << "Merging components " << best_i 
<< " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_synthetic * log(this->data->q)) << "\n"; } } } From 7397dd90fe25f4035aba6569725b8f65c8daad4b Mon Sep 17 00:00:00 2001 From: AaronDC60 Date: Fri, 27 Jun 2025 19:18:01 +0200 Subject: [PATCH 4/4] Fix calculation of alpha --- src/data/evidence.cpp | 3 ++- src/data/likelihood.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/data/evidence.cpp b/src/data/evidence.cpp index 2f16a65..87201c7 100644 --- a/src/data/evidence.cpp +++ b/src/data/evidence.cpp @@ -5,7 +5,8 @@ double Data::calc_log_ev_icc(__uint128_t component){ double log_evidence = 0; // Determine the size of the component int r = bit_count(component); - double alpha = this->N_synthetic / this->N; + double N_syn = this->N_synthetic; + double alpha = N_syn / this->N; // Contributions from the datapoint frequencies std::unordered_map, unsigned int, HashVector128> counts = build_histogram(*this, component); std::unordered_map, unsigned int, HashVector128>::iterator count_iter = counts.begin(); diff --git a/src/data/likelihood.cpp b/src/data/likelihood.cpp index d1bfafb..fa0187e 100644 --- a/src/data/likelihood.cpp +++ b/src/data/likelihood.cpp @@ -4,7 +4,7 @@ double Data::calc_log_likelihood_icc(__uint128_t component){ double log_likelihood = 0; double N_datapoints = this->N; - double alpha = this->N_synthetic / this->N; + double alpha = this->N_synthetic / N_datapoints; // Determine the size of the component int r = bit_count(component); // Get the datapoint frequencies