diff --git a/docs/api/data.rst b/docs/api/data.rst index 85bbaee..2630757 100644 --- a/docs/api/data.rst +++ b/docs/api/data.rst @@ -240,6 +240,13 @@ to infer either the best minimally complex (MCM) or the optimal basis representa The actual number of datapoints in the dataset (read-only). + .. py:attribute:: N_synthetic + :type: int + + The synthetic number of datapoints in the dataset. + + This attribute can be changed to perform an analysis of the dataset as if it is either larger or smaller. + .. py:attribute:: N_unique :type: int diff --git a/include/data/dataset.h b/include/data/dataset.h index 2c7a21c..c8cd02b 100644 --- a/include/data/dataset.h +++ b/include/data/dataset.h @@ -31,6 +31,14 @@ class Data { */ Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>& _dataset, int n_var, int n_states, int n_samples); + /** + * Change the value for the number of datapoints in the dataset that is used for analysis. + * This can be changed to perform an analysis of the dataset as if it is larger or smaller. + * + * @param n_datapoints The synthetic number of datapoints in the dataset. + */ + void set_N_synthetic(int n_datapoints); + /** * Calculate the entropy of the dataset. 
* @@ -128,6 +136,7 @@ class Data { int N; // Number of datapoints int N_unique; // Number of different datapoints int n_ints; // Number of 128bit integers necessary to represent the data + int N_synthetic; // The synthetic number of datapoints in the dataset std::vector<__uint128_t> pow_q; // Vector containing the first n powers of q used to speed up the calculation of the evidence }; diff --git a/mcmpy/include/py_dataset.h b/mcmpy/include/py_dataset.h index c4ca24f..e3a5648 100644 --- a/mcmpy/include/py_dataset.h +++ b/mcmpy/include/py_dataset.h @@ -65,9 +65,12 @@ class PyData { int get_n() {return this->data.n;}; int get_N() {return this->data.N;}; + int get_N_synthetic() {return this->data.N_synthetic;}; int get_N_unique() {return this->data.N_unique;}; int get_q() {return this->data.q;}; + void set_N_synthetic(int n_datapoints) {this->data.set_N_synthetic(n_datapoints);}; + Data data; }; diff --git a/mcmpy/src/py_dataset.cpp b/mcmpy/src/py_dataset.cpp index de74447..ade3dd7 100644 --- a/mcmpy/src/py_dataset.cpp +++ b/mcmpy/src/py_dataset.cpp @@ -187,8 +187,10 @@ void bind_data_class(py::module &m) { .def("entropy", &PyData::entropy, py::arg("base") = -1) .def("entropy_of_spin_operator", &PyData::entropy_of_spin_op, py::arg("spin_op")) + .def_property_readonly("n", &PyData::get_n) .def_property_readonly("q", &PyData::get_q) .def_property_readonly("N", &PyData::get_N) - .def_property_readonly("N_unique", &PyData::get_N_unique); + .def_property_readonly("N_unique", &PyData::get_N_unique) + .def_property("N_synthetic", &PyData::get_N_synthetic, &PyData::set_N_synthetic); } diff --git a/src/data/complexity.cpp b/src/data/complexity.cpp index 26aad50..dbbf3f0 100644 --- a/src/data/complexity.cpp +++ b/src/data/complexity.cpp @@ -22,7 +22,7 @@ double Data::calc_param_complexity_icc(__uint128_t component){ // Determine the size of the component int r = bit_count(component); __uint128_t n_param = this->pow_q[r] - 1; - return (n_param / 2.) 
* log(this->N / (2 * M_PI)); + return (n_param / 2.) * log(this->N_synthetic / (2 * M_PI)); } double Data::calc_param_complexity(std::vector<__uint128_t>& partition){ diff --git a/src/data/dataset.cpp b/src/data/dataset.cpp index 36158fb..37d35f4 100644 --- a/src/data/dataset.cpp +++ b/src/data/dataset.cpp @@ -17,6 +17,7 @@ Data::Data(const std::string& filename, int n_var, int n_states){ this->n_ints = ceil(log2(q)); // Process the dataset this->N = processing(filename, n_var, n_ints, n_states, dataset); + this->N_synthetic = this->N; this->N_unique = dataset.size(); // Precompute the powers of q to speed up the calculation of the evidence (q^r) @@ -34,6 +35,7 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>& this->q = n_states; this->dataset = _dataset; this->N = n_samples; + this->N_synthetic = this->N; this->N_unique = _dataset.size(); // Calculate the number of integers necessary to represent the data @@ -48,6 +50,14 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>& } } +void Data::set_N_synthetic(int n_datapoints){ + // Check if the given number of datapoints is valid + if (n_datapoints < 1){ + throw std::invalid_argument("The number of datapoints should be a positive number."); + } + this->N_synthetic = n_datapoints; +} + double Data::entropy(int base){ // Default base is q if (base == -1){ diff --git a/src/data/evidence.cpp b/src/data/evidence.cpp index 73905d5..87201c7 100644 --- a/src/data/evidence.cpp +++ b/src/data/evidence.cpp @@ -5,21 +5,23 @@ double Data::calc_log_ev_icc(__uint128_t component){ double log_evidence = 0; // Determine the size of the component int r = bit_count(component); + double N_syn = this->N_synthetic; + double alpha = N_syn / this->N; // Contributions from the datapoint frequencies std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component); std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin(); while (count_iter != counts.end()){ - log_evidence += 
(lgamma(count_iter->second + 0.5) - 0.5 * log(M_PI)); + log_evidence += (lgamma(alpha * count_iter->second + 0.5) - 0.5 * log(M_PI)); ++count_iter; } // Calculate prefactor if (r > 25){ // Approximate for large components because lgamma overflows - log_evidence -= (r * log(this->q) * this->N); + log_evidence -= (r * log(this->q) * this->N_synthetic); } else{ - log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N + pow_q[r]/2.); + log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N_synthetic + pow_q[r]/2.); } return log_evidence; @@ -37,7 +39,7 @@ double Data::calc_log_ev(std::vector<__uint128_t>& partition){ } } // Contribution from variables not in the model - log_ev -= this->N * (this->n - r) * log(this->q); + log_ev -= this->N_synthetic * (this->n - r) * log(this->q); return log_ev; } diff --git a/src/data/likelihood.cpp b/src/data/likelihood.cpp index e61d62c..fa0187e 100644 --- a/src/data/likelihood.cpp +++ b/src/data/likelihood.cpp @@ -4,13 +4,14 @@ double Data::calc_log_likelihood_icc(__uint128_t component){ double log_likelihood = 0; double N_datapoints = this->N; + double alpha = this->N_synthetic / N_datapoints; // Determine the size of the component int r = bit_count(component); // Get the datapoint frequencies std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component); std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin(); while (count_iter != counts.end()){ - log_likelihood += count_iter->second * log(count_iter->second / N_datapoints); + log_likelihood += alpha * count_iter->second * log(count_iter->second / N_datapoints); ++count_iter; } return log_likelihood; @@ -28,7 +29,7 @@ double Data::calc_log_likelihood(std::vector<__uint128_t>& partition){ } } // Contribution from variables not in the model - log_likelihood -= this->N * (this->n - r) * log(this->q); + log_likelihood -= this->N_synthetic * (this->n - r) * log(this->q); return log_likelihood; } \ No newline at end of 
file diff --git a/src/search/mcm_search/annealing.cpp b/src/search/mcm_search/annealing.cpp index 80e4ba7..26b669d 100644 --- a/src/search/mcm_search/annealing.cpp +++ b/src/search/mcm_search/annealing.cpp @@ -51,15 +51,16 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n *this->output_file << "Number of variables: " << data.n << "\n"; *this->output_file << "Number of states per variable: " << data.q << "\n"; *this->output_file << "Number of datapoints: " << data.N << "\n"; + *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n"; *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n"; *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n"; *this->output_file << "Initial partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, mcm_tmp, data.N, data.q); + print_partition_details_to_file(*this->output_file, mcm_tmp, data.N_synthetic, data.q); *this->output_file << "\n"; *this->output_file << "Start annealing \n"; @@ -114,7 +115,7 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n // Write to output file if (this->output_file){ - *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N * log(this->data->q)) << "\n"; + *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n"; } } else{steps_since_improve++;} @@ -161,10 +162,10 @@ 
MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n *this->output_file << "\nFinal partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n"; - *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n"; + *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; this->output_file.reset(); } diff --git a/src/search/mcm_search/div_and_conq.cpp b/src/search/mcm_search/div_and_conq.cpp index 96966b8..361f987 100644 --- a/src/search/mcm_search/div_and_conq.cpp +++ b/src/search/mcm_search/div_and_conq.cpp @@ -50,15 +50,16 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na *this->output_file << "Number of variables: " << data.n << "\n"; *this->output_file << "Number of states per variable: " << data.q << "\n"; *this->output_file << "Number of datapoints: " << data.N << "\n"; + *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n"; *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n"; *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n"; *this->output_file << "Initial partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev 
/ (data.N * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; *this->output_file << "Start dividing \n"; @@ -83,10 +84,10 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na *this->output_file << "\nFinal partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n"; - *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n"; + *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; this->output_file.reset(); } @@ -174,8 +175,7 @@ int MCMSearch::division(int move_from, int move_to){ this->log_evidence_trajectory.push_back(this->mcm_out.log_ev); if (this->output_file){ - std::cout << "Should only print with output file"; - *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N * log(this->data->q)) << "\n"; + *this->output_file << "Splitting component " << move_from << "\t Log-evidence 
(q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n"; *this->output_file << "\t Component " << move_from << " : \t" + int_to_string(this->mcm_out.partition[move_from], this->mcm_out.n) << "\n"; *this->output_file << "\t Component " << move_to << " : \t" + int_to_string(this->mcm_out.partition[move_to], this->mcm_out.n) << "\n\n"; } diff --git a/src/search/mcm_search/greedy.cpp b/src/search/mcm_search/greedy.cpp index 6acb93f..a176b94 100644 --- a/src/search/mcm_search/greedy.cpp +++ b/src/search/mcm_search/greedy.cpp @@ -49,15 +49,16 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){ *this->output_file << "Number of variables: " << data.n << "\n"; *this->output_file << "Number of states per variable: " << data.q << "\n"; *this->output_file << "Number of datapoints: " << data.N << "\n"; + *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n"; *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n"; *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n"; *this->output_file << "Initial partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; } @@ -76,10 +77,10 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){ *this->output_file << "\nFinal partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << 
this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n"; - *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n"; + *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; this->output_file.reset(); } @@ -150,7 +151,7 @@ void MCMSearch::hierarchical_merging(){ // Write to the output file if (this->output_file){ - *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N * log(this->data->q)) << "\n"; + *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_synthetic * log(this->data->q)) << "\n"; } } } diff --git a/src/utilities/partition.cpp b/src/utilities/partition.cpp index 99e5569..675a29f 100644 --- a/src/utilities/partition.cpp +++ b/src/utilities/partition.cpp @@ -1,32 +1,6 @@ #include "utilities/partition.h" #include "utilities/miscellaneous.h" -/* -std::vector generate_random_partition(int n){ - // Check if the number of variables is a valid number - if (n > 128){ - throw std::domain_error("The maximum system size is 128 variables."); - } - if (n < 0){ - throw std::domain_error("The number of variables should be a positive number."); - } - - std::vector growth_string(n, 0); - - int index; - int max_index = 2; - for (int i = 1; i < n; i++){ - index 
= rand()/(RAND_MAX/max_index); - growth_string[i] = index; - - if (index == (max_index-1)){ - max_index += 1; - } - } - - return growth_string; -}*/ - std::vector<__uint128_t> generate_random_partition(int n){ // Check if the number of variables is a valid number if (n > 128){