Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/api/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,13 @@ to infer either the best minimally complex (MCM) or the optimal basis representa

The actual number of datapoints in the dataset (read-only).

.. py:attribute:: N_synthetic
:type: int

The synthetic number of datapoints in the dataset.

This attribute can be changed to perform an analysis of the dataset as if it were either larger or smaller than the actual number of datapoints.

.. py:attribute:: N_unique
:type: int

Expand Down
9 changes: 9 additions & 0 deletions include/data/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ class Data {
*/
Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>& _dataset, int n_var, int n_states, int n_samples);

/**
* Set the synthetic number of datapoints that is used when analysing the dataset.
* This value can be adjusted to perform an analysis of the dataset as if it were larger or smaller than it actually is.
*
* @param n_datapoints The synthetic number of datapoints in the dataset.
*/
void set_N_synthetic(int n_datapoints);

/**
* Calculate the entropy of the dataset.
*
Expand Down Expand Up @@ -128,6 +136,7 @@ class Data {
int N; // Number of datapoints
int N_unique; // Number of different datapoints
int n_ints; // Number of 128bit integers necessary to represent the data
int N_synthetic; // The synthetic number of datapoints in the dataset

std::vector<__uint128_t> pow_q; // Vector containing the first n powers of q used to speed up the calculation of the evidence
};
3 changes: 3 additions & 0 deletions mcmpy/include/py_dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,12 @@ class PyData {

int get_n() {return this->data.n;};
int get_N() {return this->data.N;};
int get_N_synthetic() {return this->data.N_synthetic;};
int get_N_unique() {return this->data.N_unique;};
int get_q() {return this->data.q;};

void set_N_synthetic(int n_datapoints) {this->data.set_N_synthetic(n_datapoints);};

Data data;
};

Expand Down
4 changes: 3 additions & 1 deletion mcmpy/src/py_dataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,10 @@ void bind_data_class(py::module &m) {

.def("entropy", &PyData::entropy, py::arg("base") = -1)
.def("entropy_of_spin_operator", &PyData::entropy_of_spin_op, py::arg("spin_op"))

.def_property_readonly("n", &PyData::get_n)
.def_property_readonly("q", &PyData::get_q)
.def_property_readonly("N", &PyData::get_N)
.def_property_readonly("N_unique", &PyData::get_N_unique);
.def_property_readonly("N_unique", &PyData::get_N_unique)
.def_property("N_synthetic", &PyData::get_N_synthetic, &PyData::set_N_synthetic);
}
2 changes: 1 addition & 1 deletion src/data/complexity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ double Data::calc_param_complexity_icc(__uint128_t component){
// Determine the size of the component
int r = bit_count(component);
__uint128_t n_param = this->pow_q[r] - 1;
return (n_param / 2.) * log(this->N / (2 * M_PI));
return (n_param / 2.) * log(this->N_synthetic / (2 * M_PI));
}

double Data::calc_param_complexity(std::vector<__uint128_t>& partition){
Expand Down
10 changes: 10 additions & 0 deletions src/data/dataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Data::Data(const std::string& filename, int n_var, int n_states){
this->n_ints = ceil(log2(q));
// Process the dataset
this->N = processing(filename, n_var, n_ints, n_states, dataset);
this->N_synthetic = this->N;
this->N_unique = dataset.size();

// Precompute the powers of q to speed up the calculation of the evidence (q^r)
Expand All @@ -34,6 +35,7 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
this->q = n_states;
this->dataset = _dataset;
this->N = n_samples;
this->N_synthetic = this->N;
this->N_unique = _dataset.size();

// Calculate the number of integers necessary to represent the data
Expand All @@ -48,6 +50,14 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
}
}

void Data::set_N_synthetic(int n_datapoints){
    // Reject non-positive values: an analysis needs at least one datapoint.
    if (n_datapoints <= 0){
        throw std::invalid_argument("The number of datapoints should be a positive number.");
    }
    this->N_synthetic = n_datapoints;
}

double Data::entropy(int base){
// Default base is q
if (base == -1){
Expand Down
10 changes: 6 additions & 4 deletions src/data/evidence.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,23 @@ double Data::calc_log_ev_icc(__uint128_t component){
double log_evidence = 0;
// Determine the size of the component
int r = bit_count(component);
double N_syn = this->N_synthetic;
double alpha = N_syn / this->N;
// Contributions from the datapoint frequencies
std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin();
while (count_iter != counts.end()){
log_evidence += (lgamma(count_iter->second + 0.5) - 0.5 * log(M_PI));
log_evidence += (lgamma(alpha * count_iter->second + 0.5) - 0.5 * log(M_PI));
++count_iter;
}

// Calculate prefactor
if (r > 25){
// Approximate for large components because lgamma overflows
log_evidence -= (r * log(this->q) * this->N);
log_evidence -= (r * log(this->q) * this->N_synthetic);
}
else{
log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N + pow_q[r]/2.);
log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N_synthetic + pow_q[r]/2.);
}

return log_evidence;
Expand All @@ -37,7 +39,7 @@ double Data::calc_log_ev(std::vector<__uint128_t>& partition){
}
}
// Contribution from variables not in the model
log_ev -= this->N * (this->n - r) * log(this->q);
log_ev -= this->N_synthetic * (this->n - r) * log(this->q);

return log_ev;
}
5 changes: 3 additions & 2 deletions src/data/likelihood.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
double Data::calc_log_likelihood_icc(__uint128_t component){
double log_likelihood = 0;
double N_datapoints = this->N;
double alpha = this->N_synthetic / N_datapoints;
// Determine the size of the component
int r = bit_count(component);
// Get the datapoint frequencies
std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin();
while (count_iter != counts.end()){
log_likelihood += count_iter->second * log(count_iter->second / N_datapoints);
log_likelihood += alpha * count_iter->second * log(count_iter->second / N_datapoints);
++count_iter;
}
return log_likelihood;
Expand All @@ -28,7 +29,7 @@ double Data::calc_log_likelihood(std::vector<__uint128_t>& partition){
}
}
// Contribution from variables not in the model
log_likelihood -= this->N * (this->n - r) * log(this->q);
log_likelihood -= this->N_synthetic * (this->n - r) * log(this->q);

return log_likelihood;
}
13 changes: 7 additions & 6 deletions src/search/mcm_search/annealing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,16 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
*this->output_file << "Number of variables: " << data.n << "\n";
*this->output_file << "Number of states per variable: " << data.q << "\n";
*this->output_file << "Number of datapoints: " << data.N << "\n";
*this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
*this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
*this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

*this->output_file << "Initial partition \n";
*this->output_file << "----------------- \n\n";

*this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
*this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

print_partition_details_to_file(*this->output_file, mcm_tmp, data.N, data.q);
print_partition_details_to_file(*this->output_file, mcm_tmp, data.N_synthetic, data.q);
*this->output_file << "\n";

*this->output_file << "Start annealing \n";
Expand Down Expand Up @@ -114,7 +115,7 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n

// Write to output file
if (this->output_file){
*this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N * log(this->data->q)) << "\n";
*this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n";
}
}
else{steps_since_improve++;}
Expand Down Expand Up @@ -161,10 +162,10 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
*this->output_file << "\nFinal partition \n";
*this->output_file << "----------------- \n\n";

*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
*this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
*this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
*this->output_file << "\n";
this->output_file.reset();
}
Expand Down
14 changes: 7 additions & 7 deletions src/search/mcm_search/div_and_conq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,16 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
*this->output_file << "Number of variables: " << data.n << "\n";
*this->output_file << "Number of states per variable: " << data.q << "\n";
*this->output_file << "Number of datapoints: " << data.N << "\n";
*this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
*this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
*this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

*this->output_file << "Initial partition \n";
*this->output_file << "----------------- \n\n";

*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
*this->output_file << "\n";

*this->output_file << "Start dividing \n";
Expand All @@ -83,10 +84,10 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
*this->output_file << "\nFinal partition \n";
*this->output_file << "----------------- \n\n";

*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
*this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
*this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
*this->output_file << "\n";
this->output_file.reset();
}
Expand Down Expand Up @@ -174,8 +175,7 @@ int MCMSearch::division(int move_from, int move_to){
this->log_evidence_trajectory.push_back(this->mcm_out.log_ev);

if (this->output_file){
std::cout << "Should only print with output file";
*this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N * log(this->data->q)) << "\n";
*this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n";
*this->output_file << "\t Component " << move_from << " : \t" + int_to_string(this->mcm_out.partition[move_from], this->mcm_out.n) << "\n";
*this->output_file << "\t Component " << move_to << " : \t" + int_to_string(this->mcm_out.partition[move_to], this->mcm_out.n) << "\n\n";
}
Expand Down
13 changes: 7 additions & 6 deletions src/search/mcm_search/greedy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,16 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
*this->output_file << "Number of variables: " << data.n << "\n";
*this->output_file << "Number of states per variable: " << data.q << "\n";
*this->output_file << "Number of datapoints: " << data.N << "\n";
*this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
*this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
*this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";

*this->output_file << "Initial partition \n";
*this->output_file << "----------------- \n\n";

*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
*this->output_file << "\n";
}

Expand All @@ -76,10 +77,10 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
*this->output_file << "\nFinal partition \n";
*this->output_file << "----------------- \n\n";

*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
*this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
*this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
*this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";

print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
*this->output_file << "\n";
this->output_file.reset();
}
Expand Down Expand Up @@ -150,7 +151,7 @@ void MCMSearch::hierarchical_merging(){

// Write to the output file
if (this->output_file){
*this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N * log(this->data->q)) << "\n";
*this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_synthetic * log(this->data->q)) << "\n";
}
}
}
Expand Down
26 changes: 0 additions & 26 deletions src/utilities/partition.cpp
Original file line number Diff line number Diff line change
@@ -1,32 +1,6 @@
#include "utilities/partition.h"
#include "utilities/miscellaneous.h"

/*
std::vector<int> generate_random_partition(int n){
// Check if the number of variables is a valid number
if (n > 128){
throw std::domain_error("The maximum system size is 128 variables.");
}
if (n < 0){
throw std::domain_error("The number of variables should be a positive number.");
}

std::vector<int> growth_string(n, 0);

int index;
int max_index = 2;
for (int i = 1; i < n; i++){
index = rand()/(RAND_MAX/max_index);
growth_string[i] = index;

if (index == (max_index-1)){
max_index += 1;
}
}

return growth_string;
}*/

std::vector<__uint128_t> generate_random_partition(int n){
// Check if the number of variables is a valid number
if (n > 128){
Expand Down
Loading