diff --git a/mcmpy/include/py_dataset.h b/mcmpy/include/py_dataset.h
index c4ca24f..541e096 100644
--- a/mcmpy/include/py_dataset.h
+++ b/mcmpy/include/py_dataset.h
@@ -25,6 +25,15 @@ class PyData {
      */
     PyData(const std::string& filename, int n_var, int n_states) : data(filename, n_var, n_states) {};
 
+    /**
+     * Constructs a new PyData object from a NumPy array
+     *
+     * @param array NumPy array
+     * @param n_var Number of variables in the system
+     * @param n_states Number of values each variable can take
+     */
+    PyData(py::array_t<int> array, int n_var, int n_states);
+
     /**
      * Constructs a new PyData object from a Data object
      *
@@ -32,6 +41,12 @@ class PyData {
      */
     PyData(const Data& _data) : data(_data.dataset, _data.n, _data.q, _data.N) {};
 
+    int processing_numpy(py::array_t<int> input_array,
+                         int n_var,
+                         int n_ints,
+                         int n_states,
+                         std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>& data);
+
     double calc_property_array(py::array_t<int> partition, std::string property);
     double calc_property_mcm(PyMCM& mcm, std::string property);
     py::array calc_property_icc_array(py::array_t<int> partition, std::string property);
@@ -71,4 +86,4 @@ class PyData {
     Data data;
 };
 
-void bind_data_class(py::module &);
\ No newline at end of file
+void bind_data_class(py::module &);
diff --git a/mcmpy/src/py_dataset.cpp b/mcmpy/src/py_dataset.cpp
index de74447..fecd8c8 100644
--- a/mcmpy/src/py_dataset.cpp
+++ b/mcmpy/src/py_dataset.cpp
@@ -1,5 +1,121 @@
 #include "py_dataset.h"
 
+PyData::PyData(py::array_t<int> input_array, int n_var, int n_states)
+    : data(std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>(),
+           n_var, n_states, 0)
+{
+    // Check if the given number of variables is valid
+    if (n_var > 128) {
+        throw std::invalid_argument("The maximum system size is 128 variables.");
+    }
+    if (n_var < 1) {
+        throw std::invalid_argument("The system size should be at least 1.");
+    }
+
+    // Assign variables
+    this->data.n = n_var;
+    this->data.q = n_states;
+
+    // Calculate the number of integers necessary to represent the data
+    this->data.n_ints = ceil(log2(this->data.q));
+
+    // Process the dataset from the NumPy array
+    this->data.N = processing_numpy(
+        input_array,
+        n_var,
+        this->data.n_ints,
+        n_states,
+        this->data.dataset
+    );
+    this->data.N_unique = this->data.dataset.size();
+
+    // Precompute the powers of q to speed up the calculation of the evidence (q^r)
+    this->data.pow_q.assign(data.n+1, 0);
+    __uint128_t element = 1;
+    for(int i = 0; i < data.n+1; i++){
+        this->data.pow_q[i] = element;
+        element *= this->data.q;
+    }
+}
+
+int PyData::processing_numpy(py::array_t<int> input_array, int n_var, int n_ints, int n_states,
+                             std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>& data) {
+
+    py::buffer_info buf = input_array.request();
+
+    // Check array dimensionality
+    if (buf.ndim != 2) {
+        throw std::invalid_argument("Input array must be 2-dimensional");
+    }
+
+    int num_rows = buf.shape[0];
+    int num_cols = buf.shape[1];
+
+    // Check that the array has at least one row
+    if (num_rows == 0) {
+        throw std::invalid_argument("Array must have at least one row");
+    }
+
+    // Check that the array has enough columns for n_var
+    if (num_cols < n_var) {
+        throw std::invalid_argument("Array must have at least " + std::to_string(n_var) +
+                                    " columns (n_var=" + std::to_string(n_var) +
+                                    "), but got " + std::to_string(num_cols) + " columns");
+    }
+
+    // Pre-validate that all values are in the valid range [0, n_states-1]
+    int* ptr = static_cast<int*>(buf.ptr);
+    for (int row = 0; row < num_rows; row++) {
+        for (int col = 0; col < n_var; col++) {
+            int value = ptr[row * num_cols + col];
+            if (value < 0 || value >= n_states) {
+                throw std::invalid_argument("Array value " + std::to_string(value) +
+                                            " at position [" + std::to_string(row) +
+                                            ", " + std::to_string(col) +
+                                            "] is out of valid range [0, " +
+                                            std::to_string(n_states - 1) + "] for n_states=" +
+                                            std::to_string(n_states));
+            }
+        }
+    }
+
+    std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> dataset_map;
+    int N = 0;
+    std::vector<__uint128_t> observation(n_ints);
+
+    // Process each row (values are already validated)
+    for (int row = 0; row < num_rows; row++) {
+        // Reset observation vector
+        std::fill(observation.begin(), observation.end(), 0);
+        __uint128_t element = 1;
+
+        for (int i = 0; i < n_var; i++) {
+            int value = ptr[row * num_cols + i];
+
+            // Convert value to binary representation
+            int bit = 0;
+            while (value) {
+                if (value & 1) {
+                    observation[bit] += element;
+                }
+                ++bit;
+                value >>= 1;
+            }
+            element <<= 1;
+        }
+
+        dataset_map[observation]++;
+        N++;
+    }
+
+    // Convert map to vector
+    for (auto &my_pair : dataset_map) {
+        data.push_back(my_pair);
+    }
+
+    return N;
+}
+
 std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t<int>& spin_op, int q, int n_ints){
 
     py::buffer_info buff = spin_op.request();
@@ -162,6 +278,7 @@ double PyData::entropy_of_spin_op(const py::array_t<int>& op){
 void bind_data_class(py::module &m) {
     py::class_<PyData>(m, "Data")
         .def(py::init<std::string, int, int>())
+        .def(py::init<py::array_t<int>, int, int>())
        .def("log_evidence_icc", &PyData::calc_log_ev_icc, py::arg("mcm"))
        .def("log_evidence_icc", &PyData::calc_log_ev_icc_array, py::arg("partition"))
        .def("log_evidence", &PyData::calc_log_ev_mcm)