diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 00000000..80fecdfb
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,3 @@
+[*.py]
+indent_style = tab
+indent_size = 4
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..887a2c18
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+# SCM syntax highlighting & preventing 3-way merges
+pixi.lock merge=binary linguist-language=YAML linguist-generated=true
diff --git a/.gitignore b/.gitignore
index 9a53b815..34fa4e08 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,9 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+# pixi environments
+
+.pixi/*
+!.pixi/config.toml
+pixi.lock
+.envrc
diff --git a/GraphTsetlinMachine/graphs.py b/GraphTsetlinMachine/graphs.py
index dae34649..23948863 100644
--- a/GraphTsetlinMachine/graphs.py
+++ b/GraphTsetlinMachine/graphs.py
@@ -223,3 +223,120 @@ def encode(self):
 		self.signature = m.digest()
 
 		self.encoded = True
+
+
+	def __len__(self):
+		"""Return the number of graphs held by this object."""
+		return self.number_of_graphs
+
+	def _get_indices(self, key) -> list[int]:
+		"""Translate a graph selector into a list of non-negative graph ids.
+
+		Accepted selectors are an integer (Python or NumPy scalar), a slice,
+		a list or NumPy array of integers, or a boolean NumPy array holding one
+		entry per graph. Negative integers count from the end.
+
+		Raises:
+			IndexError: if an integer is out of range or a mask has the wrong shape.
+			TypeError: if the selector or one of its elements has an unsupported type.
+		"""
+		count = self.number_of_graphs
+
+		def resolve(index) -> int:
+			# NumPy integer scalars are not int instances, so both are accepted
+			# explicitly; anything else (e.g. a float) is rejected up front.
+			if not isinstance(index, (int, np.integer)):
+				raise TypeError("Invalid graph index type")
+			position = int(index)
+			if position < 0:
+				position += count
+			if not 0 <= position < count:
+				raise IndexError("Graph index out of range")
+			return position
+
+		if isinstance(key, slice):
+			return list(range(*key.indices(count)))
+		if isinstance(key, (int, np.integer)):
+			return [resolve(key)]
+		if isinstance(key, np.ndarray) and key.dtype == np.bool_:
+			# A boolean mask selects the graphs whose entry is True; reading its
+			# values as the integers 0/1 would silently pick the wrong graphs.
+			if key.shape != (count,):
+				raise IndexError("Boolean graph mask must have one entry per graph")
+			return np.flatnonzero(key).tolist()
+		if isinstance(key, (list, np.ndarray)):
+			return [resolve(k) for k in key]
+		raise TypeError("Invalid graph index type")
+
+	def _create_subset(self, indices: list[int]) -> "Graphs":
+		"""Build a new, encoded Graphs object holding copies of the selected graphs.
+
+		Args:
+			indices: graph ids of this object, e.g. as returned by _get_indices;
+				the subset stores the graphs in this order.
+
+		Returns:
+			A Graphs instance created with init_with=self, with encode() already called.
+		"""
+		subset = Graphs(
+			number_of_graphs=len(indices),
+			double_hashing=self.double_hashing,
+			one_hot_encoding=self.one_hot_encoding,
+			init_with=self,
+		)
+
+		# Copy number_of_graph_nodes and graph_node_id
+		# NOTE(review): graph_node_id entries are assigned, not copied; if they are
+		# mutable the subset shares them with this object -- confirm that is intended.
+		for new_id, old_id in enumerate(indices):
+			subset.number_of_graph_nodes[new_id] = self.number_of_graph_nodes[old_id]
+			subset.graph_node_id[new_id] = self.graph_node_id[old_id]
+
+		subset.prepare_node_configuration()
+
+		# Copy node-level data
+		for new_id, old_id in enumerate(indices):
+			old_start = self.node_index[old_id]
+			old_end = old_start + self.number_of_graph_nodes[old_id]
+			new_start = subset.node_index[new_id]
+			new_end = new_start + subset.number_of_graph_nodes[new_id]
+
+			subset.node_type[new_start:new_end] = self.node_type[old_start:old_end]
+			subset.number_of_graph_node_edges[new_start:new_end] = self.number_of_graph_node_edges[old_start:old_end]
+			subset.graph_node_edge_counter[new_start:new_end] = self.graph_node_edge_counter[old_start:old_end]
+			subset.X[new_start:new_end] = self.X[old_start:old_end]
+
+		# Prepare edge configuration for subset
+		subset.prepare_edge_configuration()
+
+		# Copy edge data
+		for new_id, old_id in enumerate(indices):
+			for node_id in range(self.number_of_graph_nodes[old_id]):
+				old_node_idx = self.node_index[old_id] + node_id
+				new_node_idx = subset.node_index[new_id] + node_id
+
+				old_edge_start = self.edge_index[old_node_idx]
+				old_edge_count = self.graph_node_edge_counter[old_node_idx]
+				new_edge_start = subset.edge_index[new_node_idx]
+
+				subset.edge[new_edge_start : new_edge_start + old_edge_count] = self.edge[
+					old_edge_start : old_edge_start + old_edge_count
+				]
+
+		subset.encode()
+
+		return subset
+
+	def __getitem__(self, index):
+		"""Return a new Graphs object containing the graphs selected by index.
+
+		index may be anything _get_indices accepts; the result is always a
+		Graphs object, even for a single integer.
+
+		Raises:
+			ValueError: if the selection is empty.
+		"""
+		indices = self._get_indices(index)
+		if not indices:
+			raise ValueError("No graphs selected")
+		return self._create_subset(indices)
diff --git a/GraphTsetlinMachine/tm.py b/GraphTsetlinMachine/tm.py
index d402916d..d11ca037 100644
--- a/GraphTsetlinMachine/tm.py
+++ b/GraphTsetlinMachine/tm.py
@@ -30,11 +30,8 @@
 
 import pycuda.curandom as curandom
 import pycuda.driver as cuda
-import pycuda.autoinit
+import pycuda.autoinit # noqa: F401 from pycuda.compiler import SourceModule -from scipy.sparse import csr_matrix -import sys -from time import time g = curandom.XORWOWRandomNumberGenerator() @@ -314,22 +311,19 @@ def set_state(self, state): def save(self, fname=""): # Copy data from GPU to CPU - if np.array_equal(self.ta_state, np.array([])): - self.ta_state = np.empty( - self.number_of_clauses * self.number_of_ta_chunks * self.number_of_state_bits, dtype=np.uint32 - ) - cuda.memcpy_dtoh(self.ta_state, self.ta_state_gpu) + self.ta_state = np.empty( + self.number_of_clauses * self.number_of_ta_chunks * self.number_of_state_bits, dtype=np.uint32 + ) + cuda.memcpy_dtoh(self.ta_state, self.ta_state_gpu) for depth in range(self.depth - 1): - if np.array_equal(self.message_ta_state[depth], np.array([])): - self.message_ta_state[depth] = np.empty( - self.number_of_clauses * self.number_of_message_chunks * self.number_of_state_bits, dtype=np.uint32 - ) - cuda.memcpy_dtoh(self.message_ta_state[depth], self.message_ta_state_gpu[depth]) + self.message_ta_state[depth] = np.empty( + self.number_of_clauses * self.number_of_message_chunks * self.number_of_state_bits, dtype=np.uint32 + ) + cuda.memcpy_dtoh(self.message_ta_state[depth], self.message_ta_state_gpu[depth]) - if np.array_equal(self.clause_weights, np.array([])): - self.clause_weights = np.empty(self.number_of_outputs * self.number_of_clauses, dtype=np.int32) - cuda.memcpy_dtoh(self.clause_weights, self.clause_weights_gpu) + self.clause_weights = np.empty(self.number_of_outputs * self.number_of_clauses, dtype=np.int32) + cuda.memcpy_dtoh(self.clause_weights, self.clause_weights_gpu) state_dict = { # State arrays diff --git a/examples/test_graph_subsetting.py b/examples/test_graph_subsetting.py new file mode 100644 index 00000000..dace9d72 --- /dev/null +++ b/examples/test_graph_subsetting.py @@ -0,0 +1,121 @@ +import random +import numpy as np +from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine +from 
GraphTsetlinMachine.graphs import Graphs
+
+random.seed(42)
+np.random.seed(42)  # label noise below is drawn from NumPy's RNG, so seed it as well
+
+def generate_graphs(symbols, noise, graph_args: dict):
+	"""Build two-node graphs labelled by whether their node symbols differ in parity.
+
+	Args:
+		symbols: integers to draw each node's property from.
+		noise: probability of flipping a label after it is computed.
+		graph_args: keyword arguments for Graphs; must contain "number_of_graphs".
+
+	Returns:
+		(graphs, X, Y): the encoded Graphs, the drawn symbol pairs and the labels.
+	"""
+	graphs = Graphs(**graph_args)
+	number_of_examples = graph_args["number_of_graphs"]
+
+	for graph_id in range(number_of_examples):
+		graphs.set_number_of_graph_nodes(graph_id, 2)
+
+	graphs.prepare_node_configuration()
+
+	for graph_id in range(number_of_examples):
+		for node_id in range(graphs.number_of_graph_nodes[graph_id]):
+			number_of_edges = 1
+			graphs.add_graph_node(graph_id, node_id, number_of_edges)
+
+	graphs.prepare_edge_configuration()
+
+	X = np.empty((number_of_examples, 2))
+	Y = np.empty(number_of_examples, dtype=np.uint32)
+
+	for graph_id in range(number_of_examples):
+		# Connect the two nodes in both directions.
+		for source_node_id, destination_node_id in ((0, 1), (1, 0)):
+			graphs.add_graph_node_edge(graph_id, source_node_id, destination_node_id, "Plain")
+
+		x1 = random.choice(symbols)
+		x2 = random.choice(symbols)
+		X[graph_id] = np.array([x1, x2])
+		Y[graph_id] = 0 if (x1 % 2) == (x2 % 2) else 1
+
+		graphs.add_graph_node_property(graph_id, 0, x1)
+		graphs.add_graph_node_property(graph_id, 1, x2)
+
+		if np.random.rand() <= noise:
+			Y[graph_id] = 1 - Y[graph_id]
+
+	graphs.encode()
+
+	return graphs, X, Y
+
+
+if __name__ == "__main__":
+	tm_params = {
+		"number_of_clauses": 1000,
+		"T": 2000,
+		"s": 1,
+		"message_size": 2048,
+		"message_bits": 2,
+		"double_hashing": True,
+		"depth": 2,
+		"grid": (16 * 13, 1, 1),
+		"block": (128, 1, 1),
+	}
+
+	epochs = 10
+	noise = 0.1
+	num_value = 100
+	batch_size = 10000
+	symbols = list(range(num_value))
+	graph_params = {
+		"number_of_graphs": 50000,
+		"hypervector_size": 2048,
+		"hypervector_bits": 2,
+		"double_hashing": True,
+		"symbols": symbols,
+	}
+	graphs_train, X_train, y_train = generate_graphs(symbols, noise, graph_params)
+
+	graphs_test, X_test, y_test = generate_graphs(
+		symbols,
+		0.0,
+		{
+			"number_of_graphs": 2000,
+			"init_with": graphs_train,
+		},
+	)
+
+	print("====================Training with graph splits====================")
+	tm = MultiClassGraphTsetlinMachine(**tm_params)
+	for i in range(epochs):
+		print(f"Epoch {i} ---------------------")
+		for b in range(0, y_train.shape[0], batch_size):
+			gsub = graphs_train[b : b + batch_size]
+			ysub = y_train[b : b + batch_size]
+			tm.fit(gsub, ysub, epochs=1, incremental=True)
+			result_sub = 100 * (tm.predict(gsub) == ysub).mean()
+			print(f" [Batch {b}-{b + batch_size}] Train Acc: {result_sub:.4f}")
+
+		pred_test = tm.predict(graphs_test)
+		result_test = 100 * (pred_test == y_test).mean()
+		result_train = 100 * (tm.predict(graphs_train) == y_train).mean()
+		print(f"[Graph Splits] Epoch {i} | Train Acc: {result_train:.4f}, Test Acc: {result_test:.4f}")
+
+	print("====================Training with original graphs====================")
+	tm2 = MultiClassGraphTsetlinMachine(**tm_params)
+	for i in range(epochs):
+		tm2.fit(graphs_train, y_train, epochs=1, incremental=True)
+
+		pred_test = tm2.predict(graphs_test)
+
+		result_test = 100 * (pred_test == y_test).mean()
+		result_train = 100 * (tm2.predict(graphs_train) == y_train).mean()
+
+		print(f"[Original Graphs] Epoch {i} | Train Acc: {result_train:.4f}, Test Acc: {result_test:.4f}")
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..68ca252a
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,36 @@
+[tool.pixi.workspace]
+channels = ["conda-forge"]
+platforms = ["linux-64"]
+
+[tool.pixi.tasks]
+
+[tool.pixi.system-requirements]
+cuda = "12"
+
+[tool.pixi.dependencies]
+python = ">=3.11,<3.12"
+numpy = ">=2.3.5,<3"
+mamba = ">=2.4.0,<3"
+cuda = ">=12.9.1,<13"
+sympy = ">=1.14.0,<2"
+numba = ">=0.62.1,<0.63"
+scipy = ">=1.16.3,<2"
+
+[tool.pixi.pypi-dependencies]
+graphtsetlinmachine = { path = ".", editable = true }
+pycuda = ">=2025.1.2, <2026"
+
+[tool.basedpyright]
+typeCheckingMode = "standard"
+reportUnusedImport = false +exclude = [ + "**/__pycache__*", + "**/.*", + "**/*.ipynb", +] + +[tool.ruff] +line-length = 160 + +[tool.ruff.format] +indent-style = "tab"