cair · Mayur1009 · Nov 26, 2025 · Nov 26, 2025 · Nov 26, 2025
diff --git a/.editorconfig b/.editorconfig
@@ -0,0 +1,3 @@
+[*.py]
+indent_style = tab
+indent_size = 4
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,2 @@
+# SCM syntax highlighting & preventing 3-way merges
+pixi.lock merge=binary linguist-language=YAML linguist-generated=true
diff --git a/.gitignore b/.gitignore
@@ -162,3 +162,9 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+# pixi environments
+
+.pixi/*
+!.pixi/config.toml
+pixi.lock
+.envrc
diff --git a/GraphTsetlinMachine/graphs.py b/GraphTsetlinMachine/graphs.py
@@ -223,3 +223,83 @@ def encode(self):
 		self.signature = m.digest()
 
 		self.encoded = True
+
+
+	def __len__(self):
+		"""Return the number of graphs in this collection (``number_of_graphs``)."""
+		return self.number_of_graphs
+
+	def _get_indices(self, key) -> list[int]:
+		"""Normalise a graph selector into a list of non-negative graph ids.
+
+		Args:
+			key: An integer (Python ``int`` or NumPy integer scalar), a ``slice``,
+				or a ``list`` / ``np.ndarray`` of integers. Negative integers count
+				from the end, as with built-in sequences.
+
+		Returns:
+			The selected graph ids as plain Python ``int`` values, in selection order.
+
+		Raises:
+			IndexError: If an integer is outside ``[-number_of_graphs, number_of_graphs)``.
+			TypeError: If ``key`` or one of its elements is not an accepted index type.
+		"""
+		total = self.number_of_graphs
+
+		if isinstance(key, slice):
+			# slice.indices() already clamps to the valid range; no bounds check needed.
+			return list(range(*key.indices(total)))
+
+		if isinstance(key, (int, np.integer)):
+			candidates = [key]
+		elif isinstance(key, (list, np.ndarray)):
+			candidates = key
+		else:
+			raise TypeError("Invalid graph index type")
+
+		indices = []
+		for k in candidates:
+			# Reject floats, boolean-mask entries and nested sequences up front instead
+			# of letting them fail (or silently misbehave) during array indexing later.
+			if not isinstance(k, (int, np.integer)):
+				raise TypeError("Invalid graph index type")
+			# Convert to a Python int first so the wrap-around below cannot overflow a
+			# narrow NumPy dtype and so the result matches the list[int] annotation.
+			k = int(k)
+			if k < 0:
+				k += total
+			if k < 0 or k >= total:
+				raise IndexError("Graph index out of range")
+			indices.append(k)
+
+		return indices
+
+	def _create_subset(self, indices: list[int]):
+		"""Build a new ``Graphs`` object containing only the graphs listed in ``indices``.
+
+		The subset is created with ``init_with=self`` and the same ``double_hashing`` /
+		``one_hot_encoding`` flags, then filled in three passes (per-graph node counts,
+		node-level arrays, edge rows) separated by the subset's own
+		``prepare_node_configuration()`` / ``prepare_edge_configuration()`` calls, and
+		finally ``encode()`` is called on it.
+
+		Args:
+			indices: Graph ids in ``self`` to copy; position ``i`` in this list becomes
+				graph ``i`` of the subset.
+
+		Returns:
+			The newly built subset.
+		"""
+		subset = Graphs(
+			number_of_graphs=len(indices),
+			double_hashing=self.double_hashing,
+			one_hot_encoding=self.one_hot_encoding,
+			init_with=self,
+		)
+
+		# Copy number_of_graph_nodes and graph_node_id
+		# NOTE(review): graph_node_id entries are assigned by reference, not copied; if
+		# they are mutable containers the subset shares them with self - confirm this is intended.
+		for new_id, old_id in enumerate(indices):
+			subset.number_of_graph_nodes[new_id] = self.number_of_graph_nodes[old_id]
+			subset.graph_node_id[new_id] = self.graph_node_id[old_id]
+
+		# Must run after the node counts are set and before the node-level copy below;
+		# presumably it sizes node_index and the per-node arrays (defined outside this view).
+		subset.prepare_node_configuration()
+
+		# Copy node-level data
+		for new_id, old_id in enumerate(indices):
+			# Each graph's nodes occupy a contiguous run starting at node_index[graph].
+			old_start = self.node_index[old_id]
+			old_end = old_start + self.number_of_graph_nodes[old_id]
+			new_start = subset.node_index[new_id]
+			new_end = new_start + subset.number_of_graph_nodes[new_id]
+
+			subset.node_type[new_start:new_end] = self.node_type[old_start:old_end]
+			subset.number_of_graph_node_edges[new_start:new_end] = self.number_of_graph_node_edges[old_start:old_end]
+			subset.graph_node_edge_counter[new_start:new_end] = self.graph_node_edge_counter[old_start:old_end]
+			subset.X[new_start:new_end] = self.X[old_start:old_end]
+
+		# Prepare edge configuration for subset
+		# Runs after number_of_graph_node_edges is copied; presumably it derives
+		# edge_index and allocates subset.edge from those counts - TODO confirm.
+		subset.prepare_edge_configuration()
+
+		# Copy edge data
+		for new_id, old_id in enumerate(indices):
+			for node_id in range(self.number_of_graph_nodes[old_id]):
+				old_node_idx = self.node_index[old_id] + node_id
+				new_node_idx = subset.node_index[new_id] + node_id
+
+				old_edge_start = self.edge_index[old_node_idx]
+				# Number of rows copied comes from graph_node_edge_counter, not from
+				# number_of_graph_node_edges.
+				old_edge_count = self.graph_node_edge_counter[old_node_idx]
+				new_edge_start = subset.edge_index[new_node_idx]
+
+				subset.edge[new_edge_start : new_edge_start + old_edge_count] = self.edge[
+					old_edge_start : old_edge_start + old_edge_count
+				]
+
+		subset.encode()
+
+		return subset
+
+	def __getitem__(self, index):
+		"""Return a subset built from the graphs selected by ``index``.
+
+		``index`` is resolved by ``_get_indices`` and the resulting ids are passed to
+		``_create_subset``.
+
+		Raises:
+			ValueError: If the selection resolves to no graphs (e.g. an empty slice).
+		"""
+		selected = self._get_indices(index)
+		if not selected:
+			raise ValueError("No graphs selected")
+		return self._create_subset(selected)
diff --git a/GraphTsetlinMachine/tm.py b/GraphTsetlinMachine/tm.py
@@ -30,11 +30,8 @@
 
 import pycuda.curandom as curandom
 import pycuda.driver as cuda
-import pycuda.autoinit
+import pycuda.autoinit  # noqa: F401
 from pycuda.compiler import SourceModule
-from scipy.sparse import csr_matrix
-import sys
-from time import time
 
 g = curandom.XORWOWRandomNumberGenerator() 
 
@@ -314,22 +311,19 @@ def set_state(self, state):
 
 	def save(self, fname=""):
 		# Copy data from GPU to CPU
-		if np.array_equal(self.ta_state, np.array([])):
-			self.ta_state = np.empty(
-				self.number_of_clauses * self.number_of_ta_chunks * self.number_of_state_bits, dtype=np.uint32
-			)
-			cuda.memcpy_dtoh(self.ta_state, self.ta_state_gpu)
+		self.ta_state = np.empty(
+			self.number_of_clauses * self.number_of_ta_chunks * self.number_of_state_bits, dtype=np.uint32
+		)
+		cuda.memcpy_dtoh(self.ta_state, self.ta_state_gpu)
 
 		for depth in range(self.depth - 1):
-			if np.array_equal(self.message_ta_state[depth], np.array([])):
-				self.message_ta_state[depth] = np.empty(
-					self.number_of_clauses * self.number_of_message_chunks * self.number_of_state_bits, dtype=np.uint32
-				)
-				cuda.memcpy_dtoh(self.message_ta_state[depth], self.message_ta_state_gpu[depth])
+			self.message_ta_state[depth] = np.empty(
+				self.number_of_clauses * self.number_of_message_chunks * self.number_of_state_bits, dtype=np.uint32
+			)
+			cuda.memcpy_dtoh(self.message_ta_state[depth], self.message_ta_state_gpu[depth])
 
-		if np.array_equal(self.clause_weights, np.array([])):
-			self.clause_weights = np.empty(self.number_of_outputs * self.number_of_clauses, dtype=np.int32)
-			cuda.memcpy_dtoh(self.clause_weights, self.clause_weights_gpu)
+		self.clause_weights = np.empty(self.number_of_outputs * self.number_of_clauses, dtype=np.int32)
+		cuda.memcpy_dtoh(self.clause_weights, self.clause_weights_gpu)
 
 		state_dict = {
 			# State arrays

diff --git a/examples/test_graph_subsetting.py b/examples/test_graph_subsetting.py
@@ -0,0 +1,121 @@
+import random
+import numpy as np
+from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine
+from GraphTsetlinMachine.graphs import Graphs
+
+random.seed(42)
+
+def generate_graphs(symbols, noise, graph_args: dict):
+	"""Generate a two-node parity dataset as a ``Graphs`` object plus raw arrays.
+
+	Every graph has two nodes joined by one "Plain" edge in each direction. Each node
+	gets one property drawn with ``random.choice(symbols)``; the label is 0 when both
+	values have the same parity and 1 otherwise, and is flipped with probability
+	``noise``.
+
+	All randomness comes from the ``random`` module, so the module-level
+	``random.seed(...)`` call makes the generated data, including the label noise,
+	reproducible.
+
+	Args:
+		symbols: Sequence of integer symbols to sample node properties from.
+		noise: Probability in ``[0, 1]`` of flipping each label.
+		graph_args: Keyword arguments forwarded to ``Graphs``; must contain
+			``"number_of_graphs"``.
+
+	Returns:
+		Tuple ``(graphs, X, Y)``: the encoded ``Graphs`` object, an array of shape
+		``(number_of_graphs, 2)`` with the sampled values, and the ``uint32`` labels.
+	"""
+	graphs = Graphs(**graph_args)
+	number_of_examples = graph_args["number_of_graphs"]
+
+	for graph_id in range(number_of_examples):
+		graphs.set_number_of_graph_nodes(graph_id, 2)
+
+	graphs.prepare_node_configuration()
+
+	for graph_id in range(number_of_examples):
+		for node_id in range(graphs.number_of_graph_nodes[graph_id]):
+			number_of_edges = 1
+			graphs.add_graph_node(graph_id, node_id, number_of_edges)
+
+	graphs.prepare_edge_configuration()
+
+	X = np.empty((number_of_examples, 2))
+	Y = np.empty(number_of_examples, dtype=np.uint32)
+
+	for graph_id in range(number_of_examples):
+		edge_type = "Plain"
+		source_node_id = 0
+		destination_node_id = 1
+		graphs.add_graph_node_edge(graph_id, source_node_id, destination_node_id, edge_type)
+
+		source_node_id = 1
+		destination_node_id = 0
+		graphs.add_graph_node_edge(graph_id, source_node_id, destination_node_id, edge_type)
+
+		x1 = random.choice(symbols)
+		x2 = random.choice(symbols)
+		X[graph_id] = np.array([x1, x2])
+		if (x1 % 2) == (x2 % 2):
+			Y[graph_id] = 0
+		else:
+			Y[graph_id] = 1
+
+		graphs.add_graph_node_property(graph_id, 0, x1)
+		graphs.add_graph_node_property(graph_id, 1, x2)
+
+		# Draw from the seeded `random` module (not the unseeded np.random) so runs are
+		# repeatable. Strict `<` guarantees noise=0.0 never flips a label, while
+		# noise=1.0 always does because random.random() is in [0.0, 1.0).
+		if random.random() < noise:
+			Y[graph_id] = 1 - Y[graph_id]
+
+	graphs.encode()
+
+	return graphs, X, Y
+
+
+if __name__ == "__main__":
+	# Compares two training regimes on the same data: fitting on consecutive slices of
+	# the training graphs (graphs_train[a:b]) versus fitting on the full object.
+	tm_params = {
+		"number_of_clauses": 1000,
+		"T": 2000,
+		"s": 1,
+		"message_size": 2048,
+		"message_bits": 2,
+		"double_hashing": True,
+		"depth": 2,
+		"grid": (16 * 13, 1, 1),
+		"block": (128, 1, 1),
+	}
+
+	epochs = 10
+	noise = 0.1
+	num_value = 100
+	# Number of graphs passed to each incremental fit() call in the split run.
+	batch_size = 10000
+	symbols = list(range(num_value))
+	graph_params = {
+		"number_of_graphs": 50000,
+		"hypervector_size": 2048,
+		"hypervector_bits": 2,
+		"double_hashing": True,
+		"symbols": symbols,
+	}
+	graphs_train, X_train, y_train = generate_graphs(symbols, noise, graph_params)
+
+	# The test set is noise-free and is constructed with init_with=graphs_train.
+	graphs_test, X_test, y_test = generate_graphs(
+		symbols,
+		0.0,
+		{
+			"number_of_graphs": 2000,
+			"init_with": graphs_train,
+		},
+	)
+
+	print("====================Training with graph splits====================")
+	tm = MultiClassGraphTsetlinMachine(**tm_params)
+	for i in range(epochs):
+		print(f"Epoch {i} ---------------------")
+		for b in range(0, y_train.shape[0], batch_size):
+			gsub = graphs_train[b : b + batch_size]
+			ysub = y_train[b : b + batch_size]
+			tm.fit(gsub, ysub, epochs=1, incremental=True)
+			result_sub = 100 * (tm.predict(gsub) == ysub).mean()
+			print(f"  [Batch {b}-{b + batch_size}] Train Acc: {result_sub:.4f}")
+
+		pred_test = tm.predict(graphs_test)
+		result_test = 100 * (pred_test == y_test).mean()
+		result_train = 100 * (tm.predict(graphs_train) == y_train).mean()
+		print(f"[Graph Splits] Epoch {i} | Train Acc: {result_train:.4f}, Test Acc: {result_test:.4f}")
+
+	print("====================Training with original graphs====================")
+	tm2 = MultiClassGraphTsetlinMachine(**tm_params)
+	for i in range(epochs):
+		tm2.fit(graphs_train, y_train, epochs=1, incremental=True)
+
+		pred_test = tm2.predict(graphs_test)
+
+		result_test = 100 * (pred_test == y_test).mean()
+		result_train = 100 * (tm2.predict(graphs_train) == y_train).mean()
+
+		print(
+			f"[Original Graphs] Epoch {i} | Train Acc: {result_train:.4f}, Test Acc: {result_test:.4f}"
+		)
+
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,36 @@
+[tool.pixi.workspace]
+channels = ["conda-forge"]
+platforms = ["linux-64"]
+
+[tool.pixi.tasks]
+
+[tool.pixi.system-requirements]
+cuda = "12"
+
+[tool.pixi.dependencies]
+python = ">=3.11,<3.12"
+numpy = ">=2.3.5,<3"
+mamba = ">=2.4.0,<3"
+cuda = ">=12.9.1,<13"
+sympy = ">=1.14.0,<2"
+numba = ">=0.62.1,<0.63"
+scipy = ">=1.16.3,<2"
+
+[tool.pixi.pypi-dependencies]
+graphtsetlinmachine = { path = ".", editable = true }
+pycuda = ">=2025.1.2, <2026"
+
+[tool.basedpyright]
+typeCheckingMode = "standard"
+reportUnusedImport = false
+exclude = [
+    "**/__pycache__*",
+    "**/.*",
+    "**/*.ipynb",
+]
+
+[tool.ruff]
+line-length = 160
+
+[tool.ruff.format]
+indent-style = "tab"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# SCM syntax highlighting & preventing 3-way merges
		pixi.lock merge=binary linguist-language=YAML linguist-generated=true