
Commit 9dd8c22

Add v2 endpoint for hdbscan
committed
1 parent 228fc7b commit 9dd8c22

7 files changed: +1003 -2 lines changed
Lines changed: 334 additions & 0 deletions
@@ -0,0 +1,334 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Any

import pandas as pd

from graphdatascience.procedure_surface.api.base_result import BaseResult
from graphdatascience.procedure_surface.api.catalog.graph_api import GraphV2
from graphdatascience.procedure_surface.api.estimation_result import EstimationResult


class HdbscanEndpoints(ABC):

    @abstractmethod
    def mutate(
        self,
        G: GraphV2,
        node_property: str,
        mutate_property: str,
        *,
        leaf_size: int | None = None,
        samples: int | None = None,
        min_cluster_size: int | None = None,
        relationship_types: list[str] | None = None,
        node_labels: list[str] | None = None,
        concurrency: int | None = None,
        log_progress: bool = True,
        sudo: bool | None = None,
        job_id: Any | None = None,
        username: str | None = None,
    ) -> HdbscanMutateResult:
        """
        Runs the HDBSCAN algorithm and writes the cluster ID for each node back to the in-memory graph.

        The algorithm performs hierarchical density-based clustering on a node property,
        identifying clusters based on density reachability.

        Parameters
        ----------
        G : GraphV2
            The graph to run the algorithm on
        node_property : str
            The node property to use for clustering (required)
        mutate_property : str
            The name of the node property to write the cluster ID to
        leaf_size : int | None, default=None
            The maximum leaf size of the tree structure used in the algorithm
        samples : int | None, default=None
            The number of samples used for density estimation
        min_cluster_size : int | None, default=None
            The minimum size of clusters
        relationship_types : list[str] | None, default=None
            The relationship types used to select relationships for this algorithm run
        node_labels : list[str] | None, default=None
            The node labels used to select nodes for this algorithm run
        concurrency : int | None, default=None
            The number of concurrent threads
        log_progress : bool, default=True
            Whether to log progress
        sudo : bool | None, default=None
            Override memory estimation limits
        job_id : Any | None, default=None
            An identifier for the job
        username : str | None, default=None
            The username to attribute the procedure run to

        Returns
        -------
        HdbscanMutateResult
            The result containing statistics about the clustering and algorithm execution
        """

    @abstractmethod
    def stats(
        self,
        G: GraphV2,
        node_property: str,
        *,
        leaf_size: int | None = None,
        samples: int | None = None,
        min_cluster_size: int | None = None,
        relationship_types: list[str] | None = None,
        node_labels: list[str] | None = None,
        concurrency: int | None = None,
        log_progress: bool = True,
        sudo: bool | None = None,
        job_id: Any | None = None,
        username: str | None = None,
    ) -> HdbscanStatsResult:
        """
        Runs the HDBSCAN algorithm and returns only statistics about the clustering.

        This mode computes cluster assignments without writing them back to the graph,
        returning only execution statistics and cluster information.

        Parameters
        ----------
        G : GraphV2
            The graph to run the algorithm on
        node_property : str
            The node property to use for clustering (required)
        leaf_size : int | None, default=None
            The maximum leaf size of the tree structure used in the algorithm
        samples : int | None, default=None
            The number of samples used for density estimation
        min_cluster_size : int | None, default=None
            The minimum size of clusters
        relationship_types : list[str] | None, default=None
            The relationship types used to select relationships for this algorithm run
        node_labels : list[str] | None, default=None
            The node labels used to select nodes for this algorithm run
        concurrency : int | None, default=None
            The number of concurrent threads
        log_progress : bool, default=True
            Whether to log progress
        sudo : bool | None, default=None
            Override memory estimation limits
        job_id : Any | None, default=None
            An identifier for the job
        username : str | None, default=None
            The username to attribute the procedure run to

        Returns
        -------
        HdbscanStatsResult
            The result containing statistics about the clustering and algorithm execution
        """

    @abstractmethod
    def stream(
        self,
        G: GraphV2,
        node_property: str,
        *,
        leaf_size: int | None = None,
        samples: int | None = None,
        min_cluster_size: int | None = None,
        relationship_types: list[str] | None = None,
        node_labels: list[str] | None = None,
        concurrency: int | None = None,
        log_progress: bool = True,
        sudo: bool | None = None,
        job_id: Any | None = None,
        username: str | None = None,
    ) -> pd.DataFrame:
        """
        Runs the HDBSCAN algorithm and returns the cluster ID for each node as a DataFrame.

        The DataFrame contains the cluster assignment for each node, with noise points
        typically assigned to cluster -1.

        Parameters
        ----------
        G : GraphV2
            The graph to run the algorithm on
        node_property : str
            The node property to use for clustering (required)
        leaf_size : int | None, default=None
            The maximum leaf size of the tree structure used in the algorithm
        samples : int | None, default=None
            The number of samples used for density estimation
        min_cluster_size : int | None, default=None
            The minimum size of clusters
        relationship_types : list[str] | None, default=None
            The relationship types used to select relationships for this algorithm run
        node_labels : list[str] | None, default=None
            The node labels used to select nodes for this algorithm run
        concurrency : int | None, default=None
            The number of concurrent threads
        log_progress : bool, default=True
            Whether to log progress
        sudo : bool | None, default=None
            Override memory estimation limits
        job_id : Any | None, default=None
            An identifier for the job
        username : str | None, default=None
            The username to attribute the procedure run to

        Returns
        -------
        pd.DataFrame
            A DataFrame with columns 'nodeId' and 'label'
        """

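    # Hypothetical usage sketch (not part of this commit): assuming a concrete
    # implementation of these endpoints is available as gds.hdbscan and G is a
    # projected GraphV2 whose nodes carry an "embedding" property, streaming the
    # cluster assignments could look like:
    #
    #   clusters = gds.hdbscan.stream(G, node_property="embedding", min_cluster_size=5)
    #   clusters.groupby("label").size()  # cluster sizes; noise points appear under -1
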
    @abstractmethod
    def write(
        self,
        G: GraphV2,
        node_property: str,
        write_property: str,
        *,
        leaf_size: int | None = None,
        samples: int | None = None,
        min_cluster_size: int | None = None,
        relationship_types: list[str] | None = None,
        node_labels: list[str] | None = None,
        write_concurrency: int | None = None,
        concurrency: int | None = None,
        log_progress: bool = True,
        sudo: bool | None = None,
        job_id: Any | None = None,
        username: str | None = None,
    ) -> HdbscanWriteResult:
        """
        Runs the HDBSCAN algorithm and writes the cluster ID for each node back to the database.

        Parameters
        ----------
        G : GraphV2
            The graph to run the algorithm on
        node_property : str
            The node property to use for clustering (required)
        write_property : str
            The name of the node property to write the cluster ID to
        leaf_size : int | None, default=None
            The maximum leaf size of the tree structure used in the algorithm
        samples : int | None, default=None
            The number of samples used for density estimation
        min_cluster_size : int | None, default=None
            The minimum size of clusters
        relationship_types : list[str] | None, default=None
            The relationship types used to select relationships for this algorithm run
        node_labels : list[str] | None, default=None
            The node labels used to select nodes for this algorithm run
        write_concurrency : int | None, default=None
            The number of concurrent threads for writing
        concurrency : int | None, default=None
            The number of concurrent threads
        log_progress : bool, default=True
            Whether to log progress
        sudo : bool | None, default=None
            Override memory estimation limits
        job_id : Any | None, default=None
            An identifier for the job
        username : str | None, default=None
            The username to attribute the procedure run to

        Returns
        -------
        HdbscanWriteResult
            The result containing statistics about the clustering and algorithm execution
        """

    @abstractmethod
    def estimate(
        self,
        G: GraphV2,
        node_property: str,
        *,
        leaf_size: int | None = None,
        samples: int | None = None,
        min_cluster_size: int | None = None,
        relationship_types: list[str] | None = None,
        node_labels: list[str] | None = None,
        concurrency: int | None = None,
        log_progress: bool = True,
        sudo: bool | None = None,
        job_id: Any | None = None,
        username: str | None = None,
    ) -> EstimationResult:
        """
        Estimates memory requirements and other statistics for the HDBSCAN algorithm.

        This method provides memory estimation for the HDBSCAN algorithm without
        actually executing the clustering. It helps determine the computational requirements
        before running the actual clustering procedure.

        Parameters
        ----------
        G : GraphV2
            The graph to run the algorithm on
        node_property : str
            The node property to use for clustering (required)
        leaf_size : int | None, default=None
            The maximum leaf size of the tree structure used in the algorithm
        samples : int | None, default=None
            The number of samples used for density estimation
        min_cluster_size : int | None, default=None
            The minimum size of clusters
        relationship_types : list[str] | None, default=None
            The relationship types used to select relationships for this algorithm run
        node_labels : list[str] | None, default=None
            The node labels used to select nodes for this algorithm run
        concurrency : int | None, default=None
            The number of concurrent threads
        log_progress : bool, default=True
            Whether to log progress
        sudo : bool | None, default=None
            Override memory estimation limits
        job_id : Any | None, default=None
            An identifier for the job
        username : str | None, default=None
            The username to attribute the procedure run to

        Returns
        -------
        EstimationResult
            The estimation result containing memory requirements and other statistics
        """


class HdbscanMutateResult(BaseResult):
    compute_millis: int
    configuration: dict[str, Any]
    mutate_millis: int
    node_count: int
    node_properties_written: int
    number_of_clusters: int
    number_of_noise_points: int
    post_processing_millis: int
    pre_processing_millis: int


class HdbscanStatsResult(BaseResult):
    compute_millis: int
    configuration: dict[str, Any]
    node_count: int
    number_of_clusters: int
    number_of_noise_points: int
    post_processing_millis: int
    pre_processing_millis: int


class HdbscanWriteResult(BaseResult):
    compute_millis: int
    configuration: dict[str, Any]
    node_count: int
    node_properties_written: int
    number_of_clusters: int
    number_of_noise_points: int
    post_processing_millis: int
    pre_processing_millis: int
    write_millis: int
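For orientation, here is a minimal usage sketch of the new endpoints. It assumes a concrete HdbscanEndpoints implementation is reachable from a GDS client object; the gds.hdbscan attribute and the "embedding" and "cluster" property names are illustrative, not taken from this commit, and result fields are accessed as attributes as suggested by the BaseResult classes above.

# Hypothetical example; gds, G, and the property names below are assumptions.
memory = gds.hdbscan.estimate(G, node_property="embedding")          # EstimationResult
stats = gds.hdbscan.stats(G, node_property="embedding", samples=10)  # HdbscanStatsResult
print(stats.number_of_clusters, stats.number_of_noise_points)

# Persist cluster IDs to the in-memory graph, then to the database.
gds.hdbscan.mutate(G, node_property="embedding", mutate_property="cluster")
gds.hdbscan.write(G, node_property="embedding", write_property="cluster")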

0 commit comments