Fix coverage test

FlorentinD · FlorentinD · commit 701c5ec5ffea · 2025-10-16T09:28:19.000+02:00
* expose GraphSage train and predict endpoints behind a single combined interface (`graph_sage`)
* list the hdbscan endpoints as known-missing in the coverage test
diff --git a/graphdatascience/procedure_surface/api/node_embedding/graphsage_endpoints.py b/graphdatascience/procedure_surface/api/node_embedding/graphsage_endpoints.py
@@ -0,0 +1,203 @@
+from typing import Any
+
+from pandas import DataFrame
+
+from graphdatascience.procedure_surface.api.catalog.graph_api import GraphV2
+from graphdatascience.procedure_surface.api.estimation_result import EstimationResult
+from graphdatascience.procedure_surface.api.model.graphsage_model import GraphSageModelV2
+from graphdatascience.procedure_surface.api.node_embedding.graphsage_predict_endpoints import (
+    GraphSageMutateResult,
+    GraphSagePredictEndpoints,
+    GraphSageWriteResult,
+)
+from graphdatascience.procedure_surface.api.node_embedding.graphsage_train_endpoints import (
+    GraphSageTrainEndpoints,
+    GraphSageTrainResult,
+)
+
+
+class GraphSageEndpoints(GraphSageTrainEndpoints, GraphSagePredictEndpoints):
+    """
+    GraphSage API combining training and prediction; each call is forwarded to a wrapped endpoints object.
+    """
+
+    def __init__(
+        self,
+        train_endpoints: GraphSageTrainEndpoints,
+        predict_endpoints: GraphSagePredictEndpoints,
+    ) -> None:
+        self._train_endpoints = train_endpoints  # receives every train() call
+        self._predict_endpoints = predict_endpoints  # receives stream(), write(), mutate() and estimate() calls
+
+    def train(
+        self,
+        G: GraphV2,
+        model_name: str,
+        feature_properties: list[str],
+        *,
+        activation_function: Any | None = None,
+        negative_sample_weight: int | None = None,
+        embedding_dimension: int | None = None,
+        tolerance: float | None = None,
+        learning_rate: float | None = None,
+        max_iterations: int | None = None,
+        sample_sizes: list[int] | None = None,
+        aggregator: Any | None = None,
+        penalty_l2: float | None = None,
+        search_depth: int | None = None,
+        epochs: int | None = None,
+        projected_feature_dimension: int | None = None,
+        batch_sampling_ratio: float | None = None,
+        store_model_to_disk: bool | None = None,
+        relationship_types: list[str] | None = None,
+        node_labels: list[str] | None = None,
+        username: str | None = None,
+        log_progress: bool = True,
+        sudo: bool | None = None,
+        concurrency: Any | None = None,
+        job_id: Any | None = None,
+        batch_size: int | None = None,
+        relationship_weight_property: str | None = None,
+        random_seed: Any | None = None,
+    ) -> tuple[GraphSageModelV2, GraphSageTrainResult]:
+        return self._train_endpoints.train(  # arguments are forwarded unchanged
+            G,
+            model_name,
+            feature_properties,
+            activation_function=activation_function,
+            negative_sample_weight=negative_sample_weight,
+            embedding_dimension=embedding_dimension,
+            tolerance=tolerance,
+            learning_rate=learning_rate,
+            max_iterations=max_iterations,
+            sample_sizes=sample_sizes,
+            aggregator=aggregator,
+            penalty_l2=penalty_l2,
+            search_depth=search_depth,
+            epochs=epochs,
+            projected_feature_dimension=projected_feature_dimension,
+            batch_sampling_ratio=batch_sampling_ratio,
+            store_model_to_disk=store_model_to_disk,
+            relationship_types=relationship_types,
+            node_labels=node_labels,
+            username=username,
+            log_progress=log_progress,
+            sudo=sudo,
+            concurrency=concurrency,
+            job_id=job_id,
+            batch_size=batch_size,
+            relationship_weight_property=relationship_weight_property,
+            random_seed=random_seed,
+        )
+
+    def stream(
+        self,
+        G: GraphV2,
+        model_name: str,
+        *,
+        relationship_types: list[str] | None = None,
+        node_labels: list[str] | None = None,
+        username: str | None = None,
+        log_progress: bool = True,
+        sudo: bool | None = None,
+        concurrency: Any | None = None,
+        job_id: Any | None = None,
+        batch_size: int | None = None,
+    ) -> DataFrame:
+        return self._predict_endpoints.stream(  # arguments are forwarded unchanged
+            G,
+            model_name,
+            relationship_types=relationship_types,
+            node_labels=node_labels,
+            username=username,
+            log_progress=log_progress,
+            sudo=sudo,
+            concurrency=concurrency,
+            job_id=job_id,
+            batch_size=batch_size,
+        )
+
+    def write(
+        self,
+        G: GraphV2,
+        model_name: str,
+        write_property: str,
+        *,
+        relationship_types: list[str] | None = None,
+        node_labels: list[str] | None = None,
+        username: str | None = None,
+        log_progress: bool = True,
+        sudo: bool | None = None,
+        concurrency: Any | None = None,
+        write_concurrency: int | None = None,
+        job_id: Any | None = None,
+        batch_size: int | None = None,
+    ) -> GraphSageWriteResult:
+        return self._predict_endpoints.write(  # arguments are forwarded unchanged
+            G,
+            model_name,
+            write_property,
+            relationship_types=relationship_types,
+            node_labels=node_labels,
+            username=username,
+            log_progress=log_progress,
+            sudo=sudo,
+            concurrency=concurrency,
+            write_concurrency=write_concurrency,
+            job_id=job_id,
+            batch_size=batch_size,
+        )
+
+    def mutate(
+        self,
+        G: GraphV2,
+        model_name: str,
+        mutate_property: str,
+        relationship_types: list[str] | None = None,
+        node_labels: list[str] | None = None,
+        username: str | None = None,
+        log_progress: bool = True,
+        sudo: bool | None = None,
+        concurrency: Any | None = None,
+        job_id: Any | None = None,
+        batch_size: int | None = None,
+    ) -> GraphSageMutateResult:
+        return self._predict_endpoints.mutate(  # arguments are forwarded unchanged
+            G,
+            model_name,
+            mutate_property,
+            relationship_types=relationship_types,
+            node_labels=node_labels,
+            username=username,
+            log_progress=log_progress,
+            sudo=sudo,
+            concurrency=concurrency,
+            job_id=job_id,
+            batch_size=batch_size,
+        )
+
+    def estimate(
+        self,
+        G: GraphV2 | dict[str, Any],
+        model_name: str,
+        relationship_types: list[str] | None = None,
+        node_labels: list[str] | None = None,
+        batch_size: int | None = None,
+        concurrency: int | None = None,
+        log_progress: bool = True,
+        username: str | None = None,
+        sudo: bool | None = None,
+        job_id: str | None = None,
+    ) -> EstimationResult:
+        return self._predict_endpoints.estimate(  # forwarded to the predict endpoints, not the train endpoints
+            G,
+            model_name,
+            relationship_types=relationship_types,
+            node_labels=node_labels,
+            batch_size=batch_size,
+            concurrency=concurrency,
+            log_progress=log_progress,
+            username=username,
+            sudo=sudo,
+            job_id=job_id,
+        )
diff --git a/graphdatascience/procedure_surface/api/node_embedding/graphsage_predict_endpoints.py b/graphdatascience/procedure_surface/api/node_embedding/graphsage_predict_endpoints.py
@@ -79,7 +79,7 @@ def write(
         job_id: Any | None = None,
         batch_size: int | None = None,
     ) -> GraphSageWriteResult:
-        """ "
+        """
         Uses a pre-trained GraphSage model to predict embeddings for a graph and writes the results back to the database.
 
         Parameters
@@ -130,7 +130,7 @@ def mutate(
         job_id: Any | None = None,
         batch_size: int | None = None,
     ) -> GraphSageMutateResult:
-        """ "
+        """
         Uses a pre-trained GraphSage model to predict embeddings for a graph and writes the results back to the graph as a node property.
 
         Parameters
diff --git a/graphdatascience/session/session_v2_endpoints.py b/graphdatascience/session/session_v2_endpoints.py
@@ -13,6 +13,7 @@
 )
 from graphdatascience.procedure_surface.api.community.sllpa_endpoints import SllpaEndpoints
 from graphdatascience.procedure_surface.api.community.triangle_count_endpoints import TriangleCountEndpoints
+from graphdatascience.procedure_surface.api.node_embedding.graphsage_endpoints import GraphSageEndpoints
 from graphdatascience.procedure_surface.arrow.catalog_arrow_endpoints import CatalogArrowEndpoints
 from graphdatascience.procedure_surface.arrow.centrality.articlerank_arrow_endpoints import ArticleRankArrowEndpoints
 from graphdatascience.procedure_surface.arrow.centrality.articulationpoints_arrow_endpoints import (
@@ -122,15 +123,14 @@ def fast_rp(self) -> FastRPArrowEndpoints:
         return FastRPArrowEndpoints(self._arrow_client, self._write_back_client, show_progress=self._show_progress)
 
     @property
-    def graphsage_predict(self) -> GraphSagePredictArrowEndpoints:
-        return GraphSagePredictArrowEndpoints(
-            self._arrow_client, self._write_back_client, show_progress=self._show_progress
-        )
-
-    @property
-    def graphsage_train(self) -> GraphSageTrainArrowEndpoints:
-        return GraphSageTrainArrowEndpoints(
-            self._arrow_client, self._write_back_client, show_progress=self._show_progress
+    def graph_sage(self) -> GraphSageEndpoints:
+        return GraphSageEndpoints(  # new train/predict arrow endpoints are constructed on every property access
+            train_endpoints=GraphSageTrainArrowEndpoints(
+                self._arrow_client, self._write_back_client, show_progress=self._show_progress
+            ),
+            predict_endpoints=GraphSagePredictArrowEndpoints(
+                self._arrow_client, self._write_back_client, show_progress=self._show_progress
+            ),
         )
 
     @property
@@ -165,6 +165,7 @@ def label_propagation(self) -> LabelPropagationEndpoints:
             self._arrow_client, self._write_back_client, show_progress=self._show_progress
         )
 
+    @property
     def leiden(self) -> LeidenEndpoints:
         return LeidenArrowEndpoints(self._arrow_client, self._write_back_client, show_progress=self._show_progress)
 
diff --git a/graphdatascience/tests/integrationV2/procedure_surface/session/test_session_endpoint_coverage.py b/graphdatascience/tests/integrationV2/procedure_surface/session/test_session_endpoint_coverage.py
@@ -1,16 +1,17 @@
+import re
 from collections import defaultdict
 
 import pytest
-from pydantic.alias_generators import to_snake
 
 from graphdatascience import QueryRunner, ServerVersion
 from graphdatascience.arrow_client.authenticated_flight_client import AuthenticatedArrowClient
 from graphdatascience.session.aura_graph_data_science import AuraGraphDataScience
 from graphdatascience.session.session_v2_endpoints import SessionV2Endpoints
 
 MISSING_ALGO_ENDPOINTS = {
-    "embeddings.graphSage.train.estimate",  # TODO fix this by moving behind shared interface
-    "embeddings.graphSage.estimate",
+    "embeddings.graphSage.train.estimate",
+    "community.hdbscan",
+    "community.hdbscan.estimate",
     "similarity.knn.filtered",
     "similarity.knn.filtered.estimate",
     "similarity.nodeSimilarity.filtered",
@@ -44,37 +45,21 @@
     # centrality algos
     "betweenness": "betweenness_centrality",
     "celf": "influence_maximization_celf",
-    "celf.estimate": "influence_maximization_celf.estimate",
     "closeness": "closeness_centrality",
-    "closeness.estimate": "closeness_centrality.estimate",
     "degree": "degree_centrality",
-    "degree.estimate": "degree_centrality.estimate",
     "eigenvector": "eigenvector_centrality",
-    "eigenvector.estimate": "eigenvector_centrality.estimate",
     "harmonic": "harmonic_centrality",
-    "harmonic.estimate": "harmonic_centrality.estimate",
     "localClusteringCoefficient": "local_clustering_coefficient",
-    "localClusteringCoefficient.estimate": "local_clustering_coefficient.estimate",
     # community algos
+    "cliquecounting": "clique_counting",
     "k1coloring": "k1_coloring",
-    "k1coloring.estimate": "k1_coloring.estimate",
     "kcore": "k_core_decomposition",
-    "kcore.estimate": "k_core_decomposition.estimate",
     "maxkcut": "max_k_cut",
-    "maxkcut.estimate": "max_k_cut.estimate",
     "modularityOptimization": "modularity_optimization",
-    "modularityOptimization.estimate": "modularity_optimization.estimate",
-    "sllpa": "sllpa",
-    "sllpa.estimate": "sllpa.estimate",
-    "triangleCount": "triangle_count",
-    "triangleCount.estimate": "triangle_count.estimate",
     # embedding algos
     "fastrp": "fast_rp",
-    "fastrp.estimate": "fast_rp.estimate",
-    "graphSage": "graphsage_predict",
-    "graphSage.train": "graphsage_train",
+    "graphSage": "graphsage",
     "hashgnn": "hash_gnn",
-    "hashgnn.estimate": "hash_gnn.estimate",
 }
 
 
@@ -88,13 +73,24 @@ def gds(arrow_client: AuthenticatedArrowClient, db_query_runner: QueryRunner) ->
     )
 
 
+def to_snake(camel: str) -> str:
+    """Return `camel` as lower-case snake_case (pydantic's to_snake minus its digit handling)."""
+
+    # An upper-case run followed by a capitalised word is cut before that word's initial.
+    acronyms_split = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", camel)
+    # Every remaining lower-case -> upper-case boundary gets an underscore.
+    words_split = re.sub(r"([a-z])([A-Z])", r"\1_\2", acronyms_split)
+    # Hyphens count as word separators too (kebab-case input).
+    underscored = words_split.replace("-", "_")
+    return underscored.lower()
+
+
 def check_gds_v2_availability(endpoints: SessionV2Endpoints, algo: str) -> bool:
     """Check if an algorithm is available through gds.v2 interface"""
 
-    algo = ENDPOINT_MAPPINGS.get(algo, algo)
-
     algo_parts = algo.split(".")
     algo_parts = [to_snake(part) for part in algo_parts]
+    algo_parts = [ENDPOINT_MAPPINGS.get(part, part) for part in algo_parts]
 
     callable_object = endpoints
     for algo_part in algo_parts:
@@ -110,7 +106,6 @@ def check_gds_v2_availability(endpoints: SessionV2Endpoints, algo: str) -> bool:
 
 @pytest.mark.db_integration
 def test_algo_coverage(gds: AuraGraphDataScience) -> None:
-    """Test that all available Arrow actions are accessible through gds.v2"""
     arrow_client = gds.v2._arrow_client
 
     # Get all available Arrow actions
@@ -151,9 +146,9 @@ def test_algo_coverage(gds: AuraGraphDataScience) -> None:
     print(f"Available through gds.v2: {len(available_endpoints)}")
 
     # check if any previously missing algos are now available
-    assert not available_endpoints.intersection(MISSING_ALGO_ENDPOINTS), (
-        "Endpoints now available, please remove from MISSING_ALGO_ENDPOINTS"
-    )
+    newly_available_endpoints = available_endpoints.intersection(MISSING_ALGO_ENDPOINTS)
+    assert not newly_available_endpoints, "Endpoints now available, please remove from MISSING_ALGO_ENDPOINTS"
 
     # check missing endpoints against known missing algos
-    assert missing_endpoints.difference(MISSING_ALGO_ENDPOINTS), "Unexpectedly missing endpoints"
+    missing_endpoints = missing_endpoints.difference(MISSING_ALGO_ENDPOINTS)
+    assert not missing_endpoints, f"Unexpectedly missing endpoints {len(missing_endpoints)}"