Skip to content

Commit a709d22

Browse files
committed
Make field notebook work again
1 parent 961906d commit a709d22

File tree

4 files changed

+38
-36
lines changed

4 files changed

+38
-36
lines changed

examples/kge-distmult-nations.ipynb renamed to examples/kge-distmult-nations-field.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@
347347
"outputs": [],
348348
"source": [
349349
"for index, row in predict_result.iterrows():\n",
350-
" h = row[\"head\"]\n",
350+
" h = row[\"sourceNodeId\"]\n",
351351
" r = row[\"rel\"]\n",
352352
" gds.run_cypher(\n",
353353
" f\"\"\"\n",
@@ -356,7 +356,7 @@
356356
" MATCH (b:Entity WHERE id(b) = t)\n",
357357
" MERGE (a)-[:NEW_REL_{r}]->(b)\n",
358358
" \"\"\",\n",
359-
" params={\"tt\": row[\"tail\"]},\n",
359+
" params={\"tt\": row[\"targetNodeIdTopK\"]},\n",
360360
" )"
361361
]
362362
},

examples/kge-distmult-nations.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -195,11 +195,11 @@ def inspect_graph(G):
195195
res = gds.kge.model.train(
196196
G_train,
197197
model_name=model_name,
198-
scoring_function="distmult",
199-
num_epochs=1,
200-
embedding_dimension=10,
198+
scoring_function="TransE",
199+
num_epochs=30,
200+
embedding_dimension=64,
201201
epochs_per_checkpoint=0,
202-
epochs_per_val=5,
202+
epochs_per_val=0,
203203
split_ratios={"TRAIN": 0.8, "VALID": 0.1, "TEST": 0.1},
204204
)
205205
print(res["metrics"])
@@ -218,7 +218,7 @@ def inspect_graph(G):
218218
print(predict_result.to_string())
219219

220220
for index, row in predict_result.iterrows():
221-
h = row["head"]
221+
h = row["sourceNodeId"]
222222
r = row["rel"]
223223
gds.run_cypher(
224224
f"""
@@ -227,7 +227,7 @@ def inspect_graph(G):
227227
MATCH (b:Entity WHERE id(b) = t)
228228
MERGE (a)-[:NEW_REL_{r}]->(b)
229229
""",
230-
params={"tt": row["tail"]},
230+
params={"tt": row["targetNodeIdTopK"]},
231231
)
232232

233233
brazil_node = gds.find_node_id(["Entity"], {"text": "brazil"})

graphdatascience/graph_data_science.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44
import sys
55
from typing import Any, Dict, Optional, Tuple, Type, Union
66

7-
import rsa
87
from neo4j import Driver
98
from pandas import DataFrame
109

10+
from graphdatascience.graph.graph_proc_runner import GraphProcRunner
11+
from graphdatascience.utils.util_proc_runner import UtilProcRunner
12+
1113
from .call_builder import IndirectCallBuilder
1214
from .endpoints import AlphaEndpoints, BetaEndpoints, DirectEndpoints
1315
from .error.uncallable_namespace import UncallableNamespace
@@ -16,8 +18,6 @@
1618
from .query_runner.neo4j_query_runner import Neo4jQueryRunner
1719
from .query_runner.query_runner import QueryRunner
1820
from .server_version.server_version import ServerVersion
19-
from graphdatascience.graph.graph_proc_runner import GraphProcRunner
20-
from graphdatascience.utils.util_proc_runner import UtilProcRunner
2121

2222

2323
class GraphDataScience(DirectEndpoints, UncallableNamespace):
@@ -53,11 +53,11 @@ def __init__(
5353
database: Optional[str], default None
5454
The Neo4j database to query against.
5555
arrow : Union[str, bool], default True
56-
Arrow connection information. This is either a bool or a string.
57-
If it is a string, it will be interpreted as a connection URL to a GDS Arrow Server.
58-
If it is a bool,
59-
True will make the client discover the connection URI to the GDS Arrow server via the Neo4j endpoint,
60-
while False will make the client use Bolt for all operations.
56+
Arrow connection information. This is either a string or a bool.
57+
- If it is a string, it will be interpreted as a connection URL to a GDS Arrow Server.
58+
- If it is a bool:
59+
- True will make the client discover the connection URI to the GDS Arrow server via the Neo4j endpoint.
60+
- False will make the client use Bolt for all operations.
6161
arrow_disable_server_verification : bool, default True
6262
A flag that overrides other TLS settings and disables server verification for TLS connections.
6363
arrow_tls_root_certs : Optional[bytes], default None
@@ -91,6 +91,7 @@ def __init__(
9191
# pub_key = rsa.PublicKey.load_pkcs1(f.read())
9292
# self._encrypted_db_password = rsa.encrypt(auth[1].encode(), pub_key).hex()
9393

94+
self._encrypted_db_password = None
9495
self._compute_cluster_ip = None
9596

9697
super().__init__(self._query_runner, "gds", self._server_version)

graphdatascience/model/kge_runner.py

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import time
55
from typing import Any, Dict, Optional
66

7-
import pandas as pd
7+
import pyarrow
88
import requests
99
from pandas import DataFrame, Series
1010

@@ -32,12 +32,13 @@ def __init__(
3232
self._namespace = namespace
3333
self._server_version = server_version
3434
self._compute_cluster_web_uri = f"http://{compute_cluster_ip}:5005"
35+
self._compute_cluster_arrow_uri = f"grpc://{compute_cluster_ip}:8815"
3536
self._compute_cluster_mlflow_uri = f"http://{compute_cluster_ip}:8080"
3637
self._encrypted_db_password = encrypted_db_password
3738
self._arrow_uri = arrow_uri
3839

3940
@property
40-
def model(self):
41+
def model(self) -> "KgeRunner":
4142
return self
4243

4344
# @compatible_with("stream", min_inclusive=ServerVersion(2, 5, 0))
@@ -75,7 +76,7 @@ def train(
7576
mlflow_experiment_name: Optional[str] = None,
7677
) -> Series:
7778
if epochs_per_checkpoint is None:
78-
epochs_per_checkpoint = max(num_epochs / 10, 1)
79+
epochs_per_checkpoint = max(int(num_epochs / 10), 1)
7980
if loss_function_kwargs is None:
8081
loss_function_kwargs = dict(margin=1.0, adversarial_temperature=1.0, gamma=20.0)
8182
if lr_scheduler_kwargs is None:
@@ -92,7 +93,7 @@ def train(
9293
}
9394
print(algo_config)
9495

95-
graph_config = {"name": G.name()}
96+
graph_config = {"name": G.name(), "config_type": "GdsGraphConfig"}
9697

9798
config = {
9899
"user_name": "DUMMY_USER",
@@ -133,7 +134,6 @@ def predict(
133134
rel_types: list[str],
134135
mlflow_experiment_name: Optional[str] = None,
135136
) -> DataFrame:
136-
137137
algo_config = {
138138
"top_k": top_k,
139139
"node_ids": node_ids,
@@ -144,8 +144,10 @@ def predict(
144144
"user_name": "DUMMY_USER",
145145
"task": "KGE_PREDICT_PYG",
146146
"task_config": {
147+
"graph_config": {"config_type": "GdsGraphConfig", "name": "NOGRAPH"},
147148
"modelname": model_name,
148149
"task_config": algo_config,
150+
"stream_rel_results": True,
149151
},
150152
"graph_arrow_uri": self._arrow_uri,
151153
}
@@ -162,7 +164,7 @@ def predict(
162164

163165
self._wait_for_job(job_id)
164166

165-
return self._stream_results(config["user_name"], config["task_config"]["modelname"], job_id)
167+
return self._stream_results(config, job_id)
166168

167169
@client_only_endpoint("gds.kge.model")
168170
def score_triplets(
@@ -171,7 +173,6 @@ def score_triplets(
171173
triplets: list[tuple[int, str, int]],
172174
mlflow_experiment_name: Optional[str] = None,
173175
) -> DataFrame:
174-
175176
algo_config = {
176177
"triplets": triplets,
177178
}
@@ -180,8 +181,10 @@ def score_triplets(
180181
"user_name": "DUMMY_USER",
181182
"task": "KGE_SCORE_TRIPLETS_PYG",
182183
"task_config": {
184+
"graph_config": {"config_type": "GdsGraphConfig", "name": "NOGRAPH"},
183185
"modelname": model_name,
184186
"task_config": algo_config,
187+
"stream_rel_results": True,
185188
},
186189
"graph_arrow_uri": self._arrow_uri,
187190
}
@@ -198,22 +201,20 @@ def score_triplets(
198201

199202
self._wait_for_job(job_id)
200203

201-
return self._stream_results(config["user_name"], config["task_config"]["modelname"], job_id)
204+
return self._stream_results(config, job_id)
202205

203-
def _stream_results(self, user_name: str, model_name: str, job_id: str) -> DataFrame:
204-
res = requests.get(
205-
f"{self._compute_cluster_web_uri}/internal/fetch-result",
206-
params={"user_name": user_name, "modelname": model_name, "job_id": job_id},
207-
)
208-
res.raise_for_status()
206+
def _stream_results(self, config: dict, job_id: str) -> DataFrame:
207+
client = pyarrow.flight.connect(self._compute_cluster_arrow_uri)
209208

210-
res_file_name = f"res_{job_id}.json"
211-
with open(res_file_name, mode="wb+") as f:
212-
f.write(res.content)
209+
if config["task_config"].get("stream_rel_results", False):
210+
upload_descriptor = pyarrow.flight.FlightDescriptor.for_path(f"{job_id}.relationships")
211+
else:
212+
raise ValueError("No results to fetch: need to set stream_rel_results or stream_graph_results to True")
213+
flight = client.get_flight_info(upload_descriptor)
214+
reader = client.do_get(flight.endpoints[0].ticket)
215+
read_table = reader.read_all()
213216

214-
df = pd.read_json(res_file_name, orient="records", lines=True)
215-
os.remove(res_file_name)
216-
return df
217+
return read_table.to_pandas()
217218

218219
def _get_metrics(self, user_name: str, model_name: str, job_id: str) -> DataFrame:
219220
res = requests.get(

0 commit comments

Comments
 (0)