Added score_triplets endpoint to the KGE model runner (gds.kge.model) and used it in the nations examples

orazve · orazve · commit d9b34bb8f293 · 2024-07-23T15:49:47.000+01:00
diff --git a/examples/kge-distmult-nations.ipynb b/examples/kge-distmult-nations.ipynb
@@ -13,7 +13,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8d9719b198c3fe8e",
+   "id": "9135277efcde2800",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -29,7 +29,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d4d82474217c5ca2",
+   "id": "1551fddc3a67fa5b",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -39,7 +39,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "c522b3dba2a0c1c9",
+   "id": "2f05ee7fdb496f84",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -57,7 +57,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "532f7596",
+   "id": "658c9f8369fff77e",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -70,7 +70,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "00757ac4",
+   "id": "bdbf4f91da4b9934",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -84,7 +84,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "6c9a1c4d",
+   "id": "485869468ad5ad2e",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -142,16 +142,16 @@
     "            f\"Number of relationships of type {rel_split}: \",\n",
     "            sum([len(dataset[rel_split][rel_type]) for rel_type in dataset[rel_split]]),\n",
     "        )\n",
-    "    return dataset\n",
+    "    return dataset, node_map\n",
     "\n",
     "\n",
-    "dataset = read_data()"
+    "dataset, node_map = read_data()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "e1cb98e4",
+   "id": "2032a4e1aed1bd5",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -160,7 +160,6 @@
     "    if res[\"num_nodes\"].values[0] > 0:\n",
     "        print(\"Data already in db, number of nodes: \", res[\"num_nodes\"].values[0])\n",
     "        return\n",
-    "    dataset = read_data()\n",
     "    pbar = tqdm(\n",
     "        desc=\"Putting data in db\",\n",
     "        total=sum([len(dataset[rel_split][rel_type]) for rel_split in dataset for rel_type in dataset[rel_split]]),\n",
@@ -198,7 +197,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0fceb15b",
+   "id": "5c4f1523a225fa3c",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -232,7 +231,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b4e2825a",
+   "id": "5d518e67375f6ab3",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -261,43 +260,53 @@
     "    rel_types=[\"REL_RELDIPLOMACY\", \"REL_RELNGO\"],\n",
     ")\n",
     "\n",
-    "print(predict_result.to_string())\n",
-    "#\n",
-    "# gds.kge.model.predict_tail(\n",
-    "#     G_train,\n",
-    "#     model_name=model_name,\n",
-    "#     top_k=10,\n",
-    "#     node_ids=[gds.find_node_id([\"Entity\"], {\"text\": \"/m/016wzw\"}), gds.find_node_id([\"Entity\"], {\"id\": 2})],\n",
-    "#     rel_types=[\"REL_1\", \"REL_2\"],\n",
-    "# )\n",
-    "#\n",
-    "# gds.kge.model.score_triples(\n",
-    "#     G_train,\n",
-    "#     model_name=model_name,\n",
-    "#     triples=[\n",
-    "#         (gds.find_node_id([\"Entity\"], {\"text\": \"/m/016wzw\"}), \"REL_1\", gds.find_node_id([\"Entity\"], {\"id\": 2})),\n",
-    "#         (gds.find_node_id([\"Entity\"], {\"id\": 0}), \"REL_123\", gds.find_node_id([\"Entity\"], {\"id\": 3})),\n",
-    "#     ],\n",
-    "# )"
+    "print(predict_result.to_string())"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "786eda29280ed31f",
+   "id": "83b75194c69259a2",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Create the dictionary"
+    "for index, row in predict_result.iterrows():\n",
+    "    h = row[\"head\"]\n",
+    "    r = row[\"rel\"]\n",
+    "    gds.run_cypher(\n",
+    "        f\"\"\"\n",
+    "        UNWIND $tt as t\n",
+    "        MATCH (a:Entity WHERE id(a) = {h})\n",
+    "        MATCH (b:Entity WHERE id(b) = t)\n",
+    "        MERGE (a)-[:NEW_REL_{r}]->(b)\n",
+    "    \"\"\",\n",
+    "        params={\"tt\": row[\"tail\"]},\n",
+    "    )"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "74c501f8fcb411eb",
+   "id": "b4e2825a",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "brazil_node = gds.find_node_id([\"Entity\"], {\"text\": \"brazil\"})\n",
+    "uk_node = gds.find_node_id([\"Entity\"], {\"text\": \"uk\"})\n",
+    "jordan_node = gds.find_node_id([\"Entity\"], {\"text\": \"jordan\"})\n",
+    "\n",
+    "triplets = [\n",
+    "    (brazil_node, \"REL_RELNGO\", uk_node),\n",
+    "    (brazil_node, \"REL_RELDIPLOMACY\", jordan_node),\n",
+    "]\n",
+    "\n",
+    "scores = gds.kge.model.score_triplets(\n",
+    "    model_name=model_name,\n",
+    "    triplets=triplets,\n",
+    ")\n",
+    "\n",
+    "print(scores)"
+   ]
   }
  ],
  "metadata": {},
diff --git a/examples/kge-distmult-nations.py b/examples/kge-distmult-nations.py
@@ -186,6 +186,8 @@ def inspect_graph(G):
     put_data_in_db(gds)
     G_train, G_valid, G_test = project_graphs(gds)
 
+    inspect_graph(G_train)
+
     gds.set_compute_cluster_ip("localhost")
 
     model_name = "dummyModelName_" + str(time.time())
@@ -197,10 +199,11 @@ def inspect_graph(G):
         num_epochs=1,
         embedding_dimension=10,
         epochs_per_checkpoint=0,
-        epochs_per_val=0,
+        epochs_per_val=5,
+        split_ratios={"TRAIN": 0.8, "VALID": 0.1, "TEST": 0.1},
     )
 
-    df = gds.kge.model.predict(
+    predict_result = gds.kge.model.predict(
         model_name=model_name,
         top_k=3,
         node_ids=[
@@ -211,7 +214,37 @@ def inspect_graph(G):
         rel_types=["REL_RELDIPLOMACY", "REL_RELNGO"],
     )
 
-    print(df.to_string())
+    print(predict_result.to_string())
+
+    print(predict_result.to_string())
+    for index, row in predict_result.iterrows():
+        h = row["head"]
+        r = row["rel"]
+        gds.run_cypher(
+            f"""
+            UNWIND $tt as t
+            MATCH (a:Entity WHERE id(a) = {h})
+            MATCH (b:Entity WHERE id(b) = t)
+            MERGE (a)-[:NEW_REL_{r}]->(b)
+        """,
+            params={"tt": row["tail"]},
+        )
+
+    brazil_node = gds.find_node_id(["Entity"], {"text": "brazil"})
+    uk_node = gds.find_node_id(["Entity"], {"text": "uk"})
+    jordan_node = gds.find_node_id(["Entity"], {"text": "jordan"})
+
+    triplets = [
+        (brazil_node, "REL_RELNGO", uk_node),
+        (brazil_node, "REL_RELDIPLOMACY", jordan_node),
+    ]
+
+    scores = gds.kge.model.score_triplets(
+        model_name=model_name,
+        triplets=triplets,
+    )
+
+    print(scores)
     #
     # gds.kge.model.predict_tail(
     #     G_train,
diff --git a/graphdatascience/graph_data_science.py b/graphdatascience/graph_data_science.py
@@ -17,7 +17,6 @@
 from .query_runner.query_runner import QueryRunner
 from .server_version.server_version import ServerVersion
 from graphdatascience.graph.graph_proc_runner import GraphProcRunner
-from graphdatascience.utils.util_proc_runner import UtilProcRunner
 
 
 class GraphDataScience(DirectEndpoints, UncallableNamespace):
diff --git a/graphdatascience/model/kge_runner.py b/graphdatascience/model/kge_runner.py
@@ -155,6 +155,41 @@ def predict(
 
         return self._stream_results(config["user_name"], config["task_config"]["modelname"], job_id)
 
+    @client_only_endpoint("gds.kge.model")
+    def score_triplets(
+        self,
+        model_name: str,
+        triplets: list[tuple[int, str, int]],
+        mlflow_experiment_name: Optional[str] = None,
+    ) -> DataFrame:
+
+        algo_config = {
+            "triplets": triplets,
+        }
+
+        config = {
+            "user_name": "DUMMY_USER",
+            "task": "KGE_SCORE_TRIPLETS_PYG",
+            "task_config": {
+                "modelname": model_name,
+                "task_config": algo_config,
+            },
+            "graph_arrow_uri": self._arrow_uri,
+        }
+        if self._encrypted_db_password is not None:
+            config["encrypted_db_password"] = self._encrypted_db_password
+
+        if mlflow_experiment_name is not None:
+            config["task_config"]["mlflow"] = {
+                "config": {"tracking_uri": self._compute_cluster_mlflow_uri, "experiment_name": mlflow_experiment_name}
+            }
+
+        job_id = self._start_job(config)
+
+        self._wait_for_job(job_id)
+
+        return self._stream_results(config["user_name"], config["task_config"]["modelname"], job_id)
+
     def _stream_results(self, user_name: str, model_name: str, job_id: str) -> DataFrame:
         res = requests.get(
             f"{self._compute_cluster_web_uri}/internal/fetch-result",
@@ -172,11 +207,10 @@ def _stream_results(self, user_name: str, model_name: str, job_id: str) -> DataF
 
     def _start_job(self, config: Dict[str, Any]) -> str:
         url = f"{self._compute_cluster_web_uri}/api/machine-learning/start"
-        print(url)
         res = requests.post(url, json=config)
         res.raise_for_status()
         job_id = res.json()["job_id"]
-        logging.info(f"Job with ID '{job_id}' started")
+        logging.info(f"Job '{config['task']}' with ID '{job_id}' started")
 
         return job_id