JohT
diff --git a/‎domains/anomaly-detection/umap2dNodeEmbeddings.py‎
Lines changed: 211 additions & 0 deletions b/‎domains/anomaly-detection/umap2dNodeEmbeddings.py‎
Lines changed: 211 additions & 0 deletions
@@ -0,0 +1,211 @@
+#!/usr/bin/env python
+
+# This Python script uses UMAP (https://umap-learn.readthedocs.io) to reduce the dimensionality of node embeddings to two dimensions for visualization purposes.
+# This is useful to get a visual intuition about the structure of the code units (like Java packages) and their dependencies.
+# The resulting 2D coordinates are written back to Neo4j for further use.
+
+# Prerequisite:
+# - Provide the password for Neo4j in the environment variable "NEO4J_INITIAL_PASSWORD".
+# - Already existing Graph with analyzed code units (like Java Packages) and their dependencies.
+# - Already existing node embeddings for the code units, e.g. generated by Fast Random Projection (FastRP) or other algorithms.
+
+import typing
+
+import os
+import sys
+import argparse
+import pprint
+
+import pandas as pd
+import numpy as np
+
+from neo4j import GraphDatabase, Driver
+import umap
+
+
+class Parameters:
+    required_parameters_ = ["projection_node_label"]
+
+    def __init__(self, input_parameters: typing.Dict[str, str], verbose: bool = False):
+        self.query_parameters_ = input_parameters.copy()  # copy enforces immutability
+        self.verbose_ = verbose
+
+    def __repr__(self):
+        pretty_dict = pprint.pformat(self.query_parameters_, indent=4)
+        return f"Parameters: verbose={self.verbose_}, query_parameters:\n{pretty_dict}"
+
+    @staticmethod
+    def log_dependency_versions_() -> None:
+        print('---------------------------------------')
+
+        print('Python version: {}'.format(sys.version))
+
+        from numpy import __version__ as numpy_version
+        print('numpy version: {}'.format(numpy_version))
+
+        from pandas import __version__ as pandas_version
+        print('pandas version: {}'.format(pandas_version))
+
+        from neo4j import __version__ as neo4j_version
+        print('neo4j version: {}'.format(neo4j_version))
+
+        from umap import __version__ as umap_version
+        print('umap version: {}'.format(umap_version))
+
+        print('---------------------------------------')
+
+    @classmethod
+    def from_input_parameters(cls, input_parameters: typing.Dict[str, str], verbose: bool = False):
+        """
+        Creates a Parameters instance from a dictionary of input parameters.
+        The dictionary must contain the following keys:
+         - "projection_node_label": The node type of the projection.
+        """
+        missing_parameters = [parameter for parameter in cls.required_parameters_ if parameter not in input_parameters]
+        if missing_parameters:
+            raise ValueError("Missing parameters:", missing_parameters)
+        created_parameters = cls(input_parameters, verbose)
+        if created_parameters.is_verbose():
+            print(created_parameters)
+            cls.log_dependency_versions_()
+        return created_parameters
+
+    @classmethod
+    def example(cls):
+        return cls(dict(projection_node_label="Package"))
+
+    def get_query_parameters(self) -> typing.Dict[str, str]:
+        return self.query_parameters_.copy()  # copy enforces immutability
+
+    def clone_with_projection_name(self, projection_name: str):
+        updated_parameter = self.get_query_parameters()
+        updated_parameter.update({"projection_name": projection_name})
+        return Parameters(updated_parameter)
+
+    def get_projection_node_label(self) -> str:
+        return self.query_parameters_["projection_node_label"]
+
+    def is_verbose(self) -> bool:
+        return self.verbose_
+
+
+def parse_input_parameters() -> Parameters:
+    # Convert list of "key=value" strings to a dictionary
+    def parse_key_value_list(param_list: typing.List[str]) -> typing.Dict[str, str]:
+        param_dict = {}
+        for item in param_list:
+            if '=' in item:
+                key, value = item.split('=', 1)
+                param_dict[key] = value
+        return param_dict
+
+    parser = argparse.ArgumentParser(
+        description="Unsupervised clustering to assign labels to code units (Java packages, types,...) and their dependencies based on how structurally similar they are within a software system.")
+    parser.add_argument('--verbose', action='store_true', help='Enable verbose mode to log all details')
+    parser.add_argument('query_parameters', nargs='*', type=str, help='List of key=value Cypher query parameters')
+    parser.set_defaults(verbose=False)
+    args = parser.parse_args()
+    return Parameters.from_input_parameters(parse_key_value_list(args.query_parameters), args.verbose)
+
+
+def get_graph_database_driver() -> Driver:
+    driver = GraphDatabase.driver(
+        uri="bolt://localhost:7687",
+        auth=("neo4j", os.environ.get("NEO4J_INITIAL_PASSWORD"))
+    )
+    driver.verify_connectivity()
+    return driver
+
+
+def query_cypher_to_data_frame(query: typing.LiteralString, parameters: typing.Optional[typing.Dict[str, typing.Any]] = None):
+    records, summary, keys = driver.execute_query(query, parameters_=parameters)
+    return pd.DataFrame([record.values() for record in records], columns=keys)
+
+
+def write_batch_data_into_database(dataframe: pd.DataFrame, node_label: str, id_column: str = "nodeElementId", batch_size: int = 1000):
+    """
+    Writes the given dataframe to the Neo4j database using a batch write operation.
+
+    Parameters:
+    - dataframe: The pandas DataFrame to write.
+    - label: The label to use for the nodes in the Neo4j database.
+    - id_column: The name of the column in the DataFrame that contains the node IDs.
+    - cypher_query_file: The file containing the Cypher query for writing the data.
+    - batch_size: The number of rows to write in each batch.
+    """
+    def prepare_rows(dataframe):
+        rows = []
+        for _, row in dataframe.iterrows():
+            properties_without_id = row.drop(labels=[id_column]).to_dict()
+            rows.append({
+                "nodeId": row[id_column],
+                "properties": properties_without_id
+            })
+        return rows
+
+    def update_batch(transaction, rows):
+        query = """
+            UNWIND $rows AS row
+            MATCH (codeUnit)
+            WHERE elementId(codeUnit) = row.nodeId
+            AND $node_label IN labels(codeUnit) 
+            SET codeUnit += row.properties
+        """
+        transaction.run(query, rows=rows, node_label=node_label)
+
+    with driver.session() as session:
+        for start in range(0, len(dataframe), batch_size):
+            batch_dataframe = dataframe.iloc[start:start + batch_size]
+            batch_rows = prepare_rows(batch_dataframe)
+            return session.execute_write(update_batch, batch_rows)
+
+
+def prepare_node_embeddings_for_2d_visualization(embeddings: pd.DataFrame) -> pd.DataFrame:
+    """
+    Reduces the dimensionality of the node embeddings (e.g. 64 floating point numbers in an array)
+    to two dimensions for 2D visualization using UMAP.
+    see https://umap-learn.readthedocs.io
+    """
+
+    if embeddings.empty: 
+        print("No projected data for node embeddings dimensionality reduction available")
+        return embeddings
+
+    # Convert the list of embeddings to a numpy array
+    embeddings_as_numpy_array = np.array(embeddings.embedding.to_list())
+
+    # Use UMAP to reduce the dimensionality to 2D for visualization
+    umap_reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=47, n_jobs=1, verbose=parameters.is_verbose())
+    two_dimension_node_embeddings = umap_reducer.fit_transform(embeddings_as_numpy_array)
+
+    # Add the 2D coordinates to the DataFrame
+    embeddings['embeddingVisualizationX'] = two_dimension_node_embeddings[:, 0]
+    embeddings['embeddingVisualizationY'] = two_dimension_node_embeddings[:, 1]
+
+    return embeddings
+
+
+# ------------------------------------------------------------------------------------------------------------
+#  MAIN
+# ------------------------------------------------------------------------------------------------------------
+
+parameters = parse_input_parameters()
+driver = get_graph_database_driver()
+
+cypher_query_embeddings_: typing.LiteralString = """
+   MATCH (codeUnit)
+   WHERE $projection_node_label IN labels(codeUnit)
+     AND codeUnit.embeddingsFastRandomProjectionForClustering  IS NOT NULL
+  RETURN elementId(codeUnit)                                   AS nodeElementId
+         ,codeUnit.embeddingsFastRandomProjectionForClustering AS embedding
+    """
+
+embeddings = query_cypher_to_data_frame(cypher_query_embeddings_, parameters.get_query_parameters())
+embeddings = prepare_node_embeddings_for_2d_visualization(embeddings)
+
+data_to_write = pd.DataFrame(data={
+    'nodeElementId': embeddings["nodeElementId"],
+    'embeddingFastRandomProjectionVisualizationX': embeddings["embeddingVisualizationX"],
+    'embeddingFastRandomProjectionVisualizationY': embeddings["embeddingVisualizationY"],
+})
+write_batch_data_into_database(data_to_write, parameters.get_projection_node_label())