#!/usr/bin/env bash

# Pipeline that coordinates anomaly detection using the Graph Data Science Library of Neo4j.
# It requires an already running Neo4j graph database with already scanned and analyzed artifacts.
# The results will be written into the sub directory reports/anomaly-detection.

# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.

# Requires executeQueryFunctions.sh, projectionFunctions.sh, cleanupAfterReportGeneration.sh

# Fail on any error ("errexit" = exit on first error, "pipefail" = exits on errors within piped commands)
set -o errexit -o pipefail

# Overrideable Constants (defaults also defined in sub scripts)
REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}

# Get the directory that contains this script if not already set.
# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)}
echo "anomalyDetectionPipeline: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}"
# Get the "scripts" directory by taking the path of this script and going two directories up.
SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/../../scripts"} # Repository directory containing the shell scripts
# Get the "cypher" query directories for gathering features and for running the anomaly queries.
ANOMALY_DETECTION_FEATURE_CYPHER_DIR=${ANOMALY_DETECTION_FEATURE_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/features"}
ANOMALY_DETECTION_QUERY_CYPHER_DIR=${ANOMALY_DETECTION_QUERY_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/queries"}
28+
# Function to display script usage.
#
# Writes the usage line to stderr, framed by the COLOR_ERROR and COLOR_DEFAULT
# escape sequences, and then terminates the script with exit status 1.
# COLOR_ERROR and COLOR_DEFAULT are not defined in this file; when they are
# unset they expand to empty strings and only blank lines are printed around
# the usage text.
usage() {
  echo -e "${COLOR_ERROR}" >&2
  echo "Usage: $0 [--verbose]" >&2
  echo -e "${COLOR_DEFAULT}" >&2
  exit 1
}
36+
# Default values
verboseMode="" # either "" or "--verbose"

# Parse command line arguments.
# Only the flag "--verbose" is supported. Any other argument prints an error
# and the usage text to stderr and terminates the script (see "usage").
while [[ $# -gt 0 ]]; do
  key="$1"

  case "${key}" in
    --verbose)
      verboseMode="--verbose"
      ;;
    *)
      echo -e "${COLOR_ERROR}anomalyDetectionPipeline: Error: Unknown option: ${key}${COLOR_DEFAULT}" >&2
      usage
      ;;
  esac
  shift || true # ignore error when there are no more arguments
done
56+
# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher"
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"

# Define functions to create and delete Graph Projections like "createUndirectedDependencyProjection"
source "${SCRIPTS_DIR}/projectionFunctions.sh"
62+
# Query or recalculate features.
#
# For every feature, "execute_cypher_queries_until_results" (defined outside
# this file) is called with the feature's "Exists" query followed by its
# "Write" query — presumably the "Write" query only runs when the "Exists"
# query returns no results; verify against executeQueryFunctions.sh.
#
# Globals read: ANOMALY_DETECTION_FEATURE_CYPHER_DIR
#
# Required Parameters (all passed through unchanged to every query):
# - projection_name=...
#   Name prefix for the in-memory projection name. Example: "package-anomaly-detection"
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - projection_weight_property=...
#   Name of the node property that contains the dependency weight. Example: "weight"
anomaly_detection_features() {
  local featureName

  # Features in calculation order:
  # - Betweenness: Betweenness centrality (with the directed graph projection)
  # - LocalClusteringCoefficient: local clustering coefficient
  # - PageRank: page rank
  # - ArticleRank: article rank
  #
  # Both query file names are derived from the same feature name so that an
  # "Exists" query can never be paired with the "Write" query of another
  # feature (ArticleRank was previously paired with the PageRank "Write" query).
  for featureName in Betweenness LocalClusteringCoefficient PageRank ArticleRank; do
    execute_cypher_queries_until_results \
      "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-${featureName}-Exists.cypher" \
      "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-${featureName}-Write.cypher" \
      "${@}"
  done
}
# Run queries to find anomalies in the graph.
#
# Each query file "AnomalyDetection<Name>.cypher" is executed with
# "execute_cypher" and its standard output is redirected into
# "<FULL_REPORT_DIRECTORY>/<node label>AnomalyDetection_<Name>.csv".
#
# Globals read: ANOMALY_DETECTION_QUERY_CYPHER_DIR, FULL_REPORT_DIRECTORY
#
# Required Parameters (all passed through unchanged to every query):
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
#   Its value is also used as the prefix of the report file names.
anomaly_detection_queries() {
  local nodeLabel
  # Declared separately from the assignment so that a failing
  # "extractQueryParameter" is not masked by the exit status of "local".
  nodeLabel=$(extractQueryParameter "projection_node_label" "${@}")

  local queryName
  for queryName in \
    PotentialImbalancedRoles \
    PotentialOverEngineerOrIsolated \
    HiddenBridgeNodes \
    PopularBottlenecks \
    SilentCoordinators \
    FragileStructuralBridges \
    UnexpectedCentralNodes; do
    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetection${queryName}.cypher" "${@}" \
      > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection_${queryName}.csv"
  done
}
104+
# TODO Remove notes:
# ✅ High betweenness + low degree                   → Hidden bottleneck or hub
# ✅ High PageRank + high betweenness                → Critical infrastructure component
#    High ArticleRank + high betweenness             → Complex orchestrator or manager
#    Low PageRank + high betweenness                 → Architecture smell or misplaced responsibility
# ✅ High betweenness + low clustering coefficient   → Potential boundary-spanning module (violates cohesion)
111+
112+
113+ # ✅🔸 1. Hidden Bridge Nodes
114+ # Primary: betweenness → DESC (high)
115+ # Secondary: pagerank → ASC (low)
116+ # 📁 File name: hidden_bridge_nodes.csv
117+ # 💡 Meaning: Mediates flow, but isn’t highly depended on — structural surprise.
118+
119+ # ✅🔸 2. Popular Bottlenecks
120+ # Primary: pagerank → DESC (high)
121+ # Secondary: betweenness → DESC (high)
122+ # 📁 File name: popular_bottlenecks.csv
123+ # 💡 Meaning: Key nodes that are both heavily depended on and control flow — critical hubs.
124+
125+ # ✅ 🔸 3. Silent Coordinators
126+ # Primary: betweenness → DESC
127+ # Secondary: in_degree → ASC
128+ # 📁 File name: silent_coordinators.csv
129+ # 💡 Meaning: Not many modules depend on them, yet they control lots of interactions — hidden complexity.
130+
131+ # 🔸 4. Over-referenced Utility Nodes
132+ # Primary: pagerank → DESC
133+ # Secondary: clustering_coefficient → ASC
134+ # 📁 File name: overused_utilities.csv
135+ # 💡 Meaning: Widely referenced, but loosely coupled in neighborhood — could be over-generalized or abused.
136+
137+ # ✅ 🔸 5. Fragile Structural Bridges
138+ # Primary: betweenness → DESC
139+ # Secondary: clustering_coefficient → ASC
140+ # 📁 File name: fragile_bridges.csv
141+ # 💡 Meaning: Connect otherwise unrelated parts of the graph — potential architectural risks.
142+
143+ # 🔸 6. Dependency Hungry Orchestrators
144+ # Primary: articlerank → DESC
145+ # Secondary: betweenness → DESC
146+ # 📁 File name: dependency_orchestrators.csv
147+ # 💡 Meaning: Nodes that depend on many others and also control flow — likely orchestrators or managers.
148+
149+ # 🔸 7. Layer Violation Candidates
150+ # Primary: articlerank → DESC
151+ # Secondary: pagerank → DESC
152+ # 📁 File name: layer_violation_candidates.csv
153+ # 💡 Meaning: Nodes that are both deeply dependent on others and highly referenced — may break clean layering.
154+
155+ # ✅🔸 8. Unexpected Central Nodes
156+ # Primary: betweenness → DESC
157+ # Secondary: degree → ASC
158+ # 📁 File name: unexpected_central_nodes.csv
159+ # 💡 Meaning: Low-degree nodes with high structural importance — often unexpected.
160+
161+ # 🔸 9. Popular but Isolated
162+ # Primary: pagerank → DESC
163+ # Secondary: clustering_coefficient → ASC
164+ # 📁 File name: popular_isolated_nodes.csv
165+ # 💡 Meaning: Nodes everyone uses, but which are in poorly connected local contexts.
166+
167+ # 🔸 10. Overconnected Internal Modules
168+ # Primary: clustering_coefficient → DESC
169+ # Secondary: degree → DESC
170+ # 📁 File name: tight_coupled_internal_modules.csv
171+ # 💡 Meaning: Densely connected neighborhoods — could be complex internal packages.
172+
# Run the anomaly detection pipeline.
#
# Steps, each one timed with the "time" keyword:
# 1. anomaly_detection_features
# 2. anomaly_detection_queries
# 3. tunedLeidenCommunityDetection.py
# 4. tunedNodeEmbeddingClustering.py
# Afterwards the query "AnomalyDetectionFeatures.cypher" is executed and its
# output is written to "<FULL_REPORT_DIRECTORY>/<node label>AnomalyDetectionFeatures.csv".
#
# Globals read: ANOMALY_DETECTION_SCRIPT_DIR, ANOMALY_DETECTION_FEATURE_CYPHER_DIR,
#               FULL_REPORT_DIRECTORY, verboseMode
#
# Required Parameters (all passed through unchanged to every step):
# - projection_name=...
#   Name prefix for the in-memory projection name. Example: "package-anomaly-detection"
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - projection_weight_property=...
#   Name of the node property that contains the dependency weight. Example: "weight"
anomaly_detection_pipeline() {
  time anomaly_detection_features "${@}"
  time anomaly_detection_queries "${@}"
  # ${verboseMode:+...} adds exactly one argument when verboseMode is non-empty
  # and no argument at all (instead of an empty one) when it is empty.
  # Get tuned Leiden communities as a reference to tune clustering
  time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedLeidenCommunityDetection.py" "${@}" ${verboseMode:+"${verboseMode}"}
  # Tuned Fast Random Projection and tuned HDBSCAN clustering
  time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedNodeEmbeddingClustering.py" "${@}" ${verboseMode:+"${verboseMode}"}

  # Query Results: Output all collected features into a CSV file.
  local nodeLabel
  nodeLabel=$(extractQueryParameter "projection_node_label" "${@}")
  execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" \
    > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetectionFeatures.csv"
}
195+
# Create report directory
REPORT_NAME="anomaly-detection"
FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
mkdir -p "${FULL_REPORT_DIRECTORY}"

# Query Parameter key pairs for projection and algorithm side.
# The PROJECTION_* keys are passed to the create...Projection functions,
# the ALGORITHM_* keys are passed to "anomaly_detection_pipeline".
PROJECTION_NAME="dependencies_projection"
ALGORITHM_PROJECTION="projection_name"

PROJECTION_NODE="dependencies_projection_node"
ALGORITHM_NODE="projection_node_label"

PROJECTION_WEIGHT="dependencies_projection_weight_property"
ALGORITHM_WEIGHT="projection_weight_property"

# Code independent algorithm parameters
COMMUNITY_PROPERTY="community_property=communityLeidenIdTuned"
213+
# Every section below follows the same scheme: when the undirected projection
# could be created (exit status 0), the directed projection is created as well
# and the anomaly detection pipeline runs for that node type. Otherwise the
# section is skipped.

# -- Java Artifact Anomaly Detection -----------------------------

if createUndirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight"; then
  createDirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection-directed" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight"
  anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}"
fi

# -- Java Package Anomaly Detection ------------------------------

if createUndirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces"; then
  createDirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection-directed" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces"
  anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${COMMUNITY_PROPERTY}"
fi

# -- Java Type Anomaly Detection ---------------------------------

if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then
  createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed"
  anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}"
fi

# -- Typescript Module Anomaly Detection -------------------------

# NOTE(review): this projection name ends in "-embedding" while all other
# sections use "-anomaly-detection" — confirm that this is intended and that it
# cannot clash with a projection of the same name created elsewhere.
if createUndirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight"; then
  createDirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding-directed" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight"
  anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${COMMUNITY_PROPERTY}"
fi

# ---------------------------------------------------------------

# Clean-up after report generation. Empty reports will be deleted.
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}"

echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished."
0 commit comments