Skip to content

Commit bf8a758

Browse files
committed
Add anomaly detection Markdown summary report
1 parent cfa9f70 commit bf8a758

11 files changed

+392
-0
lines changed

domains/anomaly-detection/anomalyDetectionCsv.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ anomaly_detection_features() {
7070
# Required Parameters:
7171
# - projection_node_label=...
7272
# Label of the nodes that will be used for the projection. Example: "Package"
73+
# - projection_language=...
74+
# Name of the associated programming language. Default: "Java". Example: "Typescript"
7375
anomaly_detection_queries() {
7476
local nodeLabel
7577
nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
@@ -99,6 +101,8 @@ anomaly_detection_queries() {
99101
# Required Parameters:
100102
# - projection_node_label=...
101103
# Label of the nodes that will be used for the projection. Example: "Package"
104+
# - projection_language=...
105+
# Name of the associated programming language. Examples: "Java", "Typescript"
102106
anomaly_detection_labels() {
103107
local nodeLabel
104108
nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
@@ -129,6 +133,8 @@ anomaly_detection_labels() {
129133
# Label of the nodes that will be used for the projection. Example: "Package"
130134
# - projection_weight_property=...
131135
# Name of the node property that contains the dependency weight. Example: "weight"
136+
# - projection_language=...
137+
# Name of the associated programming language. Examples: "Java", "Typescript"
132138
anomaly_detection_csv_reports() {
133139
time anomaly_detection_features "${@}"
134140
time anomaly_detection_queries "${@}"
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// Anomaly Detection Summary: Summarizes all labelled archetypes by their anomaly score including examples. Requires all other labels/*.cypher queries to run first. Variables: projection_language, projection_node_label
2+
3+
MATCH (codeUnit)
4+
WHERE $projection_node_label IN labels(codeUnit)
5+
UNWIND keys(codeUnit) AS codeUnitProperty
6+
WITH *
7+
WHERE codeUnitProperty STARTS WITH 'anomaly'
8+
AND codeUnitProperty ENDS WITH 'Rank'
9+
WITH *
10+
,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
11+
,split(split(codeUnitProperty, 'anomaly')[1], 'Rank')[0] AS archetype
12+
,codeUnit[codeUnitProperty] AS archetypeRank
13+
,codeUnit.anomalyScore AS anomalyScore
14+
WITH *, collect(archetype)[0] AS archetype
15+
ORDER BY codeUnit.anomalyScore DESC, archetypeRank ASC, codeUnitName ASC, archetype ASC
16+
WITH archetype
17+
,anomalyScore
18+
,CASE WHEN codeUnit.anomalyScore <= 0 THEN 'Typical'
19+
WHEN codeUnit.anomalyTopFeature1 IS NULL THEN 'Undetermined'
20+
ELSE 'Anomalous' END AS modelStatus
21+
,codeUnitName
22+
RETURN archetype AS `Archetype`
23+
,count(DISTINCT codeUnitName) AS `Count`
24+
,round(max(anomalyScore), 4, 'HALF_UP') AS `Max. Score`
25+
,modelStatus AS `Model Status`
26+
,apoc.text.join(collect(DISTINCT codeUnitName)[0..3], ', ') AS `Examples`
27+
ORDER BY modelStatus, archetype, `Max. Score` DESC
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// Anomaly Detection DeepDive: Overview of analyzed code units and the number of anomalies detected. Requires all other labels/*.cypher queries to run first. Variables: projection_language, projection_node_label
2+
3+
MATCH (codeUnit)
4+
WHERE $projection_node_label IN labels(codeUnit)
5+
AND (codeUnit.incomingDependencies IS NOT NULL
6+
OR codeUnit.outgoingDependencies IS NOT NULL)
7+
WITH sum(codeUnit.anomalyLabel) AS anomalyCount
8+
,sum(sign(codeUnit.anomalyAuthorityRank)) AS authorityCount
9+
,sum(sign(codeUnit.anomalyBottleneckRank)) AS bottleNeckCount
10+
,sum(sign(codeUnit.anomalyBridgeRank)) AS bridgeCount
11+
,sum(sign(codeUnit.anomalyHubRank)) AS hubCount
12+
,sum(sign(codeUnit.anomalyOutlierRank)) AS outlierCount
13+
//,collect(codeUnit.name)[0..4] AS exampleNames
14+
RETURN anomalyCount AS `Anomalies`
15+
,authorityCount AS `Authorities`
16+
,bottleNeckCount AS `Bottlenecks`
17+
,bridgeCount AS `Bridges`
18+
,hubCount AS `Hubs`
19+
,outlierCount AS `Outliers`
20+
//,exampleNames
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// Anomaly Detection Summary: Overview of all analyzed code units in total. Requires all other labels/*.cypher queries to run first. Variables: none (covers all code units with dependency properties, regardless of language or node label)
2+
3+
MATCH (codeUnit)
4+
WHERE (codeUnit.incomingDependencies IS NOT NULL
5+
OR codeUnit.outgoingDependencies IS NOT NULL)
6+
WITH count(DISTINCT codeUnit) AS codeUnitCount
7+
,sum(codeUnit.anomalyLabel) AS anomalyCount
8+
,sum(sign(codeUnit.anomalyAuthorityRank)) AS authorityCount
9+
,sum(sign(codeUnit.anomalyBottleneckRank)) AS bottleNeckCount
10+
,sum(sign(codeUnit.anomalyBridgeRank)) AS bridgeCount
11+
,sum(sign(codeUnit.anomalyHubRank)) AS hubCount
12+
,sum(sign(codeUnit.anomalyOutlierRank)) AS outlierCount
13+
//,collect(codeUnit.name)[0..4] AS exampleNames
14+
RETURN codeUnitCount AS `Analyzed Units`
15+
,anomalyCount AS `Anomalies`
16+
,authorityCount AS `Authorities`
17+
,bottleNeckCount AS `Bottlenecks`
18+
,bridgeCount AS `Bridges`
19+
,hubCount AS `Hubs`
20+
,outlierCount AS `Outliers`
21+
//,exampleNames
22+
ORDER BY anomalyCount DESC, codeUnitCount DESC
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Anomaly Detection Summary: Overview of analyzed code units and the number of anomalies detected, grouped by their node labels (abstraction level). Requires all other labels/*.cypher queries to run first. Variables: none
2+
3+
MATCH (codeUnit)
4+
WHERE (codeUnit.incomingDependencies IS NOT NULL
5+
OR codeUnit.outgoingDependencies IS NOT NULL)
6+
UNWIND labels(codeUnit) AS codeUnitLabel
7+
WITH *
8+
WHERE NOT codeUnitLabel STARTS WITH 'Mark4'
9+
AND NOT codeUnitLabel IN ['File', 'Directory', 'ByteCode', 'GenericDeclaration']
10+
WITH collect(codeUnitLabel) AS codeUnitLabels
11+
,codeUnit
12+
WITH apoc.text.join(codeUnitLabels, ',') AS codeUnitLabels
13+
,count(DISTINCT codeUnit) AS codeUnitCount
14+
,sum(codeUnit.anomalyLabel) AS anomalyCount
15+
,sum(sign(codeUnit.anomalyAuthorityRank)) AS authorityCount
16+
,sum(sign(codeUnit.anomalyBottleneckRank)) AS bottleNeckCount
17+
,sum(sign(codeUnit.anomalyBridgeRank)) AS bridgeCount
18+
,sum(sign(codeUnit.anomalyHubRank)) AS hubCount
19+
,sum(sign(codeUnit.anomalyOutlierRank)) AS outlierCount
20+
//,collect(codeUnit.name)[0..4] AS exampleNames
21+
RETURN codeUnitLabels AS `Abstraction Level`
22+
,codeUnitCount AS `Units`
23+
,anomalyCount AS `Anomalies`
24+
,authorityCount AS `Authorities`
25+
,bottleNeckCount AS `Bottlenecks`
26+
,bridgeCount AS `Bridges`
27+
,hubCount AS `Hubs`
28+
,outlierCount AS `Outliers`
29+
//,exampleNames
30+
ORDER BY anomalyCount DESC, codeUnitCount DESC
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// Anomaly Detection Labels: Summarizes all labelled archetypes by their anomaly score including their archetype rank. For code units with more than one archetype, the one with the higher rank is shown. Requires all other labels/*.cypher queries to run first. Variables: projection_language, projection_node_label
2+
3+
MATCH (codeUnit)
4+
WHERE $projection_node_label IN labels(codeUnit)
5+
UNWIND keys(codeUnit) AS codeUnitProperty
6+
WITH *
7+
WHERE codeUnitProperty STARTS WITH 'anomaly'
8+
AND codeUnitProperty ENDS WITH 'Rank'
9+
WITH *
10+
,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
11+
,split(split(codeUnitProperty, 'anomaly')[1], 'Rank')[0] AS archetype
12+
,codeUnit[codeUnitProperty] AS archetypeRank
13+
,codeUnit.anomalyScore AS anomalyScore
14+
ORDER BY codeUnit.anomalyScore DESC, archetypeRank ASC, codeUnitName ASC, archetype ASC
15+
OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
16+
WITH *, artifact.name AS artifactName
17+
OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
18+
WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
19+
OPTIONAL MATCH (codeDirectory:File:Directory)-[:CONTAINS]->(codeUnit)
20+
WITH *, split(replace(codeDirectory.fileName, './', ''), '/')[-2] AS directoryName
21+
WITH *, coalesce(artifactName, projectName, directoryName, "") AS projectName
22+
RETURN projectName AS `Contained in`
23+
//$projection_language + ' ' + $projection_node_label AS `Code Unit`
24+
,codeUnitName AS `Name`
25+
,round(anomalyScore, 4, 'HALF_UP') AS `Score`
26+
,collect(archetype)[0] AS `Archetype`
27+
,collect(archetypeRank)[0] AS `Archetype Rank`
28+
,nullif(codeUnit.anomalyTopFeature1, "") AS `Top Feature 1`
29+
,nullif(round(codeUnit.anomalyTopFeatureSHAPValue1, 4, 'HALF_UP'), 0.0) AS `Top Feature 1 SHAP`
30+
,nullif(codeUnit.anomalyTopFeature2, "") AS `Top Feature 2`
31+
,nullif(round(codeUnit.anomalyTopFeatureSHAPValue2, 4, 'HALF_UP'), 0.0) AS `Top Feature 2 SHAP`
32+
,nullif(codeUnit.anomalyTopFeature3, "") AS `Top Feature 3`
33+
,nullif(round(codeUnit.anomalyTopFeatureSHAPValue3, 4, 'HALF_UP'), 0.0) AS `Top Feature 3 SHAP`
34+
,CASE WHEN codeUnit.anomalyScore <= 0 THEN 'Typical'
35+
WHEN codeUnit.anomalyTopFeature1 IS NULL THEN 'Undetermined'
36+
ELSE 'Anomalous' END AS `Model Status`
37+
//,collect(archetype)[1] AS secondaryArchetype
38+
//,collect(archetypeRank)[1] AS secondaryArchetypeRank
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
#!/usr/bin/env bash
2+
3+
# Creates a Markdown report that contains all results of all the anomaly detection methods.
4+
# It requires an already running Neo4j graph database with already scanned and analyzed artifacts.
5+
# The results will be written into the sub directory reports/anomaly-detection.
6+
7+
# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.
8+
# Note that either "anomalyDetectionCsv.sh" or "anomalyDetectionPython.sh" is required to run prior to this script.
9+
10+
# Requires executeQueryFunctions.sh, projectionFunctions.sh, cleanupAfterReportGeneration.sh
11+
12+
# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands)
13+
set -o errexit -o pipefail
14+
15+
# Overrideable Constants (defaults also defined in sub scripts)
16+
REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}
17+
MARKDOWN_INCLUDES_DIRECTORY=${MARKDOWN_INCLUDES_DIRECTORY:-"includes"}
18+
19+
## Get this "domains/anomaly-detection/summary" directory if not already set
20+
# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
21+
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
22+
# This way non-standard tools like readlink aren't needed.
23+
ANOMALY_DETECTION_SUMMARY_DIR=${ANOMALY_DETECTION_SUMMARY_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)}
24+
#echo "anomalyDetectionSummary: ANOMALY_DETECTION_SUMMARY_DIR=${ANOMALY_DETECTION_SUMMARY_DIR}"
25+
# Get the "scripts" directory by taking the path of this script and going three directories up (to the repository root).
26+
SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SUMMARY_DIR}/../../../scripts"} # Repository directory containing the shell scripts
27+
28+
MARKDOWN_INCLUDES_DIRECTORY="includes"
29+
MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-"${SCRIPTS_DIR}/markdown"}
30+
#echo "anomalyDetectionSummary: MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR}" >&2
31+
32+
# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" and "execute_cypher_summarized"
33+
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
34+
35+
# Appends a Markdown table to an existing file and
36+
# removes redundant header + separator rows.
37+
#
38+
# Usage:
39+
# cat newTable.md | append_to_markdown_table myMarkdownFile.md
40+
#
41+
# append_to_markdown_table myMarkdownFile.md <<'EOF'
42+
# | Name | Score | Archetype |
43+
# | --- | --- | --- |
44+
# | Bar | 0.9 | Something |
45+
# EOF
46+
#
47+
# Behavior:
48+
# - Keeps the first header row and its following separator row.
49+
# - Removes all subsequent duplicate header + separator pairs.
50+
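# - Leaves unique data rows untouched. Note: any line that exactly repeats an earlier line (after the first two) is dropped, so identical data rows are deduplicated as well.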
51+
append_to_markdown_table() {
52+
local file="$1"
53+
54+
# Append stdin to the target file
55+
cat >> "${file}"
56+
57+
# Clean up duplicate headers (header row + --- row)
58+
awk '!seen[$0]++ || NR <= 2' "${file}" > "${file}.tmp" && mv "${file}.tmp" "${file}"
59+
}
60+
61+
# Run the anomaly detection main report generation.
62+
anomaly_detection_report_first_section() {
63+
local report_markdown_includes_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}"
64+
mkdir -p "${report_markdown_includes_directory}"
65+
66+
execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesPerAbstractionLayer.cypher" --output-markdown-table > "${report_markdown_includes_directory}/AnomaliesPerAbstractionLayer.md"
67+
execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesInTotal.cypher" --output-markdown-table > "${report_markdown_includes_directory}/AnomaliesInTotal.md"
68+
}
69+
70+
# Aggregates all results in a Markdown report.
71+
#
72+
# Required Parameters:
73+
# - projection_node_label=...
74+
# Label of the nodes that will be used for the projection. Example: "Package"
75+
# - projection_language=...
76+
# Name of the associated programming language. Examples: "Java", "Typescript"
77+
anomaly_detection_deep_dive_report() {
78+
local nodeLabel
79+
nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
80+
81+
local language
82+
language=$( extractQueryParameter "projection_language" "${@}" )
83+
84+
local report_number
85+
report_number=$( extractQueryParameter "report_number" "${@}" )
86+
87+
echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Creating ${language} ${nodeLabel} anomaly summary Markdown report..."
88+
89+
detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
90+
mkdir -p "${detail_report_directory}/${MARKDOWN_INCLUDES_DIRECTORY}"
91+
92+
# TODO 2.{number of report}
93+
echo "### 2.${report_number} ${language} ${nodeLabel}" > "${detail_report_directory}/${MARKDOWN_INCLUDES_DIRECTORY}/DeepDiveSectionTitle.md"
94+
cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report-no-data-info-template.md" "${detail_report_directory}/${MARKDOWN_INCLUDES_DIRECTORY}"
95+
cp -f "${detail_report_directory}/Top_anomaly_features.md" "${detail_report_directory}/${MARKDOWN_INCLUDES_DIRECTORY}" || true
96+
97+
execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesDeepDiveOverview.cypher" "${@}" --output-markdown-table > "${detail_report_directory}/${MARKDOWN_INCLUDES_DIRECTORY}/DeepDiveOverview.md"
98+
execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesDeepDiveArchetypes.cypher" "${@}" --output-markdown-table > "${detail_report_directory}/${MARKDOWN_INCLUDES_DIRECTORY}/DeepDiveArchetypes.md"
99+
execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomalyDetectionReportTopArchetypes.cypher" "${@}" --output-markdown-table > "${detail_report_directory}/${MARKDOWN_INCLUDES_DIRECTORY}/TopAnomaliesByArchetype.md"
100+
101+
# Use Markdown template to assemble the final deep dive section of the Markdown report
102+
cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report-deep-dive-template.md" "${detail_report_directory}"
103+
cat "${detail_report_directory}/report-deep-dive-template.md" | "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh" "${detail_report_directory}/${MARKDOWN_INCLUDES_DIRECTORY}" > "${detail_report_directory}/report-deep-dive-${report_number}.md"
104+
rm -rf "${detail_report_directory}/report-deep-dive-template.md"
105+
106+
# Clean-up after report generation. Empty reports will be deleted.
107+
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${detail_report_directory}"
108+
}
109+
110+
# Run the anomaly detection report generation.
111+
#
112+
# Required Parameters:
113+
# - projection_node_label=...
114+
# Label of the nodes that will be used for the projection. Example: "Package"
115+
# - projection_language=...
116+
# Name of the associated programming language. Examples: "Java", "Typescript"
117+
anomaly_detection_report() {
118+
time anomaly_detection_deep_dive_report "${@}"
119+
}
120+
121+
# Create report directory
122+
REPORT_NAME="anomaly-detection"
123+
FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
124+
mkdir -p "${FULL_REPORT_DIRECTORY}"
125+
126+
# Query Parameter key pairs for projection and algorithm side
127+
ALGORITHM_NODE="projection_node_label"
128+
ALGORITHM_LANGUAGE="projection_language"
129+
REPORT_NUMBER="report_number"
130+
131+
# -- Overview Report for all code types ------------------------------
132+
133+
anomaly_detection_report_first_section
134+
135+
# -- Detail Reports for each code type -------------------------------
136+
137+
anomaly_detection_report "${REPORT_NUMBER}=1" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_LANGUAGE}=Java"
138+
anomaly_detection_report "${REPORT_NUMBER}=2" "${ALGORITHM_NODE}=Package" "${ALGORITHM_LANGUAGE}=Java"
139+
anomaly_detection_report "${REPORT_NUMBER}=3" "${ALGORITHM_NODE}=Type" "${ALGORITHM_LANGUAGE}=Java"
140+
anomaly_detection_report "${REPORT_NUMBER}=4" "${ALGORITHM_NODE}=Module" "${ALGORITHM_LANGUAGE}=Typescript"
141+
142+
# ---------------------------------------------------------------
143+
144+
echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished."
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
<!-- include:DeepDiveSectionTitle.md -->
2+
3+
#### Metrics and Features
4+
5+
* Degree (in/out)
6+
* PageRank, ArticleRank, PageRank–ArticleRank difference
7+
* Betweenness centrality
8+
* Local clustering coefficient
9+
* Cluster metrics (distance-to-medoid, average/max radius, outlier score)
10+
* Node embeddings (PCA-reduced)
11+
12+
#### Anomaly Results
13+
14+
##### Total anomalies
15+
16+
<!-- include:DeepDiveOverview.md -->
17+
18+
* **Top contributing features (via SHAP):**
19+
20+
<!-- include:Top_anomaly_features.md|report-no-data-info-template.md -->
21+
22+
#### Archetype Distribution
23+
24+
<!-- include:DeepDiveArchetypes.md|report-no-data-info-template.md -->
25+
26+
#### Plots
27+
28+
* SHAP summary plots
29+
* Betweenness distribution histogram
30+
* Scatter: PageRank vs Clustering Coefficient
31+
* Cluster outlier visualization
32+
33+
---
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
| ⚠️ No data available |
2+
|----------------------|

0 commit comments

Comments
 (0)