Merge pull request #247 from jarbus/node2vec

yuehhua · web-flow · commit 16d43e561c26 · 2021-12-12T14:05:38.000+08:00
Node2vec prototype
diff --git a/Project.toml b/Project.toml
@@ -7,6 +7,7 @@ version = "0.8.0"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
+DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 GraphMLDatasets = "21828b05-d3b3-40ad-870e-a4bc2f52d5e8"
@@ -17,7 +18,9 @@ NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+Word2Vec = "c64b6f0f-98cd-51d1-af78-58ae84944834"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
@@ -27,17 +30,18 @@ DataStructures = "0.18"
 FillArrays = "0.12"
 Flux = "0.12"
 GraphMLDatasets = "0.1"
-GraphSignals = "0.3"
 Graphs = "1.4"
 NNlib = "0.7"
 NNlibCUDA = "0.1"
 Reexport = "1.1"
+Word2Vec = "0.5"
 Zygote = "0.6"
 julia = "1.6"
 
 [extras]
+Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["SparseArrays", "Test"]
+test = ["Clustering", "SparseArrays", "Test"]
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"
 
 [compat]
-Documenter = "0.24"
+Documenter = "0.27"
diff --git a/docs/bibliography.bib b/docs/bibliography.bib
@@ -0,0 +1,17 @@
+@inproceedings{node2vec2016,
+    author = {Grover, Aditya and Leskovec, Jure},
+    title = {Node2vec: Scalable Feature Learning for Networks},
+    year = {2016},
+    isbn = {9781450342322},
+    publisher = {Association for Computing Machinery},
+    address = {New York, NY, USA},
+    url = {https://doi.org/10.1145/2939672.2939754},
+    doi = {10.1145/2939672.2939754},
+    abstract = {Prediction tasks over nodes and edges in networks require careful effort in engineering features used by learning algorithms. Recent research in the broader field of representation learning has led to significant progress in automating prediction by learning the features themselves. However, present feature learning approaches are not expressive enough to capture the diversity of connectivity patterns observed in networks. Here we propose node2vec, an algorithmic framework for learning continuous feature representations for nodes in networks. In node2vec, we learn a mapping of nodes to a low-dimensional space of features that maximizes the likelihood of preserving network neighborhoods of nodes. We define a flexible notion of a node's network neighborhood and design a biased random walk procedure, which efficiently explores diverse neighborhoods. Our algorithm generalizes prior work which is based on rigid notions of network neighborhoods, and we argue that the added flexibility in exploring neighborhoods is the key to learning richer representations. We demonstrate the efficacy of node2vec over existing state-of-the-art techniques on multi-label classification and link prediction in several real-world networks from diverse domains. Taken together, our work represents a new way for efficiently learning state-of-the-art task-independent representations in complex networks.},
+    booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
+    pages = {855–864},
+    numpages = {10},
+    keywords = {node embeddings, information networks, graph representations, feature learning},
+    location = {San Francisco, California, USA},
+    series = {KDD '16}
+}
diff --git a/docs/make.jl b/docs/make.jl
@@ -1,7 +1,11 @@
 using Documenter
+using DocumenterCitations
 using GeometricFlux
 
+bib = CitationBibliography(joinpath(@__DIR__, "bibliography.bib"), sorting=:nyt)
+
 makedocs(
+    bib,
     sitename = "GeometricFlux.jl",
     format = Documenter.HTML(
       assets = ["assets/flux.css"],
@@ -24,7 +28,8 @@ makedocs(
                ["Convolutional Layers" => "manual/conv.md",
                 "Pooling Layers" => "manual/pool.md",
                 "Models" => "manual/models.md",
-                "Linear Algebra" => "manual/linalg.md"]
+                "Linear Algebra" => "manual/linalg.md"],
+             "References" => "references.md",
     ]
 )
 
diff --git a/docs/src/references.md b/docs/src/references.md
@@ -0,0 +1,4 @@
+# References
+
+```@bibliography
+```
diff --git a/examples/node2vec.jl b/examples/node2vec.jl
@@ -0,0 +1,39 @@
+using GeometricFlux
+using GraphSignals
+using Graphs
+using SparseArrays
+using Plots
+using GraphPlot
+using Clustering
+using Cairo, Compose
+
+# Reference cluster label of every node; the learned clustering is scored against it below.
+clusters = [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+
+# Fill colour used when drawing a node of the given cluster.
+int2col_str(x::Int) = x==1 ? "lightblue" : "red"
+
+
+g = smallgraph(:karate)
+fg = FeaturedGraph(g)
+vectors = node2vec(fg; walks_per_node=10, len=80, p=1.0, q=1.0)
+# kmeans clusters the columns of `vectors`, i.e. one point per node
+R = kmeans(vectors, 2)
+
+
+learned_clusters = copy(assignments(R))
+# ensure that the cluster containing node 1 is cluster 1
+if assignments(R)[1] != 1
+    learned_clusters = [i == 1 ? 2 : 1 for i in assignments(R)]
+end
+
+# Derive the node range from the graph instead of hard-coding its size.
+node_ids = 1:nv(g)
+
+output_plot_name = "karateclub.pdf"
+draw(
+    PDF(output_plot_name, 16cm, 16cm),
+    gplot(g,
+        nodelabel=map(string, node_ids),
+        nodefillc=[int2col_str(learned_clusters[i]) for i in node_ids],
+        nodestrokec=["white" for _ in node_ids]
+    )
+)
+
+incorrect = count(learned_clusters .!= clusters)
+println(incorrect, " incorrect cluster labelings")
+println("Drawn graph to ", output_plot_name)
diff --git a/src/GeometricFlux.jl b/src/GeometricFlux.jl
@@ -1,19 +1,24 @@
 module GeometricFlux
 
+using DelimitedFiles
+using SparseArrays
 using Statistics: mean
 using LinearAlgebra: Adjoint, norm, Transpose
+using Random
 using Reexport
 
 using CUDA
 using ChainRulesCore: @non_differentiable
 using FillArrays: Fill
 using Flux
 using Flux: glorot_uniform, leakyrelu, GRUCell, @functor
-using NNlib, NNlibCUDA
-using GraphSignals
+@reexport using GraphSignals
 using Graphs
+using NNlib, NNlibCUDA
 using Zygote
 
+import Word2Vec: word2vec, wordvectors, get_vector
+
 export
     # layers/graphlayers
     AbstractGraphLayer,
@@ -52,7 +57,10 @@ export
     bypass_graph,
 
     # utils
-    generate_cluster
+    generate_cluster,
+
+    #node2vec
+    node2vec
 
 include("datasets.jl")
 
@@ -67,6 +75,9 @@ include("layers/pool.jl")
 include("models.jl")
 include("layers/misc.jl")
 
+include("sampling.jl")
+include("embedding/node2vec.jl")
+
 include("cuda/conv.jl")
 
 using .Datasets
diff --git a/src/embedding/node2vec.jl b/src/embedding/node2vec.jl
@@ -0,0 +1,154 @@
+# Alias table for one discrete distribution, as produced by `alias_setup` and consumed by
+# `alias_sample`: the first vector holds the alias index `J` of every column, the second the
+# scaled probability `q` of keeping the column's own index.
+const Alias = Tuple{SparseVector{Int}, SparseVector{Float64}}
+
+"""
+    node2vec(g; walks_per_node, len, p, q, dims)
+
+Returns an embedding matrix with size of `nv(g)` x `dims`. It computes node embeddings
+on graph `g` accroding to node2vec [node2vec2016](@cite). It performs biased random walks on the graph,
+then computes word embeddings by treating those random walks as sentences.
+
+# Arguments
+
+- `g::FeaturedGraph`: The graph to perform random walk on.
+- `walks_per_node::Int`: Number of walks starting on each node,
+total number of walks is `nv(g) * walks_per_node`
+- `len::Int`: Length of random walks
+- `p::Real`: Return parameter from [node2vec2016](@cite)
+- `q::Real`: In-out parameter from [node2vec2016](@cite)
+- `dims::Int`: Number of vector dimensions
+"""
+function node2vec(g::FeaturedGraph; walks_per_node::Int=100, len::Int=5, p::Real=0.5, q::Real=0.5, dims::Int=128)
+    walks = simulate_walks(g; walks_per_node=walks_per_node, len=len, p=p, q=q)
+    model = walks2vec(walks,dims=dims)
+    vecs = []
+    println(typeof(model))
+    for i in 1:nv(g)
+        push!(vecs, get_vector(model, string(i)))
+    end
+    matrix = cat(vecs..., dims=2)
+    return matrix
+end
+
+"""
+Modified version of Node2Vec.learn_embeddings[1]. Uses
+a Julia interface[2] to the original word2vec C code[3].
+
+Treats each random walk like a sentence, and computed word
+embeddings using node ID as words.
+
+[1] https://github.com/ollin18/Node2Vec.jl
+[2] https://github.com/JuliaText/Word2Vec.jl
+[3] https://code.google.com/archive/p/word2vec/
+"""
+function walks2vec(walks::Vector{Vector{Int}}; dims::Int=100)
+    str_walks=map(x -> string.(x),walks)
+
+    if Sys.iswindows()
+        rpath = pwd()
+    else
+        rpath = "/tmp"
+    end
+    the_walks = joinpath(rpath,"str_walk.txt")
+    the_vecs = joinpath(rpath,"str_walk-vec.txt")
+
+    writedlm(the_walks,str_walks)
+    word2vec(the_walks,the_vecs,verbose=true,size=dims)
+    model=wordvectors(the_vecs)
+    rm(the_walks)
+    rm(the_vecs)
+    model
+end
+
+
+"""
+    Conducts a random walk over `g` in O(l) time,
+weighted by alias sampling probabilities `alias_nodes`
+and `alias_edges`.
+"""
+function node2vec_walk(
+    g::FeaturedGraph,
+    alias_nodes::Dict{Int, Alias},
+    alias_edges::Dict{Tuple{Int, Int}, Alias};
+    start_node::Int,
+    walk_length::Int)::Vector{Int}
+    walk::Vector{Int} = [start_node]
+    for _ in 2:walk_length
+        curr = walk[end]
+        cur_nbrs = sort(neighbors(g, curr; dir=:out))
+        if length(walk) == 1
+            push!(walk, cur_nbrs[alias_sample(alias_nodes[curr]...)])
+        else
+            prev = walk[end-1]
+            next = cur_nbrs[alias_sample(alias_edges[(prev, curr)]...)]
+            push!(walk, next)
+        end
+    end
+    return walk
+end
+
+"""
+Returns J and q for a given edge
+"""
+function get_alias_edge(g::FeaturedGraph, src::Int, dst::Int, p::Float64, q::Float64)::Alias
+    unnormalized_probs = spzeros(length(neighbors(g, dst; dir=:out)))
+    neighbor_weight_pairs = zip(weighted_outneighbors(g, dst)...)
+    for (i, (dst_nbr, weight)) in enumerate(neighbor_weight_pairs)
+        if dst_nbr == src
+            unnormalized_probs[i] = weight/p
+        elseif has_edge(g, dst_nbr, src)
+            unnormalized_probs[i] = weight
+        else
+            unnormalized_probs[i] = weight/q
+        end
+    end
+    normalized_probs = unnormalized_probs ./ sum(unnormalized_probs)
+    return alias_setup(normalized_probs)
+end
+
+# Returns the out-neighbors of node `i` together with the matching entries of row `i`
+# of the sparse matrix built from `graph(fg)`, as the tuple `(neighbors, weights)`.
+function weighted_outneighbors(fg::FeaturedGraph, i::Int)
+    out_nbrs = neighbors(fg, i; dir=:out)
+    adjacency = sparse(graph(fg))
+    return out_nbrs, adjacency[i, out_nbrs]
+end
+
+"""
+    Computes weighted probability transition aliases J and q for nodes and edges
+using return parameter `p` and In-out parameter `q`
+
+Implementation as specified in the node2vec paper [node2vec2016](@cite).
+"""
+function preprocess_modified_weights(g::FeaturedGraph, p::Real, q::Real)
+
+    alias_nodes = Dict{Int, Alias}()
+    alias_edges = Dict{Tuple{Int, Int}, Alias}()
+
+    for node in 1:nv(g)
+        nbrs = neighbors(g, node, dir=:out)
+        probs = fill(1, length(nbrs)) ./ length(nbrs)
+        alias_nodes[node] =  alias_setup(probs)
+    end
+    for (_, edge) in edges(g)
+        src, dst = edge
+        alias_edges[(src, dst)] = get_alias_edge(g, src, dst, p, q)
+        if !is_directed(g)
+            alias_edges[(dst, src)] = get_alias_edge(g, dst, src, p, q)
+        end
+    end
+    return alias_nodes, alias_edges
+end
+
+
+"""
+Given a graph, compute `walks_per_node` * nv(g) random walks.
+"""
+function simulate_walks(g::FeaturedGraph; walks_per_node::Int, len::Int, p::Real, q::Real)::Vector{Vector{Int}}
+    alias_nodes, alias_edges = preprocess_modified_weights(g, p, q)
+    walks = Vector{Int}[]
+    for _ in 1:walks_per_node
+        for node in shuffle(1:nv(g))
+            walk::Vector{Int} = node2vec_walk(g, alias_nodes, alias_edges; start_node=node, walk_length=len)
+            push!(walks, walk)
+        end
+    end
+    return walks
+end
diff --git a/src/sampling.jl b/src/sampling.jl
@@ -0,0 +1,55 @@
+"""
+    alias_setup(probs)
+
+Computes alias probabilities.
+"""
+alias_setup(probs::AbstractVector{<:Real}) = alias_setup(sparse(probs))
+
+function alias_setup(probs::SparseVector{<:Real})
+    K = length(probs)
+    J = spzeros(Int, K)
+    q = probs * K
+
+    smaller = Int[] # prob idxs < 1/K
+    larger = Int[]  # prob idxs >= 1/k
+
+    for i in 1:length(probs)
+        if q[i] < 1.0  # equivalent to prob < 1/K but saves the division
+            push!(smaller, i)
+        else
+            push!(larger, i)
+        end
+    end
+
+    while length(smaller) > 0 && length(larger) > 0
+        small = pop!(smaller)
+        large = pop!(larger)
+        J[small] = large
+        q[large] = q[large] + q[small] - 1.0
+        if q[large] < 1.0
+            push!(smaller, large)
+        else
+            push!(larger, large)
+        end
+    end
+
+    return J, q
+end
+
+"""
+    alias_sample(J, q)
+
+Alias Sampling first described in [1]. [2] might be a helpful resource to understand alias sampling.
+
+[1] A. Kronmal and A. V. Peterson. On the alias method for generating random variables from a
+    discrete distribution. The American Statistician, 33(4):214-218, 1979.
+[2] https://lips.cs.princeton.edu/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
+"""
+function alias_sample(J::AbstractVector{<:Integer}, q::AbstractVector{<:Real})
+    small_index = ceil(Int, rand() * length(J))
+    if rand() < q[small_index]
+        return small_index
+    else
+        return J[small_index]
+    end
+end
diff --git a/test/embedding/node2vec.jl b/test/embedding/node2vec.jl
diff --git a/test/runtests.jl b/test/runtests.jl
diff --git a/test/sampling.jl b/test/sampling.jl

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +# References
++
 +```@bibliography
 +```