refactor

yuehhua · yuehhua · commit 3b24f6eddfed · 2021-12-12T13:58:08.000+08:00
fix example and deps

reorder using

use bib for doc
diff --git a/Project.toml b/Project.toml
@@ -6,7 +6,6 @@ version = "0.8.0"
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
@@ -35,12 +34,14 @@ Graphs = "1.4"
 NNlib = "0.7"
 NNlibCUDA = "0.1"
 Reexport = "1.1"
+Word2Vec = "0.5"
 Zygote = "0.6"
 julia = "1.6 - 1.7"
 
 [extras]
+Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["SparseArrays", "Test"]
+test = ["Clustering", "SparseArrays", "Test"]
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"
 
 [compat]
-Documenter = "0.24"
+Documenter = "0.27"
diff --git a/docs/bibliography.bib b/docs/bibliography.bib
@@ -0,0 +1,17 @@
+@inproceedings{node2vec2016,
+    author = {Grover, Aditya and Leskovec, Jure},
+    title = {Node2vec: Scalable Feature Learning for Networks},
+    year = {2016},
+    isbn = {9781450342322},
+    publisher = {Association for Computing Machinery},
+    address = {New York, NY, USA},
+    url = {https://doi.org/10.1145/2939672.2939754},
+    doi = {10.1145/2939672.2939754},
+    abstract = {Prediction tasks over nodes and edges in networks require careful effort in engineering features used by learning algorithms. Recent research in the broader field of representation learning has led to significant progress in automating prediction by learning the features themselves. However, present feature learning approaches are not expressive enough to capture the diversity of connectivity patterns observed in networks. Here we propose node2vec, an algorithmic framework for learning continuous feature representations for nodes in networks. In node2vec, we learn a mapping of nodes to a low-dimensional space of features that maximizes the likelihood of preserving network neighborhoods of nodes. We define a flexible notion of a node's network neighborhood and design a biased random walk procedure, which efficiently explores diverse neighborhoods. Our algorithm generalizes prior work which is based on rigid notions of network neighborhoods, and we argue that the added flexibility in exploring neighborhoods is the key to learning richer representations. We demonstrate the efficacy of node2vec over existing state-of-the-art techniques on multi-label classification and link prediction in several real-world networks from diverse domains. Taken together, our work represents a new way for efficiently learning state-of-the-art task-independent representations in complex networks.},
+    booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
+    pages = {855–864},
+    numpages = {10},
+    keywords = {node embeddings, information networks, graph representations, feature learning},
+    location = {San Francisco, California, USA},
+    series = {KDD '16}
+}
diff --git a/docs/make.jl b/docs/make.jl
@@ -1,7 +1,11 @@
 using Documenter
+using DocumenterCitations
 using GeometricFlux
 
+bib = CitationBibliography(joinpath(@__DIR__, "bibliography.bib"), sorting=:nyt)
+
 makedocs(
+    bib,
     sitename = "GeometricFlux.jl",
     format = Documenter.HTML(
       assets = ["assets/flux.css"],
@@ -24,7 +28,8 @@ makedocs(
                ["Convolutional Layers" => "manual/conv.md",
                 "Pooling Layers" => "manual/pool.md",
                 "Models" => "manual/models.md",
-                "Linear Algebra" => "manual/linalg.md"]
+                "Linear Algebra" => "manual/linalg.md"],
+             "References" => "references.md",
     ]
 )
 
diff --git a/docs/src/references.md b/docs/src/references.md
@@ -0,0 +1,4 @@
+# References
+
+```@bibliography
+```
diff --git a/src/GeometricFlux.jl b/src/GeometricFlux.jl
@@ -1,23 +1,22 @@
 module GeometricFlux
 
+using DelimitedFiles
+using SparseArrays
 using Statistics: mean
 using LinearAlgebra: Adjoint, norm, Transpose
+using Random
 using Reexport
 
 using CUDA
 using ChainRulesCore: @non_differentiable
 using FillArrays: Fill
 using Flux
 using Flux: glorot_uniform, leakyrelu, GRUCell, @functor
-using NNlib, NNlibCUDA
 @reexport using GraphSignals
 using Graphs
-using Random
+using NNlib, NNlibCUDA
 using Zygote
-using SparseArrays
-using DelimitedFiles
 
-import Graphs: neighbors, is_directed, has_edge
 import Word2Vec: word2vec, wordvectors, get_vector
 
 export
@@ -76,8 +75,8 @@ include("layers/pool.jl")
 include("models.jl")
 include("layers/misc.jl")
 
-include("graph_embedding/sampling.jl")
-include("graph_embedding/node2vec.jl")
+include("sampling.jl")
+include("embedding/node2vec.jl")
 
 include("cuda/conv.jl")
 
diff --git a/src/embedding/node2vec.jl b/src/embedding/node2vec.jl
@@ -3,23 +3,19 @@ const Alias = Tuple{SparseVector{Int}, SparseVector{Float64}}
 """
     node2vec(g; walks_per_node, len, p, q, dims)
 
-Computes node embeddings on graph `g`, as per [1]. Performs biased random walks on the graph,
-then computes word embeddings by treating those random walks like sentences.
-
-Returns an nv(g) x dims matrix of embeddings
+Returns an embedding matrix with size of `nv(g)` x `dims`. It computes node embeddings
+on graph `g` according to node2vec [node2vec2016](@cite). It performs biased random walks on the graph,
+then computes word embeddings by treating those random walks as sentences.
 
 # Arguments
 
 - `g::FeaturedGraph`: The graph to perform random walk on.
 - `walks_per_node::Int`: Number of walks starting on each node,
 total number of walks is `nv(g) * walks_per_node`
 - `len::Int`: Length of random walks
-- `p::Real`: Return parameter from [1]
-- `q::Real`: In-out parameter from [1]
+- `p::Real`: Return parameter from [node2vec2016](@cite)
+- `q::Real`: In-out parameter from [node2vec2016](@cite)
 - `dims::Int`: Number of vector dimensions
-
-
-[1] https://cs.stanford.edu/~jure/pubs/node2vec-kdd16.pdf
 """
 function node2vec(g::FeaturedGraph; walks_per_node::Int=100, len::Int=5, p::Real=0.5, q::Real=0.5, dims::Int=128)
     walks = simulate_walks(g; walks_per_node=walks_per_node, len=len, p=p, q=q)
@@ -44,8 +40,7 @@ embeddings using node ID as words.
 [2] https://github.com/JuliaText/Word2Vec.jl
 [3] https://code.google.com/archive/p/word2vec/
 """
-function walks2vec(walks::Vector{Vector{Int}};dims::Int=100)
-
+function walks2vec(walks::Vector{Vector{Int}}; dims::Int=100)
     str_walks=map(x -> string.(x),walks)
 
     if Sys.iswindows()
@@ -56,7 +51,6 @@ function walks2vec(walks::Vector{Vector{Int}};dims::Int=100)
     the_walks = joinpath(rpath,"str_walk.txt")
     the_vecs = joinpath(rpath,"str_walk-vec.txt")
 
-    symbols = Iterators.flatten(walks) |> Set
     writedlm(the_walks,str_walks)
     word2vec(the_walks,the_vecs,verbose=true,size=dims)
     model=wordvectors(the_vecs)
@@ -69,7 +63,7 @@ end
 """
     Conducts a random walk over `g` in O(l) time,
 weighted by alias sampling probabilities `alias_nodes`
-and `alias_edges`
+and `alias_edges`.
 """
 function node2vec_walk(
     g::FeaturedGraph,
@@ -78,10 +72,9 @@ function node2vec_walk(
     start_node::Int,
     walk_length::Int)::Vector{Int}
     walk::Vector{Int} = [start_node]
-    current::Int = start_node
     for _ in 2:walk_length
         curr = walk[end]
-        cur_nbrs = sort(outneighbors(g, curr))
+        cur_nbrs = sort(neighbors(g, curr; dir=:out))
         if length(walk) == 1
             push!(walk, cur_nbrs[alias_sample(alias_nodes[curr]...)])
         else
@@ -93,9 +86,11 @@ function node2vec_walk(
     return walk
 end
 
-"Returns J and q for a given edge"
+"""
+Returns J and q for a given edge
+"""
 function get_alias_edge(g::FeaturedGraph, src::Int, dst::Int, p::Float64, q::Float64)::Alias
-    unnormalized_probs = spzeros(length(outneighbors(g, dst)))
+    unnormalized_probs = spzeros(length(neighbors(g, dst; dir=:out)))
     neighbor_weight_pairs = zip(weighted_outneighbors(g, dst)...)
     for (i, (dst_nbr, weight)) in enumerate(neighbor_weight_pairs)
         if dst_nbr == src
@@ -110,36 +105,45 @@ function get_alias_edge(g::FeaturedGraph, src::Int, dst::Int, p::Float64, q::Flo
     return alias_setup(normalized_probs)
 end
 
+# Returns (neighbors::Vector{Int}, weights::Vector{Float64})
+function weighted_outneighbors(fg::FeaturedGraph, i::Int)
+    nbrs = neighbors(fg, i; dir=:out)
+    nbrs, sparse(graph(fg))[i, nbrs]
+end
+
 """
     Computes weighted probability transition aliases J and q for nodes and edges
 using return parameter `p` and In-out parameter `q`
 
-Implementation as specified in the node2vec paper by Grover and Leskovec (2016)
-https://cs.stanford.edu/~jure/pubs/node2vec-kdd16.pdf
+Implementation as specified in the node2vec paper [node2vec2016](@cite).
 """
-function preprocess_modified_weights(g::FeaturedGraph, p::Float64, q::Float64)
+function preprocess_modified_weights(g::FeaturedGraph, p::Real, q::Real)
 
-    alias_nodes::Dict{Int, Alias} = Dict()
-    alias_edges::Dict{Tuple{Int, Int}, Alias} = Dict()
+    alias_nodes = Dict{Int, Alias}()
+    alias_edges = Dict{Tuple{Int, Int}, Alias}()
 
     for node in 1:nv(g)
-        probs = [1 / length(outneighbors(g, node)) for _ in outneighbors(g, node)]
+        nbrs = neighbors(g, node, dir=:out)
+        probs = fill(1, length(nbrs)) ./ length(nbrs)
         alias_nodes[node] =  alias_setup(probs)
     end
-    for edge in edges(g)
-        alias_edges[(edge.src, edge.dst)] = get_alias_edge(g, edge.src, edge.dst, p, q)
+    for (_, edge) in edges(g)
+        src, dst = edge
+        alias_edges[(src, dst)] = get_alias_edge(g, src, dst, p, q)
         if !is_directed(g)
-            alias_edges[(edge.dst, edge.src)] = get_alias_edge(g, edge.dst, edge.src, p, q)
+            alias_edges[(dst, src)] = get_alias_edge(g, dst, src, p, q)
         end
     end
     return alias_nodes, alias_edges
 end
 
 
-"Given a graph, compute `walks_per_node` * nv(g) random walks."
-function simulate_walks(g::FeaturedGraph; walks_per_node::Int, len::Int, p::Float64, q::Float64)::Vector{Vector{Int}}
+"""
+Given a graph, compute `walks_per_node` * nv(g) random walks.
+"""
+function simulate_walks(g::FeaturedGraph; walks_per_node::Int, len::Int, p::Real, q::Real)::Vector{Vector{Int}}
     alias_nodes, alias_edges = preprocess_modified_weights(g, p, q)
-    walks::Vector{Vector{Int}} = []
+    walks = Vector{Int}[]
     for _ in 1:walks_per_node
         for node in shuffle(1:nv(g))
             walk::Vector{Int} = node2vec_walk(g, alias_nodes, alias_edges; start_node=node, walk_length=len)
diff --git a/src/sampling.jl b/src/sampling.jl
@@ -1,22 +1,17 @@
 """
-Alias Sampling first described in [1]. [2] might be a helpful resource to understand alias sampling.
-
-[1] A. Kronmal and A. V. Peterson. On the alias method for generating random variables from a discrete distribution. The American Statistician, 33(4):214-218, 1979.
-[2] https://lips.cs.princeton.edu/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
-"""
+    alias_setup(probs)
 
-alias_setup(probs::Vector{Float64}) = alias_setup(sparse(probs))
-
-"""
 Computes alias probabilities.
 """
-function alias_setup(probs::SparseVector{Float64})::Tuple{SparseVector{Int}, SparseVector{Float64}}
+alias_setup(probs::AbstractVector{<:Real}) = alias_setup(sparse(probs))
+
+function alias_setup(probs::SparseVector{<:Real})
     K = length(probs)
     J = spzeros(Int, K)
     q = probs * K
 
-    smaller::Vector{Int} = [] # prob idxs < 1/K
-    larger::Vector{Int} = []  # prob idxs >= 1/k
+    smaller = Int[] # prob idxs < 1/K
+    larger = Int[]  # prob idxs >= 1/k
 
     for i in 1:length(probs)
         if q[i] < 1.0  # equivalent to prob < 1/K but saves the division
@@ -25,6 +20,7 @@ function alias_setup(probs::SparseVector{Float64})::Tuple{SparseVector{Int}, Spa
             push!(larger, i)
         end
     end
+
     while length(smaller) > 0 && length(larger) > 0
         small = pop!(smaller)
         large = pop!(larger)
@@ -40,9 +36,17 @@ function alias_setup(probs::SparseVector{Float64})::Tuple{SparseVector{Int}, Spa
     return J, q
 end
 
-function alias_sample(J::SparseVector{Int}, q::SparseVector{Float64})::Int
+"""
+    alias_sample(J, q)
+
+Alias Sampling first described in [1]. [2] might be a helpful resource to understand alias sampling.
 
-    small_index = rand() * length(J) |> ceil |> Int
+[1] A. Kronmal and A. V. Peterson. On the alias method for generating random variables from a
+    discrete distribution. The American Statistician, 33(4):214-218, 1979.
+[2] https://lips.cs.princeton.edu/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
+"""
+function alias_sample(J::AbstractVector{<:Integer}, q::AbstractVector{<:Real})
+    small_index = ceil(Int, rand() * length(J))
     if rand() < q[small_index]
         return small_index
     else
diff --git a/src/utils.jl b/src/utils.jl
@@ -20,84 +20,3 @@ end
 
 @non_differentiable accumulated_edges(x...)
 @non_differentiable generate_cluster(x...)
-
-"""
-    edge_index_table(adj[, directed])
-
-Generate a mapping from vertex pair (i, j) to edge index. The edge indecies are determined by
-the sorted vertex indecies.
-"""
-function edge_index_table(adj::AbstractVector{<:AbstractVector{<:Integer}}, directed::Bool=is_directed(adj))
-    table = Dict{Tuple{UInt32,UInt32},UInt64}()
-    e = one(UInt64)
-    if directed
-        for (i, js) = enumerate(adj)
-            js = sort(js)
-            for j = js
-                table[(i, j)] = e
-                e += one(UInt64)
-            end
-        end
-    else
-        for (i, js) = enumerate(adj)
-            js = sort(js)
-            js = js[i .≤ js]
-            for j = js
-                table[(i, j)] = e
-                table[(j, i)] = e
-                e += one(UInt64)
-            end
-        end
-    end
-    table
-end
-
-function edge_index_table(vpair::AbstractVector{<:Tuple})
-    table = Dict{Tuple{UInt32,UInt32},UInt64}()
-    for (i, p) = enumerate(vpair)
-        table[p] = i
-    end
-    table
-end
-
-edge_index_table(fg::FeaturedGraph) = edge_index_table(fg.graph, fg.directed)
-
-Zygote.@nograd edge_index_table
-
-### TODO move these to GraphSignals ######
-import GraphSignals: FeaturedGraph
-
-# function FeaturedGraph(fg::FeaturedGraph;
-#                         nf=node_feature(fg),
-#                         ef=edge_feature(fg),
-#                         gf=global_feature(fg))
-
-#     return FeaturedGraph(graph(fg); nf, ef, gf)
-# end
-
-
-function edges(fg::FeaturedGraph)
-    edges = []
-    for (src, vec) in enumerate(adjacency_list(GraphSignals.adjacency_matrix(fg)))
-        for v in vec
-            push!(edges, Edge(src, v))
-        end
-    end
-    edges
-end
-
-Graphs.has_edge(fg::FeaturedGraph, u::Int, v::Int) = has_edge(graph(fg), u, v)
-
-# Returns (neighbors::Vector{Int}, weights::Vector{Float64})
-function weighted_outneighbors(fg::FeaturedGraph, v::Int)
-    nbrs = neighbors(fg,v; dir=:out)
-    nbrs, graph(fg).S[v, nbrs]
-end
-
-function check_num_nodes(fg::FeaturedGraph, x::AbstractArray)
-    @assert nv(fg) == size(x, ndims(x))
-end
-
-function check_num_edges(fg::FeaturedGraph, e::AbstractArray)
-    @assert ne(fg) == size(e, ndims(e))
-end
diff --git a/test/embedding/node2vec.jl b/test/embedding/node2vec.jl
diff --git a/test/runtests.jl b/test/runtests.jl
diff --git a/test/sampling.jl b/test/sampling.jl

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +# References
++
 +```@bibliography
 +```