From 835a106315a690362ef9fd1eddb8488caaa5f47e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luis=20M=C3=BCller?= Date: Mon, 20 Nov 2023 10:10:58 +0100 Subject: [PATCH] Introducing Critical-Look Datasets --- README.md | 5 +- configs/GPS/amazon-ratings-CL.yaml | 82 ++++++++++++++++++++++++ configs/GPS/minesweeper-CL.yaml | 82 ++++++++++++++++++++++++ configs/GPS/questions-CL.yaml | 82 ++++++++++++++++++++++++ configs/GPS/roman-empire-CL.yaml | 82 ++++++++++++++++++++++++ configs/GPS/tolokers-CL.yaml | 82 ++++++++++++++++++++++++ graphgps/config/dataset_config.py | 2 + graphgps/config/gt_config.py | 2 + graphgps/layer/gps_layer.py | 6 +- graphgps/loader/master_loader.py | 19 +++++- graphgps/network/gps_model.py | 1 + graphgps/transform/posenc_stats.py | 76 +++++++++++++++------- graphgps/transform/task_preprocessing.py | 10 ++- 13 files changed, 502 insertions(+), 29 deletions(-) create mode 100644 configs/GPS/amazon-ratings-CL.yaml create mode 100644 configs/GPS/minesweeper-CL.yaml create mode 100644 configs/GPS/questions-CL.yaml create mode 100644 configs/GPS/roman-empire-CL.yaml create mode 100644 configs/GPS/tolokers-CL.yaml diff --git a/README.md b/README.md index ba66174c..e4e95bdc 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ conda create -n graphgps python=3.10 conda activate graphgps conda install pytorch=1.13 torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia -conda install pyg=2.2 -c pyg -c conda-forge +conda install pyg=2.3 -c pyg -c conda-forge pip install pyg-lib -f https://data.pyg.org/whl/torch-1.13.0+cu117.html # RDKit is required for OGB-LSC PCQM4Mv2 and datasets derived from it. @@ -31,6 +31,9 @@ pip install tensorboardX pip install ogb pip install wandb +# DGL - install for your environment: https://www.dgl.ai/pages/start.html, e.g., +conda install -c dglteam/label/cu117 dgl + conda clean --all ``` diff --git a/configs/GPS/amazon-ratings-CL.yaml b/configs/GPS/amazon-ratings-CL.yaml new file mode 100644 index 00000000..86e88346 --- /dev/null +++ b/configs/GPS/amazon-ratings-CL.yaml @@ -0,0 +1,82 @@ +out_dir: results +metric_best: accuracy +wandb: + use: False + project: probing-heterophilous + entity: null +dataset: + format: PyG-HeterophilousGraphDataset + name: Amazon-ratings + task: node + task_type: classification + transductive: True + split_mode: standard + node_encoder: True + node_encoder_name: LinearNode + node_encoder_bn: False + edge_encoder: False + edge_encoder_name: DummyEdge + edge_encoder_bn: False + store_precompute: True +posenc_GraphormerBias: + enable: False + node_degrees_only: True + num_spatial_types: 0 + num_in_degrees: 729 + num_out_degrees: 700 +graphormer: + use_graph_token: False +posenc_LapPE: + enable: False + eigen: + laplacian_norm: none + eigvec_norm: L2 + max_freqs: 10 + model: DeepSet + dim_pe: 32 + layers: 2 + n_heads: 4 # Only used when `posenc.model: Transformer` + raw_norm_type: none +posenc_RWSE: + enable: False + kernel: + times_func: range(1,21) + model: Linear + dim_pe: 32 + raw_norm_type: BatchNorm +train: + mode: custom + sampler: full_batch +# sampler: saint_rw +# batch_size: 32 + eval_period: 5 + enable_ckpt: False +model: + type: GPSModel + loss_fun: cross_entropy + edge_decoding: dot +gt: + layer_type: GAT+Performer + layers: 1 + n_heads: 8 + dim_hidden: 512 # `gt.dim_hidden` must match `gnn.dim_inner` + dropout: 0.2 + attn_dropout: 0.0 + layer_norm: False + batch_norm: False + hidden_dim_multiplier: 2 +gnn: + head: node + layers_pre_mp: 0 + layers_post_mp: 1 + dim_inner: 512 # `gt.dim_hidden` must match `gnn.dim_inner` + act: gelu + dropout: 0.2 + normalize_adj: False +optim: + clip_grad_norm: False + optimizer: adamW + weight_decay: 0.0 + base_lr: 3e-5 + max_epoch: 1000 + scheduler: null diff --git a/configs/GPS/minesweeper-CL.yaml b/configs/GPS/minesweeper-CL.yaml new file mode 100644 index 00000000..34e61598 --- /dev/null +++ b/configs/GPS/minesweeper-CL.yaml @@ -0,0 +1,82 @@ +out_dir: results +metric_best: auc +wandb: + use: False + project: probing-heterophilous + entity: null +dataset: + format: PyG-HeterophilousGraphDataset + name: Minesweeper + task: node + task_type: classification + transductive: True + split_mode: standard + node_encoder: True + node_encoder_name: LinearNode + node_encoder_bn: False + edge_encoder: False + edge_encoder_name: DummyEdge + edge_encoder_bn: False + store_precompute: True +posenc_GraphormerBias: + enable: False + node_degrees_only: True + num_spatial_types: 0 + num_in_degrees: 729 + num_out_degrees: 700 +graphormer: + use_graph_token: False +posenc_LapPE: + enable: False + eigen: + laplacian_norm: none + eigvec_norm: L2 + max_freqs: 10 + model: DeepSet + dim_pe: 32 + layers: 2 + n_heads: 4 # Only used when `posenc.model: Transformer` + raw_norm_type: none +posenc_RWSE: + enable: False + kernel: + times_func: range(1,21) + model: Linear + dim_pe: 32 + raw_norm_type: BatchNorm +train: + mode: custom + sampler: full_batch +# sampler: saint_rw +# batch_size: 32 + eval_period: 5 + enable_ckpt: False +model: + type: GPSModel + loss_fun: cross_entropy + edge_decoding: dot +gt: + layer_type: GAT+Performer + layers: 1 + n_heads: 8 + dim_hidden: 512 # `gt.dim_hidden` must match `gnn.dim_inner` + dropout: 0.2 + attn_dropout: 0.0 + layer_norm: False + batch_norm: False + hidden_dim_multiplier: 2 +gnn: + head: node + layers_pre_mp: 0 + layers_post_mp: 1 + dim_inner: 512 # `gt.dim_hidden` must match `gnn.dim_inner` + act: gelu + dropout: 0.2 + normalize_adj: False +optim: + clip_grad_norm: False + optimizer: adamW + weight_decay: 0.0 + base_lr: 3e-5 + max_epoch: 1000 + scheduler: null diff --git a/configs/GPS/questions-CL.yaml b/configs/GPS/questions-CL.yaml new file mode 100644 index 00000000..4a6598d3 --- /dev/null +++ b/configs/GPS/questions-CL.yaml @@ -0,0 +1,82 @@ +out_dir: results +metric_best: auc +wandb: + use: False + project: probing-heterophilous + entity: null +dataset: + format: PyG-HeterophilousGraphDataset + name: Questions + task: node + task_type: classification + transductive: True + split_mode: standard + node_encoder: True + node_encoder_name: LinearNode + node_encoder_bn: False + edge_encoder: False + edge_encoder_name: DummyEdge + edge_encoder_bn: False + store_precompute: True +posenc_GraphormerBias: + enable: False + node_degrees_only: True + num_spatial_types: 0 + num_in_degrees: 2000 + num_out_degrees: 2000 +graphormer: + use_graph_token: False +posenc_LapPE: + enable: False + eigen: + laplacian_norm: none + eigvec_norm: L2 + max_freqs: 10 + model: DeepSet + dim_pe: 32 + layers: 2 + n_heads: 4 # Only used when `posenc.model: Transformer` + raw_norm_type: none +posenc_RWSE: + enable: False + kernel: + times_func: range(1,21) + model: Linear + dim_pe: 32 + raw_norm_type: BatchNorm +train: + mode: custom + sampler: full_batch +# sampler: saint_rw +# batch_size: 32 + eval_period: 5 + enable_ckpt: False +model: + type: GPSModel + loss_fun: cross_entropy + edge_decoding: dot +gt: + layer_type: GAT+Performer + layers: 1 + n_heads: 8 + dim_hidden: 512 # `gt.dim_hidden` must match `gnn.dim_inner` + dropout: 0.2 + attn_dropout: 0.0 + layer_norm: False + batch_norm: False + hidden_dim_multiplier: 2 +gnn: + head: node + layers_pre_mp: 0 + layers_post_mp: 1 + dim_inner: 512 # `gt.dim_hidden` must match `gnn.dim_inner` + act: gelu + dropout: 0.2 + normalize_adj: False +optim: + clip_grad_norm: False + optimizer: adamW + weight_decay: 0.0 + base_lr: 3e-5 + max_epoch: 1000 + scheduler: null diff --git a/configs/GPS/roman-empire-CL.yaml b/configs/GPS/roman-empire-CL.yaml new file mode 100644 index 00000000..63b228a8 --- /dev/null +++ b/configs/GPS/roman-empire-CL.yaml @@ -0,0 +1,82 @@ +out_dir: results +metric_best: accuracy +wandb: + use: False + project: probing-heterophilous + entity: null +dataset: + format: PyG-HeterophilousGraphDataset + name: Roman-empire + task: node + task_type: classification + transductive: True + split_mode: standard + node_encoder: True + node_encoder_name: LinearNode + node_encoder_bn: False + edge_encoder: False + edge_encoder_name: DummyEdge + edge_encoder_bn: False + store_precompute: True +posenc_GraphormerBias: + enable: False + node_degrees_only: True + num_spatial_types: 0 + num_in_degrees: 729 + num_out_degrees: 700 +graphormer: + use_graph_token: False +posenc_LapPE: + enable: False + eigen: + laplacian_norm: none + eigvec_norm: L2 + max_freqs: 10 + model: DeepSet + dim_pe: 32 + layers: 2 + n_heads: 4 # Only used when `posenc.model: Transformer` + raw_norm_type: none +posenc_RWSE: + enable: False + kernel: + times_func: range(1,21) + model: Linear + dim_pe: 32 + raw_norm_type: BatchNorm +train: + mode: custom + sampler: full_batch +# sampler: saint_rw +# batch_size: 32 + eval_period: 5 + enable_ckpt: False +model: + type: GPSModel + loss_fun: cross_entropy + edge_decoding: dot +gt: + layer_type: GAT+Performer + layers: 1 + n_heads: 8 + dim_hidden: 512 # `gt.dim_hidden` must match `gnn.dim_inner` + dropout: 0.2 + attn_dropout: 0.0 + layer_norm: False + batch_norm: False + hidden_dim_multiplier: 2 +gnn: + head: node + layers_pre_mp: 0 + layers_post_mp: 1 + dim_inner: 512 # `gt.dim_hidden` must match `gnn.dim_inner` + act: gelu + dropout: 0.2 + normalize_adj: False +optim: + clip_grad_norm: False + optimizer: adamW + weight_decay: 0.0 + base_lr: 3e-5 + max_epoch: 1000 + scheduler: null \ No newline at end of file diff --git a/configs/GPS/tolokers-CL.yaml b/configs/GPS/tolokers-CL.yaml new file mode 100644 index 00000000..f2fe851c --- /dev/null +++ b/configs/GPS/tolokers-CL.yaml @@ -0,0 +1,82 @@ +out_dir: results +metric_best: auc +wandb: + use: False + project: probing-heterophilous + entity: null +dataset: + format: PyG-HeterophilousGraphDataset + name: Tolokers + task: node + task_type: classification + transductive: True + split_mode: standard + node_encoder: True + node_encoder_name: LinearNode + node_encoder_bn: False + edge_encoder: False + edge_encoder_name: DummyEdge + edge_encoder_bn: False + store_precompute: True +posenc_GraphormerBias: + enable: False + node_degrees_only: True + num_spatial_types: 0 + num_in_degrees: 3000 + num_out_degrees: 3000 +graphormer: + use_graph_token: False +posenc_LapPE: + enable: False + eigen: + laplacian_norm: none + eigvec_norm: L2 + max_freqs: 10 + model: DeepSet + dim_pe: 32 + layers: 2 + n_heads: 4 # Only used when `posenc.model: Transformer` + raw_norm_type: none +posenc_RWSE: + enable: False + kernel: + times_func: range(1,21) + model: Linear + dim_pe: 32 + raw_norm_type: BatchNorm +train: + mode: custom + sampler: full_batch +# sampler: saint_rw +# batch_size: 32 + eval_period: 5 + enable_ckpt: False +model: + type: GPSModel + loss_fun: cross_entropy + edge_decoding: dot +gt: + layer_type: GAT+Performer + layers: 1 + n_heads: 8 + dim_hidden: 512 # `gt.dim_hidden` must match `gnn.dim_inner` + dropout: 0.2 + attn_dropout: 0.0 + layer_norm: False + batch_norm: False + hidden_dim_multiplier: 2 +gnn: + head: node + layers_pre_mp: 0 + layers_post_mp: 1 + dim_inner: 512 # `gt.dim_hidden` must match `gnn.dim_inner` + act: gelu + dropout: 0.2 + normalize_adj: False +optim: + clip_grad_norm: False + optimizer: adamW + weight_decay: 0.0 + base_lr: 3e-5 + max_epoch: 1000 + scheduler: null diff --git a/graphgps/config/dataset_config.py b/graphgps/config/dataset_config.py index 3f2ea4b7..660b9cd7 100644 --- a/graphgps/config/dataset_config.py +++ b/graphgps/config/dataset_config.py @@ -17,3 +17,5 @@ def dataset_cfg(cfg): # infer-link parameters (e.g., edge prediction task) cfg.dataset.infer_link_label = "None" + + cfg.dataset.store_precompute = False diff --git a/graphgps/config/gt_config.py b/graphgps/config/gt_config.py index 7d85844a..7481fb97 100644 --- a/graphgps/config/gt_config.py +++ b/graphgps/config/gt_config.py @@ -48,6 +48,8 @@ def set_cfg_gt(cfg): cfg.gt.residual = True + cfg.gt.hidden_dim_multiplier = 2 + # BigBird model/GPS-BigBird layer. cfg.gt.bigbird = CN() diff --git a/graphgps/layer/gps_layer.py b/graphgps/layer/gps_layer.py index 813371f0..16091542 100644 --- a/graphgps/layer/gps_layer.py +++ b/graphgps/layer/gps_layer.py @@ -21,7 +21,7 @@ def __init__(self, dim_h, local_gnn_type, global_model_type, num_heads, act='relu', pna_degrees=None, equivstable_pe=False, dropout=0.0, attn_dropout=0.0, layer_norm=False, batch_norm=True, - bigbird_cfg=None, log_attn_weights=False): + bigbird_cfg=None, log_attn_weights=False, hidden_dim_multiplier=2): super().__init__() self.dim_h = dim_h @@ -140,8 +140,8 @@ def __init__(self, dim_h, self.dropout_attn = nn.Dropout(dropout) # Feed Forward block. - self.ff_linear1 = nn.Linear(dim_h, dim_h * 2) - self.ff_linear2 = nn.Linear(dim_h * 2, dim_h) + self.ff_linear1 = nn.Linear(dim_h, dim_h * hidden_dim_multiplier) + self.ff_linear2 = nn.Linear(dim_h * hidden_dim_multiplier, dim_h) self.act_fn_ff = self.activation() if self.layer_norm: self.norm2 = pygnn.norm.LayerNorm(dim_h) diff --git a/graphgps/loader/master_loader.py b/graphgps/loader/master_loader.py index edadb7e8..caa32bc8 100644 --- a/graphgps/loader/master_loader.py +++ b/graphgps/loader/master_loader.py @@ -9,7 +9,7 @@ from numpy.random import default_rng from ogb.graphproppred import PygGraphPropPredDataset from torch_geometric.datasets import (Actor, GNNBenchmarkDataset, Planetoid, - TUDataset, WebKB, WikipediaNetwork, ZINC) + TUDataset, WebKB, WikipediaNetwork, ZINC, HeterophilousGraphDataset) from torch_geometric.graphgym.config import cfg from torch_geometric.graphgym.loader import load_pyg, load_ogb, set_dataset_attr from torch_geometric.graphgym.register import register_loader @@ -140,6 +140,9 @@ def load_dataset_master(format, name, dataset_dir): elif pyg_dataset_id == 'COCOSuperpixels': dataset = preformat_COCOSuperpixels(dataset_dir, name, cfg.dataset.slic_compactness) + + elif pyg_dataset_id == "HeterophilousGraphDataset": + dataset = preformat_HeterophilousGraphDataset(dataset_dir, name) else: raise ValueError(f"Unexpected PyG Dataset identifier: {format}") @@ -254,6 +257,20 @@ def compute_indegree_histogram(dataset): return deg.numpy().tolist()[:max_degree + 1] +def preformat_HeterophilousGraphDataset(dataset_dir, name): + """Load and preformat datasets from PyG's HeterophilousGraphDataset. + + Args: + dataset_dir: path where to store the cached dataset + name: name of the specific dataset in the HeterophilousGraphDataset class + + Returns: + PyG dataset object + """ + dataset = HeterophilousGraphDataset(root=dataset_dir, name=name) + return dataset + + def preformat_GNNBenchmarkDataset(dataset_dir, name): """Load and preformat datasets from PyG's GNNBenchmarkDataset. diff --git a/graphgps/network/gps_model.py b/graphgps/network/gps_model.py index bce22e1a..40b70d18 100644 --- a/graphgps/network/gps_model.py +++ b/graphgps/network/gps_model.py @@ -96,6 +96,7 @@ def __init__(self, dim_in, dim_out): batch_norm=cfg.gt.batch_norm, bigbird_cfg=cfg.gt.bigbird, log_attn_weights=cfg.train.mode == 'log-attn-weights', + hidden_dim_multiplier=cfg.gt.hidden_dim_multiplier, )) self.layers = torch.nn.Sequential(*layers) diff --git a/graphgps/transform/posenc_stats.py b/graphgps/transform/posenc_stats.py index 90ebcf74..6e85df92 100644 --- a/graphgps/transform/posenc_stats.py +++ b/graphgps/transform/posenc_stats.py @@ -8,7 +8,7 @@ to_undirected, to_dense_adj, scatter) from torch_geometric.utils.num_nodes import maybe_num_nodes from graphgps.encoder.graphormer_encoder import graphormer_pre_processing - +import os def compute_posenc_stats(data, pe_types, is_undirected, cfg): """Precompute positional encodings for the given graph. @@ -53,24 +53,42 @@ def compute_posenc_stats(data, pe_types, is_undirected, cfg): # Eigen values and vectors. evals, evects = None, None if 'LapPE' in pe_types or 'EquivStableLapPE' in pe_types: - # Eigen-decomposition with numpy, can be reused for Heat kernels. - L = to_scipy_sparse_matrix( - *get_laplacian(undir_edge_index, normalization=laplacian_norm_type, - num_nodes=N) - ) - evals, evects = np.linalg.eigh(L.toarray()) - - if 'LapPE' in pe_types: - max_freqs=cfg.posenc_LapPE.eigen.max_freqs - eigvec_norm=cfg.posenc_LapPE.eigen.eigvec_norm - elif 'EquivStableLapPE' in pe_types: - max_freqs=cfg.posenc_EquivStableLapPE.eigen.max_freqs - eigvec_norm=cfg.posenc_EquivStableLapPE.eigen.eigvec_norm - - data.EigVals, data.EigVecs = get_lap_decomp_stats( - evals=evals, evects=evects, - max_freqs=max_freqs, - eigvec_norm=eigvec_norm) + require_compute = False + + if cfg.dataset.store_precompute: + path = f"{cfg.dataset.name}_LapPE_{cfg.posenc_LapPE.eigen.max_freqs}.pt" + if os.path.exists(path): + print(f"Loading stats from {path}") + data.EigVals, data.EigVecs = torch.load(path) + else: + require_compute = True + else: + require_compute = True + + if require_compute: + # Eigen-decomposition with numpy, can be reused for Heat kernels. + L = to_scipy_sparse_matrix( + *get_laplacian(undir_edge_index, normalization=laplacian_norm_type, + num_nodes=N) + ) + evals, evects = np.linalg.eigh(L.toarray()) + + if 'LapPE' in pe_types: + max_freqs=cfg.posenc_LapPE.eigen.max_freqs + eigvec_norm=cfg.posenc_LapPE.eigen.eigvec_norm + elif 'EquivStableLapPE' in pe_types: + max_freqs=cfg.posenc_EquivStableLapPE.eigen.max_freqs + eigvec_norm=cfg.posenc_EquivStableLapPE.eigen.eigvec_norm + + data.EigVals, data.EigVecs = get_lap_decomp_stats( + evals=evals, evects=evects, + max_freqs=max_freqs, + eigvec_norm=eigvec_norm) + + if cfg.dataset.store_precompute and require_compute: + print(f"Saving stats to {path}") + torch.save((data.EigVals, data.EigVecs), path) + if 'SignNet' in pe_types: # Eigen-decomposition with numpy for SignNet. @@ -92,9 +110,23 @@ def compute_posenc_stats(data, pe_types, is_undirected, cfg): kernel_param = cfg.posenc_RWSE.kernel if len(kernel_param.times) == 0: raise ValueError("List of kernel times required for RWSE") - rw_landing = get_rw_landing_probs(ksteps=kernel_param.times, - edge_index=data.edge_index, - num_nodes=N) + + if cfg.dataset.store_precompute: + path = f"{cfg.dataset.name}_RWSE_{len(kernel_param.times)}.pt" + if os.path.exists(path): + print(f"Loading stats from {path}") + rw_landing = torch.load(path) + else: + rw_landing = get_rw_landing_probs(ksteps=kernel_param.times, + edge_index=data.edge_index, + num_nodes=N) + print(f"Saving stats to {path}") + torch.save(rw_landing, path) + else: + rw_landing = get_rw_landing_probs(ksteps=kernel_param.times, + edge_index=data.edge_index, + num_nodes=N) + data.pestat_RWSE = rw_landing # Heat Kernels. diff --git a/graphgps/transform/task_preprocessing.py b/graphgps/transform/task_preprocessing.py index ed28efc5..12c293ee 100644 --- a/graphgps/transform/task_preprocessing.py +++ b/graphgps/transform/task_preprocessing.py @@ -1,6 +1,6 @@ import torch - - +from torch_geometric.utils import add_remaining_self_loops, to_undirected +import os def shuffle(tensor): idx = torch.randperm(len(tensor)) return tensor[idx] @@ -16,6 +16,12 @@ def task_specific_preprocessing(data, cfg): Returns: Extended PyG Data object. """ + if cfg.dataset.format == "PyG-HeterophilousGraphDataset": + data.edge_index, _ = add_remaining_self_loops(data.edge_index) + data.edge_index = to_undirected(data.edge_index) + if cfg.gt.layer_type.split("+")[0] in ["GINE", "CustomGatedGCN"]: + data.edge_attr = torch.zeros(data.edge_index.size(1), cfg.gnn.dim_inner) + if cfg.gnn.head == "infer_links": N = data.x.size(0) idx = torch.arange(N, dtype=torch.long)