Skip to content

Commit 2be5c85

Browse files
committed
add protein datasets
1 parent 462fd86 commit 2be5c85

18 files changed

+1298
-4
lines changed

torchdrug/datasets/__init__.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,27 @@
2020
from .zinc2m import ZINC2m
2121
from .pcqm4m import PCQM4M
2222
from .pubchem110m import PubChem110m
23-
2423
from .chembl_filtered import ChEMBLFiltered
2524

25+
from .beta_lactamase import BetaLactamase
26+
from .fluorescence import Fluorescence
27+
from .stability import Stability
28+
from .solubility import Solubility
29+
from .fold import Fold
30+
from .binary_localization import BinaryLocalization
31+
from .subcellular_localization import SubcellularLocalization
32+
from .secondary_structure import SecondaryStructure
33+
from .human_ppi import HumanPPI
34+
from .yeast_ppi import YeastPPI
35+
from .ppi_affinity import PPIAffinity
36+
from .bindingdb import BindingDB
37+
from .pdbbind import PDBBind
38+
from .proteinnet import ProteinNet
39+
40+
from .enzyme_commission import EnzymeCommission
41+
from .gene_ontology import GeneOntology
42+
from .alphafolddb import AlphaFoldDB
43+
2644
from .fb15k import FB15k, FB15k237
2745
from .wn18 import WN18, WN18RR
2846
from .hetionet import Hetionet
@@ -32,10 +50,14 @@
3250
from .pubmed import PubMed
3351

3452
__all__ = [
    # small-molecule property and generation datasets
    "BACE", "BBBP", "CEP", "ClinTox", "Delaney", "FreeSolv", "HIV", "Lipophilicity",
    "Malaria", "MOSES", "MUV", "OPV", "QM8", "QM9", "SIDER", "Tox21", "ToxCast",
    "USPTO50k", "ZINC250k",
    "ZINC2m", "PCQM4M", "PubChem110m", "ChEMBLFiltered",
    # protein structure and function datasets
    "EnzymeCommission", "GeneOntology", "AlphaFoldDB",
    "BetaLactamase", "Fluorescence", "Stability", "Solubility", "Fold",
    "BinaryLocalization", "SubcellularLocalization", "SecondaryStructure",
    "HumanPPI", "YeastPPI", "PPIAffinity", "BindingDB", "PDBBind", "ProteinNet",
    # knowledge graphs
    "FB15k", "FB15k237", "WN18", "WN18RR", "Hetionet",
    # citation networks
    "Cora", "CiteSeer", "PubMed",
]

torchdrug/datasets/alphafolddb.py

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
import os
2+
import glob
3+
4+
from torchdrug import data, utils
5+
from torchdrug.core import Registry as R
6+
7+
8+
@R.register("datasets.AlphaFoldDB")
@utils.copy_args(data.ProteinDataset.load_pdbs, ignore=("filtered_pdb",))
class AlphaFoldDB(data.ProteinDataset):
    """
    3D protein structures predicted by AlphaFold.
    This dataset covers proteomes of 48 organisms, as well as the majority of Swiss-Prot.

    Statistics:
        See https://alphafold.ebi.ac.uk/download

    Parameters:
        path (str): path to store the dataset
        species_id (int, optional): the id of species to be loaded. The species are numbered
            by the order appeared on https://alphafold.ebi.ac.uk/download (0-20 for model
            organism proteomes, 21 for Swiss-Prot)
        split_id (int, optional): the id of split to be loaded. To avoid large memory consumption
            for one dataset, we have cut each species into several splits, each of which contains
            at most 22000 proteins.
        verbose (int, optional): output verbose level
        **kwargs: keyword arguments forwarded to :meth:`load_pdbs` / :meth:`load_pickle`
    """

    # Archive URLs, indexed by ``species_id``.
    # NOTE(review): some proteomes appear twice (e.g. LEIIN, MYCTU, PLAF7, STAA8,
    # TRYCC) — removing duplicates would shift species ids, so they are kept as-is.
    urls = [
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000006548_3702_ARATH_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001940_6239_CAEEL_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000559_237561_CANAL_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000437_7955_DANRE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002195_44689_DICDI_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000803_7227_DROME_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000625_83333_ECOLI_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008827_3847_SOYBN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000005640_9606_HUMAN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008153_5671_LEIIN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000805_243232_METJA_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000589_10090_MOUSE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001584_83332_MYCTU_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000059680_39947_ORYSJ_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001450_36329_PLAF7_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002494_10116_RAT_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002311_559292_YEAST_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002485_284812_SCHPO_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008816_93061_STAA8_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002296_353153_TRYCC_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000007305_4577_MAIZE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/swissprot_pdb_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001631_447093_AJECG_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000006672_6279_BRUMA_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000799_192222_CAMJE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000094526_86049_9EURO1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000274756_318479_DRAME_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000325664_1352_ENTFC_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000053029_1442368_9EURO2_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000579_71421_HAEIN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000429_85962_HELPY_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000007841_1125630_KLEPH_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008153_5671_LEIIN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000078237_100816_9PEZI1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000806_272631_MYCLE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001584_83332_MYCTU_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000020681_1299332_MYCUL_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000535_242231_NEIG1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000006304_1133849_9NOCA1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000024404_6282_ONCVO_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002059_502779_PARBA_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001450_36329_PLAF7_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002438_208964_PSEAE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001014_99287_SALTY_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008854_6183_SCHMA_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002716_300267_SHIDS_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000018087_1391915_SPOS1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008816_93061_STAA8_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000586_171101_STRR6_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000035681_6248_STRER_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000030665_36087_TRITR_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008524_185431_TRYB2_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002296_353153_TRYCC_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000270924_6293_WUCBA_v2.tar"
    ]
    # NOTE(review): only 22 checksums for 54 URLs — ``species_id >= 22`` currently
    # raises IndexError at the ``self.md5s[species_id]`` lookup below. The missing
    # checksums should be added (or the lookup made optional); left as-is here
    # because the correct md5 values are not available in this file.
    md5s = [
        "82b14d14404e39793cf73c1e0f083865", "9e26602ba2d9f233ef4fcf82703ddb59",
        "60a09db1e1c47a98763d09879784f536", "a0ab562b7372f149673c4518f949501f",
        "6205138b14fb7e7ec09b366e3e4f294b", "31f31359cd7254f82304e3886440bdd3",
        "a590096e65461ed4eb092b2147b97f0b", "8f1e120f372995644a7101ad58e5b2ae",
        "9a659c4aed2a8b833478dcd5fffc5fd8", "95d775f2ae271cf50a101c73335cd250",
        "e5b12da43f5bd77298ca50e19706bdeb", "90e953abba9c8fe202e0adf825c0dfcc",
        "38a11553c7e2d00482281e74f7daf321", "2bcdfe2c37154a355fe4e8150c279c13",
        "580a55e56a44fed935f0101c37a8c4ab", "b8d08a9033d111429fadb4e25820f9f7",
        "59d1167f414a86cbccfb204791fea0eb", "dfde6b44026f19a88f1abc8ac2798ce6",
        "a1c2047a16130d61cac4db23b2f5b560", "e4d4b72df8d075aeb607dcb095210304",
        "5cdad48c799ffd723636cae26433f1f9", "98a7c13987f578277bfb66ac48a1e242",
    ]
    # Number of splits per species; each split holds at most ``split_length`` proteins.
    species_nsplit = [
        2, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 20,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1
    ]
    split_length = 22000

    def __init__(self, path, species_id=0, split_id=0, verbose=1, **kwargs):
        path = os.path.expanduser(path)
        if not os.path.exists(path):
            os.makedirs(path)
        self.path = path

        # e.g. "UP000006548_3702_ARATH_v2" (strip the ".tar" suffix)
        species_name = os.path.basename(self.urls[species_id])[:-4]
        if split_id >= self.species_nsplit[species_id]:
            raise ValueError("Split id %d should be less than %d in species %s" %
                             (split_id, self.species_nsplit[species_id], species_name))
        self.processed_file = "%s_%d.pkl.gz" % (species_name, split_id)
        pkl_file = os.path.join(path, self.processed_file)

        if os.path.exists(pkl_file):
            # Reuse the cached, preprocessed split.
            self.load_pickle(pkl_file, verbose=verbose, **kwargs)
        else:
            tar_file = utils.download(self.urls[species_id], path, md5=self.md5s[species_id])
            pdb_path = utils.extract(tar_file)
            # Sort so the split boundaries are deterministic across runs.
            gz_files = sorted(glob.glob(os.path.join(pdb_path, "*.pdb.gz")))
            pdb_files = []
            index = slice(split_id * self.split_length, (split_id + 1) * self.split_length)
            for gz_file in gz_files[index]:
                pdb_files.append(utils.extract(gz_file))
            self.load_pdbs(pdb_files, verbose=verbose, **kwargs)
            self.save_pickle(pkl_file, verbose=verbose)

    def get_item(self, index):
        """Return the ``index``-th sample as ``{"graph": protein}`` (after transform).

        Parameters:
            index (int): sample index

        Returns:
            dict: ``{"graph": data.Protein}``, optionally transformed by ``self.transform``
        """
        if getattr(self, "lazy", False):
            # Bug fix: ``self.kwargs`` was passed positionally; it must be expanded
            # as keyword arguments, consistent with BindingDB.get_item.
            protein = data.Protein.from_pdb(self.pdb_files[index], **self.kwargs)
        else:
            protein = self.data[index].clone()
        # Zhaocheng: I didn't see any code that creates sparse residue features
        if hasattr(protein, "residue_feature"):
            with protein.residue():
                protein.residue_feature = protein.residue_feature.to_dense()
        item = {"graph": protein}
        if self.transform:
            item = self.transform(item)
        return item

    def __repr__(self):
        lines = [
            "#sample: %d" % len(self),
        ]
        return "%s(\n  %s\n)" % (self.__class__.__name__, "\n  ".join(lines))
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import os
2+
3+
from torch.utils import data as torch_data
4+
5+
from torchdrug import data, utils
6+
from torchdrug.core import Registry as R
7+
8+
9+
@R.register("datasets.BetaLactamase")
@utils.copy_args(data.ProteinDataset.load_lmdbs, ignore=("target_fields",))
class BetaLactamase(data.ProteinDataset):
    """
    Activity values of first-order mutants of the TEM-1 beta-lactamase protein.

    Statistics:
        - #Train: 4,158
        - #Valid: 520
        - #Test: 520

    Parameters:
        path (str): the path to store the dataset
        verbose (int, optional): output verbose level
        **kwargs: keyword arguments forwarded to :meth:`load_lmdbs`
    """

    url = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/beta_lactamase.tar.gz"
    md5 = "65766a3969cc0e94b101d4063d204ba4"
    splits = ["train", "valid", "test"]
    target_fields = ["scaled_effect1"]

    def __init__(self, path, verbose=1, **kwargs):
        # Resolve the storage directory, creating it on first use.
        root = os.path.expanduser(path)
        if not os.path.exists(root):
            os.makedirs(root)
        self.path = root

        archive = utils.download(self.url, root, md5=self.md5)
        extracted = utils.extract(archive)
        lmdb_files = [
            os.path.join(extracted, "beta_lactamase/beta_lactamase_%s.lmdb" % split)
            for split in self.splits
        ]

        self.load_lmdbs(lmdb_files, target_fields=self.target_fields, verbose=verbose, **kwargs)

    def split(self):
        """Return train/valid/test subsets, following the order samples were loaded."""
        # Cumulative boundaries between consecutive splits.
        boundaries = [0]
        for count in self.num_samples:
            boundaries.append(boundaries[-1] + count)
        return [torch_data.Subset(self, range(lo, hi))
                for lo, hi in zip(boundaries, boundaries[1:])]
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import os
2+
3+
from torch.utils import data as torch_data
4+
5+
from torchdrug import data, utils
6+
from torchdrug.core import Registry as R
7+
8+
9+
@R.register("datasets.BinaryLocalization")
@utils.copy_args(data.ProteinDataset.load_lmdbs, ignore=("target_fields",))
class BinaryLocalization(data.ProteinDataset):
    """
    Binary variant of the subcellular localization task: each protein is labeled
    as either membrane-bound or soluble.

    Statistics:
        - #Train: 5,161
        - #Valid: 1,727
        - #Test: 1,746

    Parameters:
        path (str): the path to store the dataset
        verbose (int, optional): output verbose level
        **kwargs: keyword arguments forwarded to :meth:`load_lmdbs`
    """

    url = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/subcellular_localization_2.tar.gz"
    md5 = "5d2309bf1c0c2aed450102578e434f4e"
    splits = ["train", "valid", "test"]
    target_fields = ["localization"]

    def __init__(self, path, verbose=1, **kwargs):
        # Make sure the dataset directory exists before downloading into it.
        root = os.path.expanduser(path)
        if not os.path.exists(root):
            os.makedirs(root)
        self.path = root

        archive = utils.download(self.url, root, md5=self.md5)
        data_dir = utils.extract(archive)
        lmdb_files = [
            os.path.join(data_dir, "subcellular_localization_2/subcellular_localization_2_%s.lmdb" % split)
            for split in self.splits
        ]

        self.load_lmdbs(lmdb_files, target_fields=self.target_fields, verbose=verbose, **kwargs)

    def split(self):
        """Return train/valid/test subsets, in storage order."""
        subsets = []
        start = 0
        for count in self.num_samples:
            subsets.append(torch_data.Subset(self, range(start, start + count)))
            start += count
        return subsets

torchdrug/datasets/bindingdb.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import os
2+
3+
from rdkit import Chem
4+
5+
from torch.utils import data as torch_data
6+
7+
from torchdrug import data, utils
8+
from torchdrug.core import Registry as R
9+
10+
11+
@R.register("datasets.BindingDB")
@utils.copy_args(data.ProteinLigandDataset.load_lmdbs, ignore=("sequence_field", "smiles_field", "target_fields"))
class BindingDB(data.ProteinLigandDataset):
    """
    The BindingDB dataset, where binding affinity measures the interaction
    strength between a protein and a ligand.

    Statistics:
        - #Train: 7,900
        - #Valid: 878
        - #Test: 5,230

    Parameters:
        path (str): the path to store the dataset
        verbose (int, optional): output verbose level
        **kwargs: keyword arguments forwarded to :meth:`load_lmdbs`
    """

    url = "https://miladeepgraphlearningproteindata.s3.us-east-2.amazonaws.com/peerdata/BindingDB_Kd.tar.gz"
    md5 = "0b207cb962c4945f9003fc020b415a74"
    splits = ["train", "valid", "random_test", "holdout_test"]
    target_fields = ["affinity"]

    def __init__(self, path, verbose=1, **kwargs):
        # Prepare the storage directory, then fetch and unpack the archive.
        root = os.path.expanduser(path)
        if not os.path.exists(root):
            os.makedirs(root)
        self.path = root
        archive = utils.download(self.url, root, md5=self.md5)
        data_dir = utils.extract(archive)
        lmdb_files = [os.path.join(data_dir, "BindingDB_Kd_%s.lmdb" % split) for split in self.splits]

        self.load_lmdbs(lmdb_files, sequence_field="target", smiles_field="drug",
                        target_fields=self.target_fields, verbose=verbose, **kwargs)

    def split(self, keys=None):
        """Return the subsets named by ``keys`` (all four splits by default)."""
        selected = keys or self.splits
        subsets = []
        start = 0
        for name, count in zip(self.splits, self.num_samples):
            if name in selected:
                subsets.append(torch_data.Subset(self, range(start, start + count)))
            start += count
        return subsets

    def get_item(self, index):
        """Return sample ``index`` as ``{"graph1": protein, "graph2": ligand, ...targets}``."""
        if self.lazy:
            # Build the graphs on demand from the raw sequence / SMILES strings.
            graph1 = data.Protein.from_sequence(self.sequences[index], **self.kwargs)
            mol = Chem.MolFromSmiles(self.smiles[index])
            # MolFromSmiles yields None on an unparsable SMILES; keep None in that case.
            graph2 = data.Molecule.from_molecule(mol, **self.kwargs) if mol else None
        else:
            graph1, graph2 = self.data[index][0], self.data[index][1]
        item = {"graph1": graph1, "graph2": graph2}
        item.update({k: v[index] for k, v in self.targets.items()})
        if self.transform:
            item = self.transform(item)
        return item

0 commit comments

Comments
 (0)