|
| 1 | +import os |
| 2 | +import glob |
| 3 | + |
| 4 | +from torchdrug import data, utils |
| 5 | +from torchdrug.core import Registry as R |
| 6 | + |
| 7 | + |
@R.register("datasets.AlphaFoldDB")
@utils.copy_args(data.ProteinDataset.load_pdbs, ignore=("filtered_pdb",))
class AlphaFoldDB(data.ProteinDataset):
    """
    3D protein structures predicted by AlphaFold.
    This dataset covers proteomes of 48 organisms, as well as the majority of Swiss-Prot.

    Statistics:
        See https://alphafold.ebi.ac.uk/download

    Parameters:
        path (str): path to store the dataset
        species_id (int, optional): the id of species to be loaded. The species are numbered
            by the order appeared on https://alphafold.ebi.ac.uk/download (0-20 for model
            organism proteomes, 21 for Swiss-Prot, 22-53 for the remaining proteome
            archives listed in ``urls``)
        split_id (int, optional): the id of split to be loaded. To avoid large memory consumption
            for one dataset, we have cut each species into several splits, each of which contains
            at most 22000 proteins.
        verbose (int, optional): output verbose level
        **kwargs
    """

    # One archive per species id; the position in this list is the species id.
    urls = [
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000006548_3702_ARATH_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001940_6239_CAEEL_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000559_237561_CANAL_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000437_7955_DANRE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002195_44689_DICDI_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000803_7227_DROME_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000625_83333_ECOLI_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008827_3847_SOYBN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000005640_9606_HUMAN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008153_5671_LEIIN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000805_243232_METJA_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000589_10090_MOUSE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001584_83332_MYCTU_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000059680_39947_ORYSJ_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001450_36329_PLAF7_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002494_10116_RAT_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002311_559292_YEAST_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002485_284812_SCHPO_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008816_93061_STAA8_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002296_353153_TRYCC_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000007305_4577_MAIZE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/swissprot_pdb_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001631_447093_AJECG_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000006672_6279_BRUMA_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000799_192222_CAMJE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000094526_86049_9EURO1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000274756_318479_DRAME_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000325664_1352_ENTFC_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000053029_1442368_9EURO2_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000579_71421_HAEIN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000429_85962_HELPY_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000007841_1125630_KLEPH_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008153_5671_LEIIN_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000078237_100816_9PEZI1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000806_272631_MYCLE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001584_83332_MYCTU_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000020681_1299332_MYCUL_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000535_242231_NEIG1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000006304_1133849_9NOCA1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000024404_6282_ONCVO_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002059_502779_PARBA_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001450_36329_PLAF7_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002438_208964_PSEAE_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000001014_99287_SALTY_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008854_6183_SCHMA_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002716_300267_SHIDS_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000018087_1391915_SPOS1_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008816_93061_STAA8_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000000586_171101_STRR6_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000035681_6248_STRER_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000030665_36087_TRITR_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000008524_185431_TRYB2_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000002296_353153_TRYCC_v2.tar",
        "https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UP000270924_6293_WUCBA_v2.tar"
    ]
    # Checksums aligned with ``urls`` by position. Only the first 22 archives
    # (species ids 0-21) have a recorded checksum; see ``__init__`` for how
    # the remaining ids are handled.
    md5s = [
        "82b14d14404e39793cf73c1e0f083865", "9e26602ba2d9f233ef4fcf82703ddb59",
        "60a09db1e1c47a98763d09879784f536", "a0ab562b7372f149673c4518f949501f",
        "6205138b14fb7e7ec09b366e3e4f294b", "31f31359cd7254f82304e3886440bdd3",
        "a590096e65461ed4eb092b2147b97f0b", "8f1e120f372995644a7101ad58e5b2ae",
        "9a659c4aed2a8b833478dcd5fffc5fd8", "95d775f2ae271cf50a101c73335cd250",
        "e5b12da43f5bd77298ca50e19706bdeb", "90e953abba9c8fe202e0adf825c0dfcc",
        "38a11553c7e2d00482281e74f7daf321", "2bcdfe2c37154a355fe4e8150c279c13",
        "580a55e56a44fed935f0101c37a8c4ab", "b8d08a9033d111429fadb4e25820f9f7",
        "59d1167f414a86cbccfb204791fea0eb", "dfde6b44026f19a88f1abc8ac2798ce6",
        "a1c2047a16130d61cac4db23b2f5b560", "e4d4b72df8d075aeb607dcb095210304",
        "5cdad48c799ffd723636cae26433f1f9", "98a7c13987f578277bfb66ac48a1e242",
    ]
    # Number of splits available for each species id (aligned with ``urls``).
    species_nsplit = [
        2, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 20,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1
    ]
    # Maximum number of proteins stored in a single split.
    split_length = 22000

    def __init__(self, path, species_id=0, split_id=0, verbose=1, **kwargs):
        """
        Load one split of one species, from the cached pickle if it exists,
        otherwise by downloading and extracting the archive and pickling the result.

        Raises:
            ValueError: if ``split_id`` is negative or not smaller than the number
                of splits of the requested species
        """
        path = os.path.expanduser(path)
        if not os.path.exists(path):
            os.makedirs(path)
        self.path = path

        url = self.urls[species_id]
        # Archive file name without its 4-character ".tar" suffix.
        species_name = os.path.basename(url)[:-4]
        num_split = self.species_nsplit[species_id]
        if split_id < 0:
            # A negative id would slice out an empty file list and cache an empty dataset.
            raise ValueError("Split id %d should be non-negative in species %s" %
                             (split_id, species_name))
        if split_id >= num_split:
            raise ValueError("Split id %d should be less than %d in species %s" %
                             (split_id, num_split, species_name))
        self.processed_file = "%s_%d.pkl.gz" % (species_name, split_id)
        pkl_file = os.path.join(path, self.processed_file)

        if os.path.exists(pkl_file):
            self.load_pickle(pkl_file, verbose=verbose, **kwargs)
        else:
            # ``md5s`` is shorter than ``urls``: indexing it directly raises
            # IndexError for species without a recorded checksum, and a negative
            # species id would pick the checksum of a different archive.
            # NOTE(review): assumes utils.download accepts md5=None when no
            # checksum is known -- confirm against the helper.
            if 0 <= species_id < len(self.md5s):
                md5 = self.md5s[species_id]
            else:
                md5 = None
            tar_file = utils.download(url, path, md5=md5)
            pdb_path = utils.extract(tar_file)
            # Sort so that the assignment of files to splits is deterministic.
            gz_files = sorted(glob.glob(os.path.join(pdb_path, "*.pdb.gz")))
            index = slice(split_id * self.split_length, (split_id + 1) * self.split_length)
            pdb_files = [utils.extract(gz_file) for gz_file in gz_files[index]]
            self.load_pdbs(pdb_files, verbose=verbose, **kwargs)
            self.save_pickle(pkl_file, verbose=verbose)

    def get_item(self, index):
        """
        Return the sample at ``index`` as a dict with the protein under the key
        ``"graph"``, after applying ``self.transform`` if it is set.
        """
        if getattr(self, "lazy", False):
            # NOTE(review): self.kwargs is passed as a single positional argument;
            # this looks like it may have been meant as **self.kwargs -- confirm
            # against the signature of data.Protein.from_pdb.
            protein = data.Protein.from_pdb(self.pdb_files[index], self.kwargs)
        else:
            protein = self.data[index].clone()
        # Zhaocheng: I didn't see any code that creates sparse residue features
        if hasattr(protein, "residue_feature"):
            with protein.residue():
                protein.residue_feature = protein.residue_feature.to_dense()
        item = {"graph": protein}
        if self.transform:
            item = self.transform(item)
        return item

    def __repr__(self):
        """Return the class name followed by the number of samples."""
        lines = [
            "#sample: %d" % len(self),
        ]
        return "%s(\n %s\n)" % (self.__class__.__name__, "\n ".join(lines))
0 commit comments