@@ -116,6 +116,13 @@ def load_csv(self, csv_file, smiles_field="smiles", target_fields=None, verbose=
116116 self .load_smiles (smiles , targets , verbose = verbose , ** kwargs )
117117
118118 def load_pickle (self , pkl_file , verbose = 0 ):
119+ """
120+ Load the dataset from a pickle file.
121+
122+ Parameters:
123+ pkl_file (str): file name
124+ verbose (int, optional): output verbose level
125+ """
119126 with utils .smart_open (pkl_file , "rb" ) as fin :
120127 num_sample , tasks = pickle .load (fin )
121128
@@ -133,6 +140,13 @@ def load_pickle(self, pkl_file, verbose=0):
133140 self .targets [task ] = value
134141
135142 def save_pickle (self , pkl_file , verbose = 0 ):
143+ """
144+ Save the dataset to a pickle file.
145+
146+ Parameters:
147+ pkl_file (str): file name
148+ verbose (int, optional): output verbose level
149+ """
136150 with utils .smart_open (pkl_file , "wb" ) as fout :
137151 num_sample = len (self .data )
138152 tasks = self .targets .keys ()
@@ -659,16 +673,16 @@ def load_sequence(self, sequences, targets, attributes=None, transform=None, laz
659673 self .targets [field ].append (targets [field ][i ])
660674
661675 @utils .copy_args (load_sequence )
662- def load_lmdbs (self , lmdb_files , number_field = "num_examples" , sequence_field = "primary" , target_fields = None ,
676+ def load_lmdbs (self , lmdb_files , sequence_field = "primary" , target_fields = None , number_field = "num_examples" ,
663677 transform = None , lazy = False , verbose = 0 , ** kwargs ):
664678 """
665679 Load the dataset from lmdb files.
666680
667681 Parameters:
668682 lmdb_files (list of str): list of lmdb files
669- number_field (str, optional): name of the field of sample count in lmdb files
670683 sequence_field (str, optional): name of the field of protein sequence in lmdb files
671684 target_fields (list of str, optional): name of target fields in lmdb files
685+ number_field (str, optional): name of the field of sample count in lmdb files
672686 transform (Callable, optional): protein sequence transformation function
673687 lazy (bool, optional): if lazy mode is used, the proteins are processed in the dataloader.
674688 This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
@@ -701,12 +715,13 @@ def load_lmdbs(self, lmdb_files, number_field="num_examples", sequence_field="pr
701715 self .num_samples = num_samples
702716
703717 @utils .copy_args (data .Protein .from_molecule )
704- def load_pdbs (self , pdb_files , transform = None , lazy = False , verbose = 0 , ** kwargs ):
718+ def load_pdbs (self , pdb_files , sanitize = True , transform = None , lazy = False , verbose = 0 , ** kwargs ):
705719 """
706720 Load the dataset from pdb files.
707721
708722 Parameters:
709723 pdb_files (list of str): pdb file names
724+ sanitize (bool, optional): whether to sanitize the molecule
710725 transform (Callable, optional): protein sequence transformation function
711726 lazy (bool, optional): if lazy mode is used, the proteins are processed in the dataloader.
712727 This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
@@ -729,7 +744,6 @@ def load_pdbs(self, pdb_files, transform=None, lazy=False, verbose=0, **kwargs):
729744 pdb_files = tqdm (pdb_files , "Constructing proteins from pdbs" )
730745 for i , pdb_file in enumerate (pdb_files ):
731746 if not lazy or i == 0 :
732- sanitize = kwargs .pop ("sanitize" , True )
733747 mol = Chem .MolFromPDBFile (pdb_file , sanitize = sanitize )
734748 if not mol :
735749 logger .debug ("Can't construct molecule from pdb file `%s`. Ignore this sample." % pdb_file )
@@ -779,10 +793,10 @@ def load_fasta(self, fasta_file, verbose=0, **kwargs):
779793 @utils .copy_args (data .Protein .from_molecule )
780794 def load_pickle (self , pkl_file , transform = None , lazy = False , verbose = 0 , ** kwargs ):
781795 """
782- Load the dataset from pickle files .
796+ Load the dataset from a pickle file .
783797
784798 Parameters:
785- pkl_file (str): pickle file name
799+ pkl_file (str): file name
786800 transform (Callable, optional): protein sequence transformation function
787801 lazy (bool, optional): if lazy mode is used, the proteins are processed in the dataloader.
788802 This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
@@ -808,13 +822,6 @@ def load_pickle(self, pkl_file, transform=None, lazy=False, verbose=0, **kwargs)
808822 self .data .append (protein )
809823
810824 def save_pickle (self , pkl_file , verbose = 0 ):
811- """
812- Save the dataset to pickle files.
813-
814- Parameters:
815- pkl_file (str): pickle file name
816- verbose (int, optional): output verbose level
817- """
818825 with utils .smart_open (pkl_file , "wb" ) as fout :
819826 num_sample = len (self .data )
820827 pickle .dump (num_sample , fout )
@@ -890,16 +897,16 @@ def load_sequence(self, sequences, targets, attributes=None, transform=None, laz
890897 self .targets [field ].append (targets [field ][i ])
891898
892899 @utils .copy_args (load_sequence )
893- def load_lmdbs (self , lmdb_files , number_field = "num_examples" , sequence_field = "primary" , target_fields = None ,
900+ def load_lmdbs (self , lmdb_files , sequence_field = "primary" , target_fields = None , number_field = "num_examples" ,
894901 transform = None , lazy = False , verbose = 0 , ** kwargs ):
895902 """
896903 Load the dataset from lmdb files.
897904
898905 Parameters:
899906 lmdb_files (list of str): file names
900- number_field (str, optional): name of the field of sample count in lmdb files
901907 sequence_field (str or list of str, optional): names of the fields of protein sequence in lmdb files
902908 target_fields (list of str, optional): name of target fields in lmdb files
909+ number_field (str, optional): name of the field of sample count in lmdb files
903910 transform (Callable, optional): protein sequence transformation function
904911 lazy (bool, optional): if lazy mode is used, the protein pairs are processed in the dataloader.
905912 This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
@@ -1022,17 +1029,17 @@ def load_sequence(self, sequences, smiles, targets, num_samples, attributes=None
10221029 return num_samples
10231030
10241031 @utils .copy_args (load_sequence )
1025- def load_lmdbs (self , lmdb_files , number_field = "num_examples" , sequence_field = "target" , smiles_field = "drug" ,
1026- target_fields = None , transform = None , lazy = False , verbose = 0 , ** kwargs ):
1032+ def load_lmdbs (self , lmdb_files , sequence_field = "target" , smiles_field = "drug" , target_fields = None ,
1033+ number_field = "num_examples" , transform = None , lazy = False , verbose = 0 , ** kwargs ):
10271034 """
10281035 Load the dataset from lmdb files.
10291036
10301037 Parameters:
10311038 lmdb_files (list of str): file names
1032- number_field (str, optional): name of the field of sample count in lmdb files
10331039 sequence_field (str, optional): name of the field of protein sequence in lmdb files
10341040 smiles_field (str, optional): name of the field of ligand SMILES string in lmdb files
10351041 target_fields (list of str, optional): name of target fields in lmdb files
1042+ number_field (str, optional): name of the field of sample count in lmdb files
10361043 transform (Callable, optional): protein sequence transformation function
10371044 lazy (bool, optional): if lazy mode is used, the protein-ligand pairs are processed in the dataloader.
10381045 This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
0 commit comments