33from wasabi import msg
44import pandas as pd
55from kern import authentication , api_calls , settings , exceptions
6- from typing import Optional , Dict
6+ from typing import List , Optional , Dict
77import json
8+ from tqdm import tqdm
9+ import spacy
810
911
1012class Client :
@@ -58,8 +60,23 @@ def get_project_details(self) -> Dict[str, str]:
5860 api_response = api_calls .get_request (url , self .session_token )
5961 return api_response
6062
def get_lookup_list(self, list_id: str) -> Dict[str, str]:
    """Fetches a single lookup list of the current project.

    Args:
        list_id (str): Identifier of the lookup list to request.

    Returns:
        Dict[str, str]: The response of the GET request, returned unchanged.
    """
    # The endpoint is derived from the client's project id and the requested list id.
    endpoint = settings.get_lookup_list_url(self.project_id, list_id)
    return api_calls.get_request(endpoint, self.session_token)
67+
def get_lookup_lists(self) -> List[Dict[str, str]]:
    """Fetches every lookup list referenced by the current project.

    Reads the "knowledge_base_ids" entry of the project details and requests
    each referenced lookup list individually through `get_lookup_list`.

    Returns:
        List[Dict[str, str]]: One response per id, in the order in which the
            ids appear in the project details. Empty if there are no ids.

    Raises:
        KeyError: If the project details contain no "knowledge_base_ids" entry.
    """
    lookup_list_ids = self.get_project_details()["knowledge_base_ids"]
    # One request per id; the comprehension keeps the original ordering.
    return [
        self.get_lookup_list(lookup_list_id) for lookup_list_id in lookup_list_ids
    ]
74+
6175 def get_record_export (
62- self , num_samples : Optional [int ] = None , download_to : Optional [str ] = None
76+ self ,
77+ num_samples : Optional [int ] = None ,
78+ download_to : Optional [str ] = None ,
79+ tokenize : Optional [bool ] = True ,
6380 ) -> pd .DataFrame :
6481 """Collects the export data of your project (i.e. the same data if you would export in the web app).
6582
@@ -74,6 +91,37 @@ def get_record_export(
7491 url , self .session_token , ** {"num_samples" : num_samples }
7592 )
7693 df = pd .DataFrame (api_response )
94+
95+ if tokenize :
96+ tokenize_attributes = []
97+ for column in df .columns :
98+ if "__confidence" in column :
99+ dtype = type (df [column ].iloc [0 ])
100+ if dtype == list :
101+ attribute = column .split ("__" )[0 ]
102+ tokenize_attributes .append (attribute )
103+
104+ if len (tokenize_attributes ) > 0 :
105+ tokenizer_package = self .get_project_details ()["tokenizer" ]
106+ if not spacy .util .is_package (tokenizer_package ):
107+ spacy .cli .download (tokenizer_package )
108+
109+ nlp = spacy .load (tokenizer_package )
110+
111+ msg .info (f"Tokenizing data with spaCy '{ tokenizer_package } '." )
112+ msg .info (
113+ "This will be provided from the server in future versions of Kern refinery."
114+ )
115+
116+ tqdm .pandas (desc = "Applying tokenization locally" )
117+ for attribute in tokenize_attributes :
118+ df [f"{ attribute } __tokenized" ] = df [attribute ].progress_apply (
119+ lambda x : nlp (x )
120+ )
121+
122+ else :
123+ msg .info ("No tokenization necessary." )
124+
77125 if download_to is not None :
78126 df .to_json (download_to , orient = "records" )
79127 msg .good (f"Downloaded export to { download_to } " )
0 commit comments