Skip to content

Commit 1c0a65d

Browse files
author
Johannes Hötter
committed
includes lookup list fetching and automated tokenization of record exports
1 parent 2b8b2d6 commit 1c0a65d

File tree

2 files changed

+55
-7
lines changed

2 files changed

+55
-7
lines changed

kern/__init__.py

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
from wasabi import msg
44
import pandas as pd
55
from kern import authentication, api_calls, settings, exceptions
6-
from typing import Optional, Dict
6+
from typing import List, Optional, Dict
77
import json
8+
from tqdm import tqdm
9+
import spacy
810

911

1012
class Client:
@@ -58,8 +60,23 @@ def get_project_details(self) -> Dict[str, str]:
5860
api_response = api_calls.get_request(url, self.session_token)
5961
return api_response
6062

63+
def get_lookup_list(self, list_id: str) -> Dict[str, str]:
    """Fetch a single lookup list (knowledge base) of this project.

    Args:
        list_id: Identifier of the lookup list to fetch.

    Returns:
        The server's JSON response describing the lookup list.
    """
    # Build the endpoint for this project/list pair, then issue the GET.
    endpoint = settings.get_lookup_list_url(self.project_id, list_id)
    return api_calls.get_request(endpoint, self.session_token)
67+
68+
def get_lookup_lists(self) -> List[Dict[str, str]]:
    """Fetch all lookup lists (knowledge bases) of this project.

    Iterates over the knowledge-base ids reported by
    ``get_project_details`` and fetches each lookup list individually
    via ``get_lookup_list``.

    Returns:
        One dict per lookup list, in the order the server reports the ids.
    """
    # Idiomatic comprehension instead of a manual append loop; one request
    # is still made per lookup list (plus one for the project details).
    return [
        self.get_lookup_list(lookup_list_id)
        for lookup_list_id in self.get_project_details()["knowledge_base_ids"]
    ]
74+
6175
def get_record_export(
62-
self, num_samples: Optional[int] = None, download_to: Optional[str] = None
76+
self,
77+
num_samples: Optional[int] = None,
78+
download_to: Optional[str] = None,
79+
tokenize: Optional[bool] = True,
6380
) -> pd.DataFrame:
6481
"""Collects the export data of your project (i.e. the same data if you would export in the web app).
6582
@@ -74,6 +91,37 @@ def get_record_export(
7491
url, self.session_token, **{"num_samples": num_samples}
7592
)
7693
df = pd.DataFrame(api_response)
94+
95+
if tokenize:
96+
tokenize_attributes = []
97+
for column in df.columns:
98+
if "__confidence" in column:
99+
dtype = type(df[column].iloc[0])
100+
if dtype == list:
101+
attribute = column.split("__")[0]
102+
tokenize_attributes.append(attribute)
103+
104+
if len(tokenize_attributes) > 0:
105+
tokenizer_package = self.get_project_details()["tokenizer"]
106+
if not spacy.util.is_package(tokenizer_package):
107+
spacy.cli.download(tokenizer_package)
108+
109+
nlp = spacy.load(tokenizer_package)
110+
111+
msg.info(f"Tokenizing data with spaCy '{tokenizer_package}'.")
112+
msg.info(
113+
"This will be provided from the server in future versions of Kern refinery."
114+
)
115+
116+
tqdm.pandas(desc="Applying tokenization locally")
117+
for attribute in tokenize_attributes:
118+
df[f"{attribute}__tokenized"] = df[attribute].progress_apply(
119+
lambda x: nlp(x)
120+
)
121+
122+
else:
123+
msg.info("No tokenization necessary.")
124+
77125
if download_to is not None:
78126
df.to_json(download_to, orient="records")
79127
msg.good(f"Downloaded export to {download_to}")

kern/settings.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,15 @@ def get_authentication_url() -> str:
2424
return f"{BASE_URI}/.ory/kratos/public/self-service/login/api"
2525

2626

27-
def get_config_url():
28-
return f"{BASE_URI}/api/config/"
27+
def get_project_url(project_id: str) -> str:
    """Return the REST endpoint of a single project."""
    return "{}/api/project/{}".format(BASE_URI, project_id)
2929

3030

31-
def get_project_url(project_id: str):
32-
return f"{BASE_URI}/api/project/{project_id}"
31+
def get_lookup_list_url(project_id: str, lookup_list_id: str) -> str:
    """Return the REST endpoint of one lookup list within a project."""
    project_url = get_project_url(project_id)
    return f"{project_url}/lookup_list/{lookup_list_id}"
3333

3434

35-
def get_records_url(project_id: str) -> str:
    """Return the REST endpoint of a project's records."""
    return get_project_url(project_id) + "/records"
3737

3838

0 commit comments

Comments
 (0)