
Commit ddecb2f

Refactor get_about method with abstract factory pattern
This will make it simpler to implement other extractor classes.
1 parent cd8c345 commit ddecb2f
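
In short, the commit replaces direct construction of AboutPageExtractor with a single dispatch function, extract_data, that selects an extractor through a factory. A minimal sketch of the intended call path, using only names and behaviour from the diff below:

    from loading_sdk.sync_api.extractors import extract_data

    about = extract_data("about")    # AboutExtractorFactory -> AboutExtractor.get_data()
    other = extract_data("unknown")  # no factory registered for this name, so None is returned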


2 files changed: +89 -24 lines changed


loading_sdk/sync_api/client.py

Lines changed: 22 additions & 3 deletions
@@ -8,7 +8,7 @@
     EDITORIAL_SORT,
     USER_AGENT,
 )
-from loading_sdk.sync_api.extractors import AboutPageExtractor
+from loading_sdk.sync_api.extractors import extract_data


 class LoadingApiClient:
@@ -462,6 +462,25 @@ def get_about(self):

         :rtype dict
         """
-        about_page = AboutPageExtractor()
+        data = extract_data("about")

-        return about_page.data
+        if not data:
+            return {"code": 404, "message": "No data found", "data": None}
+
+        data = {
+            "code": 200,
+            "message": "OK",
+            "data": data,
+        }
+
+        return data
+
+    def get_socials(self):
+        """Get social media links
+
+        :rtype dict
+        """
+
+        data = extract_data("socials")
+
+        return data
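
For illustration, a hedged sketch of how the reworked client methods would behave. Constructing LoadingApiClient with no arguments is an assumption here, since its __init__ is not part of this diff:

    client = LoadingApiClient()  # assumed no-arg constructor; not shown in this commit

    about = client.get_about()
    if about["code"] == 200:
        people = about["data"]["people"]          # parsed people list from the about-page script
        moderators = about["data"]["moderators"]  # parsed moderators list

    # get_socials() passes through whatever extract_data("socials") returns,
    # which is None until SocialsExtractor.get_data() gets a real implementation.
    socials = client.get_socials()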

loading_sdk/sync_api/extractors.py

Lines changed: 67 additions & 21 deletions
@@ -1,32 +1,30 @@
 import json
 import re
+from abc import ABC, abstractmethod

 import requests
 from bs4 import BeautifulSoup
 from loading_sdk.settings import BASE_URL, USER_AGENT


-class AboutPageExtractor:
-    def __init__(self):
-        about_page_source = self._get_source(f"{BASE_URL}/om")
-        main_script_url = self._extract_main_script_url(about_page_source)
-        main_script_source = self._get_source(f"{BASE_URL}/{main_script_url}")
-        about_script_url = self._get_about_script_url(main_script_source)
-        about_script_source = self._get_source(about_script_url)
-
-        self.data = self._get_about_data(about_script_source)
-
-    def _get_source(self, url):
+class Extractor(ABC):
+    def get_source(self, url: str) -> str:
         headers = {"User-Agent": USER_AGENT}
         response = requests.get(url, headers=headers, timeout=10)

         return response.text

-    def _get_about_script_url(self, source_code):
+    def get_script(self, source: str) -> str:
+        soup = BeautifulSoup(source, "html.parser")
+        main_script = soup.find(src=re.compile(r"/static/js/main\.[0-9a-zA-Z]+\.js"))
+
+        return main_script["src"][1:]
+
+    def get_chunks(self, source: str) -> list:
         chunk_urls = []

         # Extracts the code with the javascript chunks.
-        match = re.search(r"(static/js/).+?(?=\{)(.+?(?=\[)).+(.chunk.js)", source_code)
+        match = re.search(r"(static/js/).+?(?=\{)(.+?(?=\[)).+(.chunk.js)", source)

         if match:
             # Transform the code into valid JSON so the chunk ids can be stored in a python dict.
@@ -37,10 +35,25 @@ def _get_about_script_url(self, source_code):
             chunk_url = f"{BASE_URL}/{match.group(1)}{key}.{value}{match.group(3)}"
             chunk_urls.append(chunk_url)

-        return chunk_urls[-1]
+        return chunk_urls

-    def _get_about_data(self, source_code):
-        match = re.search(r"var.e=(.+?)(?=\.map).+a=(.+?)(?=\.map)", source_code)
+    @abstractmethod
+    def get_data(self):
+        pass
+
+
+class AboutExtractor(Extractor):
+    def get_data(self):
+        about_page_source = self.get_source(f"{BASE_URL}/om")
+        main_script_url = self.get_script(about_page_source)
+        main_script_source = self.get_source(f"{BASE_URL}/{main_script_url}")
+        chunk_urls = self.get_chunks(main_script_source)
+        about_script_url = chunk_urls[-1]
+        about_script_source = self.get_source(about_script_url)
+
+        match = re.search(
+            r"var.e=(.+?)(?=\.map).+a=(.+?)(?=\.map)", about_script_source
+        )

         if not match:
             return None
@@ -57,13 +70,46 @@ def _get_about_data(self, source_code):
         moderators = moderators.replace("\\n", "")
         moderators = moderators.encode("utf-8").decode("unicode_escape")

-        return {
+        data = {
             "people": json.loads(people),
             "moderators": json.loads(moderators),
         }

-    def _extract_main_script_url(self, html):
-        soup = BeautifulSoup(html, "html.parser")
-        main_script = soup.find(src=re.compile(r"/static/js/main\.[0-9a-zA-Z]+\.js"))
+        return data

-        return main_script["src"][1:]
+
+class SocialsExtractor(Extractor):
+    def get_data(self):
+        pass
+
+
+class ExtractorFactory(ABC):
+    @abstractmethod
+    def get_extractor(self) -> Extractor:
+        pass
+
+
+class AboutExtractorFactory(ExtractorFactory):
+    def get_extractor(self) -> Extractor:
+        return AboutExtractor()
+
+
+class SocialsExtractorFactory(ExtractorFactory):
+    def get_extractor(self) -> Extractor:
+        return SocialsExtractor()
+
+
+def extract_data(extractor_name):
+    factories = {
+        "about": AboutExtractorFactory(),
+        "socials": SocialsExtractorFactory(),
+    }
+
+    if extractor_name in factories:
+        factory = factories[extractor_name]
+        extractor = factory.get_extractor()
+        data = extractor.get_data()
+
+        return data
+
+    return None
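
To illustrate the extensibility mentioned in the commit message, a hedged sketch of how a further extractor could be plugged into this factory setup. EpisodesExtractor and the "episodes" key are hypothetical and not part of this commit:

    class EpisodesExtractor(Extractor):
        def get_data(self):
            # Hypothetical: fetch a page with self.get_source() and parse it,
            # roughly the way AboutExtractor does for the about page.
            return None


    class EpisodesExtractorFactory(ExtractorFactory):
        def get_extractor(self) -> Extractor:
            return EpisodesExtractor()


    # extract_data() would then only need one more entry in its factories dict:
    #     "episodes": EpisodesExtractorFactory(),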
