Skip to content

Commit 85b470e

Browse files
committed
Add async code for get_about and get_socials methods
1 parent d131ba2 commit 85b470e

File tree

2 files changed

+104
-26
lines changed

2 files changed

+104
-26
lines changed

loading_sdk/async_api/client.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
import math
22

33
import aiohttp
4-
from loading_sdk.async_api.extractors import AboutPageExtractor
54
from loading_sdk.settings import (
65
API_URL,
76
API_VERSION,
87
EDITORIAL_POST_TYPES,
98
EDITORIAL_SORT,
109
USER_AGENT,
1110
)
11+
from loading_sdk.async_api.extractors import extract_data
1212

1313

1414
async def async_loading_api_client(email=None, password=None):
@@ -497,7 +497,23 @@ async def get_about(self):
497497
498498
:rtype dict
499499
"""
500-
about_page = AboutPageExtractor()
501-
about_data = await about_page.extract_about_data()
502500

503-
return about_data
501+
data = await extract_data("about")
502+
503+
if not data:
504+
return {"code": 404, "message": "No data found", "data": None}
505+
506+
return {"code": 200, "message": "OK", "data": data}
507+
508+
async def get_socials(self):
    """Return the site's social media links.

    :rtype dict
    """
    social_links = await extract_data("socials")

    if social_links:
        return {"code": 200, "message": "OK", "data": social_links}

    return {"code": 404, "message": "No results found", "data": None}
Lines changed: 84 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,30 @@
11
import json
22
import re
3+
from abc import ABC, abstractmethod
34

45
import aiohttp
56
from bs4 import BeautifulSoup
67
from loading_sdk.settings import BASE_URL, USER_AGENT
78

89

9-
class AboutPageExtractor:
10-
async def extract_about_data(self):
11-
about_page_source = await self._get_source(f"{BASE_URL}/om")
12-
main_script_url = self._extract_main_script_url(about_page_source)
13-
main_script_source = await self._get_source(f"{BASE_URL}/{main_script_url}")
14-
about_script_url = self._get_about_script_url(main_script_source)
15-
about_script_source = await self._get_source(about_script_url)
16-
17-
return self._get_about_data(about_script_source)
18-
19-
async def _get_source(self, url):
10+
class Extractor(ABC):
    """Base class for the async page extractors: shared scraping helpers."""

    async def get_source(self, url: str) -> str:
        """Download *url* and return the response body decoded as text."""
        async with aiohttp.ClientSession() as session:
            request_headers = {"User-Agent": USER_AGENT}
            async with session.get(url, headers=request_headers) as response:
                return await response.text()
2516

26-
def _get_about_script_url(self, source_code):
17+
def get_script(self, source: str) -> str:
    """Return the site-relative URL of the main JS bundle referenced in *source*.

    NOTE(review): raises TypeError if no matching <script> tag exists — confirm
    callers always pass a rendered loading.se page.
    """
    bundle_pattern = re.compile(r"/static/js/main\.[0-9a-zA-Z]+\.js")
    script_tag = BeautifulSoup(source, "html.parser").find(src=bundle_pattern)

    # Drop the leading "/" so the caller can join it onto BASE_URL.
    return script_tag["src"][1:]
22+
23+
def get_chunks(self, source: str) -> list:
2724
chunk_urls = []
2825

2926
# Extracts the code with the javascript chunks.
30-
match = re.search(r"(static/js/).+?(?=\{)(.+?(?=\[)).+(.chunk.js)", source_code)
27+
match = re.search(r"(static/js/).+?(?=\{)(.+?(?=\[)).+(.chunk.js)", source)
3128

3229
if match:
3330
# Transform the code into valid JSON so the chunk ids can be stored in a python dict.
@@ -38,10 +35,25 @@ def _get_about_script_url(self, source_code):
3835
chunk_url = f"{BASE_URL}/{match.group(1)}{key}.{value}{match.group(3)}"
3936
chunk_urls.append(chunk_url)
4037

41-
return chunk_urls[-1]
38+
return chunk_urls
39+
40+
@abstractmethod
def get_data(self):
    """Fetch and return this extractor's payload; concrete subclasses implement it."""
4243

43-
def _get_about_data(self, source_code):
44-
match = re.search(r"var.e=(.+?)(?=\.map).+a=(.+?)(?=\.map)", source_code)
44+
45+
class AboutExtractor(Extractor):
46+
async def get_data(self):
47+
about_page_source = await self.get_source(f"{BASE_URL}/om")
48+
main_script_url = self.get_script(about_page_source)
49+
main_script_source = await self.get_source(f"{BASE_URL}/{main_script_url}")
50+
chunk_urls = self.get_chunks(main_script_source)
51+
about_script_url = chunk_urls[-1]
52+
about_script_source = await self.get_source(about_script_url)
53+
54+
match = re.search(
55+
r"var.e=(.+?)(?=\.map).+a=(.+?)(?=\.map)", about_script_source
56+
)
4557

4658
if not match:
4759
return None
@@ -58,13 +70,63 @@ def _get_about_data(self, source_code):
5870
moderators = moderators.replace("\\n", "")
5971
moderators = moderators.encode("utf-8").decode("unicode_escape")
6072

61-
return {
73+
data = {
6274
"people": json.loads(people),
6375
"moderators": json.loads(moderators),
6476
}
6577

66-
def _extract_main_script_url(self, html):
67-
soup = BeautifulSoup(html, "html.parser")
68-
main_script = soup.find(src=re.compile(r"/static/js/main\.[0-9a-zA-Z]+\.js"))
78+
return data
6979

70-
return main_script["src"][1:]
80+
81+
class SocialsExtractor(Extractor):
    """Scrapes the social media links embedded in the site's main JS bundle."""

    async def get_data(self):
        """Return a list of ``{"name": ..., "link": ...}`` dicts, or None.

        None is returned when the footer-link pattern matches nothing in
        the bundle source.
        """
        start_page = await self.get_source(BASE_URL)
        bundle_url = self.get_script(start_page)
        bundle_source = await self.get_source(f"{BASE_URL}/{bundle_url}")

        # Footer anchors look like: href:"<url>",target:"_blank",... —
        # group 1 is the full URL, group 2 the bare domain name (www links only).
        pattern = (
            r"(?:href:\")"
            r"(https:\/\/|https:\/\/www.(.*?)\..*?\/.*?)"
            r"(?:\",target:\"_blank\",rel:\"noreferrer noopener\",className:)"
            r"(?:\"Footer-(?:icon|patreon)\")"
        )
        links = re.findall(pattern, bundle_source)

        if not links:
            return None

        return [{"name": domain, "link": url} for url, domain in links]
101+
102+
103+
class ExtractorFactory(ABC):
    """Factory interface: each concrete factory builds one Extractor kind."""

    @abstractmethod
    def get_extractor(self) -> Extractor:
        """Return a ready-to-use extractor instance."""
107+
108+
109+
class AboutExtractorFactory(ExtractorFactory):
    """Builds extractors for the about ("/om") page."""

    def get_extractor(self) -> Extractor:
        """Return a fresh AboutExtractor."""
        extractor = AboutExtractor()
        return extractor
112+
113+
114+
class SocialsExtractorFactory(ExtractorFactory):
    """Builds extractors for the footer social media links."""

    def get_extractor(self) -> Extractor:
        """Return a fresh SocialsExtractor."""
        extractor = SocialsExtractor()
        return extractor
117+
118+
119+
async def extract_data(extractor_name):
    """Run the extractor registered under *extractor_name* and return its data.

    Known names are "about" and "socials". Returns None for an unknown
    name; a known extractor may itself return None when the page yields
    no usable data.
    """
    factories = {
        "about": AboutExtractorFactory(),
        "socials": SocialsExtractorFactory(),
    }

    factory = factories.get(extractor_name)
    if factory is None:
        return None

    extractor = factory.get_extractor()
    return await extractor.get_data()

0 commit comments

Comments
 (0)