11import json
22import re
3+ from abc import ABC , abstractmethod
34
45import aiohttp
56from bs4 import BeautifulSoup
67from loading_sdk .settings import BASE_URL , USER_AGENT
78
89
9- class AboutPageExtractor :
10- async def extract_about_data (self ):
11- about_page_source = await self ._get_source (f"{ BASE_URL } /om" )
12- main_script_url = self ._extract_main_script_url (about_page_source )
13- main_script_source = await self ._get_source (f"{ BASE_URL } /{ main_script_url } " )
14- about_script_url = self ._get_about_script_url (main_script_source )
15- about_script_source = await self ._get_source (about_script_url )
16-
17- return self ._get_about_data (about_script_source )
18-
19- async def _get_source (self , url ):
class Extractor(ABC):
    """Base class for scrapers that pull data out of the site's JS bundles."""

    async def get_source(self, url: str) -> str:
        """Fetch *url* and return the response body decoded as text.

        A fresh ``aiohttp.ClientSession`` is opened for each call, and the
        site's expected ``User-Agent`` header is always sent.
        """
        request_headers = {"User-Agent": USER_AGENT}

        async with aiohttp.ClientSession() as session:
            response = await session.get(url, headers=request_headers)
            async with response:
                return await response.text()
2516
26- def _get_about_script_url (self , source_code ):
17+ def get_script (self , source : str ) -> str :
18+ soup = BeautifulSoup (source , "html.parser" )
19+ main_script = soup .find (src = re .compile (r"/static/js/main\.[0-9a-zA-Z]+\.js" ))
20+
21+ return main_script ["src" ][1 :]
22+
23+ def get_chunks (self , source : str ) -> list :
2724 chunk_urls = []
2825
2926 # Extracts the code with the javascript chunks.
30- match = re .search (r"(static/js/).+?(?=\{)(.+?(?=\[)).+(.chunk.js)" , source_code )
27+ match = re .search (r"(static/js/).+?(?=\{)(.+?(?=\[)).+(.chunk.js)" , source )
3128
3229 if match :
3330 # Transform the code into valid JSON so the chunk ids can be stored in a python dict.
@@ -38,10 +35,25 @@ def _get_about_script_url(self, source_code):
3835 chunk_url = f"{ BASE_URL } /{ match .group (1 )} { key } .{ value } { match .group (3 )} "
3936 chunk_urls .append (chunk_url )
4037
41- return chunk_urls [- 1 ]
38+ return chunk_urls
39+
40+ @abstractmethod
41+ def get_data (self ):
42+ pass
4243
43- def _get_about_data (self , source_code ):
44- match = re .search (r"var.e=(.+?)(?=\.map).+a=(.+?)(?=\.map)" , source_code )
44+
45+ class AboutExtractor (Extractor ):
46+ async def get_data (self ):
47+ about_page_source = await self .get_source (f"{ BASE_URL } /om" )
48+ main_script_url = self .get_script (about_page_source )
49+ main_script_source = await self .get_source (f"{ BASE_URL } /{ main_script_url } " )
50+ chunk_urls = self .get_chunks (main_script_source )
51+ about_script_url = chunk_urls [- 1 ]
52+ about_script_source = await self .get_source (about_script_url )
53+
54+ match = re .search (
55+ r"var.e=(.+?)(?=\.map).+a=(.+?)(?=\.map)" , about_script_source
56+ )
4557
4658 if not match :
4759 return None
@@ -58,13 +70,63 @@ def _get_about_data(self, source_code):
5870 moderators = moderators .replace ("\\ n" , "" )
5971 moderators = moderators .encode ("utf-8" ).decode ("unicode_escape" )
6072
61- return {
73+ data = {
6274 "people" : json .loads (people ),
6375 "moderators" : json .loads (moderators ),
6476 }
6577
66- def _extract_main_script_url (self , html ):
67- soup = BeautifulSoup (html , "html.parser" )
68- main_script = soup .find (src = re .compile (r"/static/js/main\.[0-9a-zA-Z]+\.js" ))
78+ return data
6979
70- return main_script ["src" ][1 :]
80+
class SocialsExtractor(Extractor):
    """Extracts the social-media links rendered in the site footer."""

    async def get_data(self):
        """Return the footer's social links, or ``None`` when none match.

        The links are not in the served HTML; they are embedded in the
        compiled main JS bundle, so the bundle source is fetched and the
        footer ``href`` attributes are matched with a regex.

        Returns:
            A list of ``{"name": ..., "link": ...}`` dicts, where ``name``
            is the captured domain label (e.g. ``"facebook"``) and ``link``
            the full URL — or ``None`` if the pattern finds nothing.
        """
        page_source = await self.get_source(BASE_URL)
        main_script_url = self.get_script(page_source)
        main_script_source = await self.get_source(f"{BASE_URL}/{main_script_url}")

        # Group 1 captures the whole URL, group 2 the name between "www."
        # and the next dot.  FIX: the original pattern used a bare "www."
        # whose unescaped dot matched any character (e.g. "wwwx"); it is
        # now escaped as "www\.".
        match = re.findall(
            r"(?:href:\")"
            r"(https:\/\/|https:\/\/www\.(.*?)\..*?\/.*?)"
            r"(?:\",target:\"_blank\",rel:\"noreferrer noopener\",className:)"
            r"(?:\"Footer-(?:icon|patreon)\")",
            main_script_source,
        )

        if not match:
            return None

        return [{"name": social[1], "link": social[0]} for social in match]
101+
102+
class ExtractorFactory(ABC):
    """Interface for factories that build a concrete :class:`Extractor`."""

    @abstractmethod
    def get_extractor(self) -> Extractor:
        """Create and return a ready-to-use extractor instance."""
107+
108+
class AboutExtractorFactory(ExtractorFactory):
    """Factory producing :class:`AboutExtractor` instances."""

    def get_extractor(self) -> Extractor:
        # No configuration needed; a fresh extractor per call.
        return AboutExtractor()
112+
113+
class SocialsExtractorFactory(ExtractorFactory):
    """Factory producing :class:`SocialsExtractor` instances."""

    def get_extractor(self) -> Extractor:
        # No configuration needed; a fresh extractor per call.
        return SocialsExtractor()
117+
118+
async def extract_data(extractor_name):
    """Run the extractor registered under *extractor_name*.

    Args:
        extractor_name: Registry key; currently ``"about"`` or ``"socials"``.

    Returns:
        The extractor's payload, or ``None`` for an unknown name.
    """
    factories = {
        "about": AboutExtractorFactory(),
        "socials": SocialsExtractorFactory(),
    }

    factory = factories.get(extractor_name)
    if factory is None:
        return None

    extractor = factory.get_extractor()
    return await extractor.get_data()