 import json
 import re
+from abc import ABC, abstractmethod

 import requests
 from bs4 import BeautifulSoup
 from loading_sdk.settings import BASE_URL, USER_AGENT


-class AboutPageExtractor:
-    def __init__(self):
-        about_page_source = self._get_source(f"{BASE_URL}/om")
-        main_script_url = self._extract_main_script_url(about_page_source)
-        main_script_source = self._get_source(f"{BASE_URL}/{main_script_url}")
-        about_script_url = self._get_about_script_url(main_script_source)
-        about_script_source = self._get_source(about_script_url)
-
-        self.data = self._get_about_data(about_script_source)
-
-    def _get_source(self, url):
+class Extractor(ABC):
+    def get_source(self, url: str) -> str:
         headers = {"User-Agent": USER_AGENT}
         response = requests.get(url, headers=headers, timeout=10)

         return response.text

-    def _get_about_script_url(self, source_code):
+    def get_script(self, source: str) -> str:
+        soup = BeautifulSoup(source, "html.parser")
+        main_script = soup.find(src=re.compile(r"/static/js/main\.[0-9a-zA-Z]+\.js"))
+
+        return main_script["src"][1:]
+
+    def get_chunks(self, source: str) -> list:
         chunk_urls = []

         # Extracts the code with the javascript chunks.
-        match = re.search(r"(static/js/).+?(?=\{)(.+?(?=\[)).+(.chunk.js)", source_code)
+        match = re.search(r"(static/js/).+?(?=\{)(.+?(?=\[)).+(.chunk.js)", source)

         if match:
             # Transform the code into valid JSON so the chunk ids can be stored in a python dict.
@@ -37,10 +35,25 @@ def _get_about_script_url(self, source_code):
                 chunk_url = f"{BASE_URL}/{match.group(1)}{key}.{value}{match.group(3)}"
                 chunk_urls.append(chunk_url)

-        return chunk_urls[-1]
+        return chunk_urls

-    def _get_about_data(self, source_code):
-        match = re.search(r"var.e=(.+?)(?=\.map).+a=(.+?)(?=\.map)", source_code)
+    @abstractmethod
+    def get_data(self):
+        pass
+
+
+class AboutExtractor(Extractor):
+    def get_data(self):
+        about_page_source = self.get_source(f"{BASE_URL}/om")
+        main_script_url = self.get_script(about_page_source)
+        main_script_source = self.get_source(f"{BASE_URL}/{main_script_url}")
+        chunk_urls = self.get_chunks(main_script_source)
+        about_script_url = chunk_urls[-1]
+        about_script_source = self.get_source(about_script_url)
+
+        match = re.search(
+            r"var.e=(.+?)(?=\.map).+a=(.+?)(?=\.map)", about_script_source
+        )

         if not match:
             return None
@@ -57,13 +70,46 @@ def _get_about_data(self, source_code):
         moderators = moderators.replace("\\n", "")
         moderators = moderators.encode("utf-8").decode("unicode_escape")

-        return {
+        data = {
             "people": json.loads(people),
             "moderators": json.loads(moderators),
         }

-    def _extract_main_script_url(self, html):
-        soup = BeautifulSoup(html, "html.parser")
-        main_script = soup.find(src=re.compile(r"/static/js/main\.[0-9a-zA-Z]+\.js"))
+        return data

-        return main_script["src"][1:]
+
+class SocialsExtractor(Extractor):
+    def get_data(self):
+        pass
+
+
+class ExtractorFactory(ABC):
+    @abstractmethod
+    def get_extractor(self) -> Extractor:
+        pass
+
+
+class AboutExtractorFactory(ExtractorFactory):
+    def get_extractor(self) -> Extractor:
+        return AboutExtractor()
+
+
+class SocialsExtractorFactory(ExtractorFactory):
+    def get_extractor(self) -> Extractor:
+        return SocialsExtractor()
+
+
+def extract_data(extractor_name):
+    factories = {
+        "about": AboutExtractorFactory(),
+        "socials": SocialsExtractorFactory(),
+    }
+
+    if extractor_name in factories:
+        factory = factories[extractor_name]
+        extractor = factory.get_extractor()
+        data = extractor.get_data()
+
+        return data
+
+    return None
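
For reference, a minimal usage sketch of the new extract_data entry point (the import path is an assumption, not part of this diff):

    # Hypothetical import path; adjust to wherever this module lives in loading_sdk.
    from loading_sdk.extractors import extract_data

    # Dispatches through AboutExtractorFactory -> AboutExtractor.get_data() and returns
    # a dict with "people" and "moderators" keys, or None if the script could not be parsed.
    about = extract_data("about")
    if about is not None:
        print(about["people"])
        print(about["moderators"])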