Skip to content
This repository was archived by the owner on Apr 17, 2023. It is now read-only.

Commit 729744d

Browse files
committed
Refactor TwitterCrawler to be more descriptive
1 parent 6a10821 commit 729744d

File tree

1 file changed

+26
-14
lines changed

1 file changed

+26
-14
lines changed

utils/twitter_crawler.py

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,28 @@ def login(self, username: str, password: str, timeout: Optional[float] = 10000)
3030
self.page.wait_for_url('https://twitter.com/home')
3131

3232
def get_all_liked_tweets(self, username: str, scroll_timeout: float = 0.8) -> list[str]:
33-
return self.get_liked_tweets_until(username, 'nothing', scroll_timeout)
33+
"""Get the username's all liked tweets
34+
Returns the list of links of liked tweets
35+
"""
36+
return self.get_liked_tweets_until(
37+
username, 'nothing', scroll_timeout
38+
) # 'nothing' is intentional: this `until_link` value is never found in the list of links
3439

3540
def get_liked_tweets_until(self, username: str, until_link: str, scroll_timeout: float = 0.8) -> list[str]:
36-
self._open_liked_tweets(username)
41+
"""Scrolling down the list of liked tweets until the given `until_link` found
42+
Returns the list of links of liked tweets
43+
"""
44+
self._goto_liked_tweets(username)
3745
links: list[str] = []
3846

3947
previous_height = self.page_current_height
4048
while True:
49+
# 1. scroll down
50+
# 2. get the links of the tweets on the current screen (tweets are not reachable unless they are rendered)
51+
# 3. break if the page has reached the bottom or the given `until_link` is found
52+
4153
self.page.mouse.wheel(0, 1500)
42-
time.sleep(scroll_timeout) # wait for mouse cursor down
54+
time.sleep(scroll_timeout) # wait for mouse wheel to scroll down
4355
is_page_bottom = self.page_current_height == previous_height
4456
if is_page_bottom:
4557
break
@@ -57,31 +69,31 @@ def get_liked_tweets_until(self, username: str, until_link: str, scroll_timeout:
5769
return links
5870

5971
def get_recent_liked_tweet(self, username: str) -> str:
60-
self._open_liked_tweets(username)
72+
self._goto_liked_tweets(username)
6173
return self._get_article_links_in_current_screen()[0]
6274

63-
def get_video_of_tweet(self, link: str, timeout: Optional[float] = 5000) -> Optional[tuple[str, list[str]]]:
64-
links: list[str] = []
75+
def get_video_of_tweet(self, link: str, timeout: Optional[float] = 5000) -> list[tuple[str, str]]:
76+
video_links: list[str] = []
6577

6678
def _request_m3u8_capture_handler(request: Request) -> None:
6779
if 'm3u8' in request.url:
68-
links.append(request.url)
80+
video_links.append(request.url)
6981

7082
self.page.on('request', _request_m3u8_capture_handler)
7183
self.page.goto(link)
7284
try:
7385
self.page.wait_for_selector('video', timeout=timeout)
7486
except Error:
75-
return None
87+
return []
7688

77-
return self._get_video_name(), links
89+
return [(f'{self._parse_tweet_name()}_{index}.mp4', link) for index, link in enumerate(video_links)]
7890

79-
def _get_video_name(self) -> str:
91+
def _parse_tweet_name(self) -> str:
8092
uploader = self.page.get_by_test_id('primaryColumn').get_by_role('link').nth(0).inner_text().strip()
8193
content = self.page.get_by_role('article').get_by_test_id('tweetText').nth(0).inner_text().strip()
82-
return f'{uploader} - {content}.mp4'
94+
return f'{uploader} - {content}'
8395

84-
def _open_liked_tweets(self, username: str) -> None:
96+
def _goto_liked_tweets(self, username: str) -> None:
8597
self.page.goto(f'https://twitter.com/{username}/likes')
8698
self.page.wait_for_selector('article')
8799

@@ -98,7 +110,7 @@ def _get_article_links_in_current_screen(self) -> list[str]:
98110
for i in range(article_length)
99111
]
100112
break
101-
except Error:
102-
self.page.mouse.wheel(0, 500)
113+
except Error: # if articles in the page are not reachable
114+
self.page.mouse.wheel(0, 500) # scrolling down to refresh the articles
103115

104116
return links

0 commit comments

Comments
 (0)