This repository was archived by the owner on Apr 17, 2023. It is now read-only.

Commit b0bad4d

Add methods for crawling target user's media

1 parent: 29beba3

1 file changed: +57 −14 lines

twitter_video_tools/twitter_crawler.py

Lines changed: 57 additions & 14 deletions
@@ -72,6 +72,25 @@ def get_recent_liked_tweet(self, username: str) -> str:
         self._goto_liked_tweets(username)
         return self._get_tweets_in_current_screen()[0]
 
+    def _get_tweets_in_current_screen(self) -> list[str]:
+        links: list[str] = []
+
+        while True:
+            articles = self.page.locator('article')
+            article_length = articles.count()
+            try:
+                links = [
+                    'https://twitter.com' +
+                    (articles.nth(i).locator('div').locator('a').nth(3).get_attribute('href', timeout=500) or '')
+                    for i in range(article_length)
+                ]
+                break
+            except Error:  # if the articles in the page are not reachable
+                self.page.mouse.wheel(0, 500)  # scrolling down to refresh the articles
+                self.page.mouse.wheel(0, -500)  # scrolling back up to restore the position
+
+        return links
+
     def get_video_of_tweet(self, link: str, timeout: Optional[float] = 10000) -> list[tuple[str, str]]:
         video_links: list[str] = []
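For reference, the retry-on-scroll pattern in `_get_tweets_in_current_screen` can be reproduced as a standalone script. This is a minimal sketch, assuming the class uses Playwright's sync API and that `Error` is `playwright.sync_api.Error`; the imports, browser setup, and example profile URL below are assumptions not shown in this diff, and whether the page loads without logging in depends on Twitter's current restrictions.

from playwright.sync_api import Error, sync_playwright

def tweet_links_on_screen(page) -> list[str]:
    # Collect tweet permalinks from the <article> elements currently rendered.
    while True:
        articles = page.locator('article')
        try:
            return [
                'https://twitter.com' +
                (articles.nth(i).locator('div').locator('a').nth(3).get_attribute('href', timeout=500) or '')
                for i in range(articles.count())
            ]
        except Error:
            # Articles detach while the timeline is virtualized; nudging the
            # scroll position forces them to re-render before the next attempt.
            page.mouse.wheel(0, 500)
            page.mouse.wheel(0, -500)

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto('https://twitter.com/jack/likes')  # placeholder profile
    page.wait_for_selector('article')
    print(tweet_links_on_screen(page))
    browser.close()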

@@ -97,30 +116,54 @@ def _goto_liked_tweets(self, username: str) -> None:
         self.page.goto(f'https://twitter.com/{username}/likes')
         self.page.wait_for_selector('article')
 
-    def _get_video_tweets_in_current_screen(self) -> list[str]:
+    def get_all_media_tweets(self, username: str, scroll_timeout: float = 0.8) -> list[str]:
+        """Get all of the username's media tweets.
+        Returns the list of links of media tweets.
+        """
+        return self.get_media_tweets_until(
+            username, 'nothing', scroll_timeout
+        )  # 'nothing' is intentional: that `until_link` will never appear in the links list
+
+    def get_media_tweets_until(self, username: str, until_link: str, scroll_timeout: float = 0.8) -> list[str]:
+        """Scroll down the list of media tweets until the given `until_link` is found.
+        Returns the list of links of media tweets.
+        """
+        self._goto_media_tweets(username)
         links: list[str] = []
 
+        previous_height = self.page_current_height
         while True:
-            articles = self.page.locator('article:has(video)')
-            article_length = articles.count()
-            try:
-                links = [
-                    'https://twitter.com' +
-                    (articles.nth(i).locator('div').locator('a').nth(3).get_attribute('href', timeout=500) or '')
-                    for i in range(article_length)
-                ]
+            # 1. scroll down
+            # 2. collect the tweet links in the current screen (tweets are not reachable unless they are rendered)
+            # 3. break if the page has reached the bottom or the given `until_link` was found
+
+            self.page.mouse.wheel(0, 1500)
+            time.sleep(scroll_timeout)  # wait for the mouse wheel scroll to take effect
+            is_page_bottom = self.page_current_height == previous_height
+            if is_page_bottom:
+                break
+            previous_height = self.page_current_height
+
+            new_links = self._get_video_tweets_in_current_screen()
+            links.extend(new_links)
+            links = list(set(links))
+
+            print(f'Found {len(links)} media tweets.')
+
+            if until_link in links:
                 break
-            except Error:  # if articles in the page are not reachable
-                self.page.mouse.wheel(0, 500)  # scrolling down to refresh the articles
-                self.page.mouse.wheel(0, -500)  # scrolling down to refresh the articles
 
         return links
 
-    def _get_tweets_in_current_screen(self) -> list[str]:
+    def _goto_media_tweets(self, username: str) -> None:
+        self.page.goto(f'https://twitter.com/{username}/media')
+        self.page.wait_for_selector('article')
+
+    def _get_video_tweets_in_current_screen(self) -> list[str]:
         links: list[str] = []
 
         while True:
-            articles = self.page.locator('article')
+            articles = self.page.locator('article:has(video)')
             article_length = articles.count()
             try:
                 links = [
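Note that `get_media_tweets_until` compares successive values of `self.page_current_height` to detect when the timeline has stopped growing, but that attribute is defined elsewhere in `twitter_crawler.py` and is not part of this commit. A property along the following lines would match how it is used here; it is a sketch of an assumption, not the repository's actual implementation.

    @property
    def page_current_height(self) -> int:
        # Hypothetical helper (not shown in this diff): total scrollable height of
        # the document. It stops changing once Twitter has no more tweets to
        # lazy-load, which get_media_tweets_until treats as "the bottom of the page".
        return self.page.evaluate('document.body.scrollHeight')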

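Taken together, the two new public methods could be driven roughly as follows. Only `get_all_media_tweets` and `get_media_tweets_until` are confirmed by this commit; the class name, import path, and constructor below are assumptions.

from twitter_video_tools.twitter_crawler import TwitterCrawler  # assumed name and path

crawler = TwitterCrawler()  # assumed constructor; __init__ is not shown in this commit

# Crawl every media tweet of a user: keeps scrolling until the page height stops growing.
all_links = crawler.get_all_media_tweets('some_username', scroll_timeout=0.8)

# Or stop early once an already-known tweet link shows up, e.g. to resume a previous crawl.
recent_links = crawler.get_media_tweets_until(
    'some_username',
    until_link='https://twitter.com/some_username/status/1234567890',  # placeholder link
)
print(f'{len(recent_links)} media tweet links collected')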