This repository was archived by the owner on Apr 17, 2023. It is now read-only.

Commit b0bad4d

Add methods for crawling target user's media

1 parent: 29beba3

1 file changed: +57 −14 lines

twitter_video_tools/twitter_crawler.py

Lines changed: 57 additions & 14 deletions
@@ -72,6 +72,25 @@ def get_recent_liked_tweet(self, username: str) -> str:
         self._goto_liked_tweets(username)
         return self._get_tweets_in_current_screen()[0]
 
+    def _get_tweets_in_current_screen(self) -> list[str]:
+        links: list[str] = []
+
+        while True:
+            articles = self.page.locator('article')
+            article_length = articles.count()
+            try:
+                links = [
+                    'https://twitter.com' +
+                    (articles.nth(i).locator('div').locator('a').nth(3).get_attribute('href', timeout=500) or '')
+                    for i in range(article_length)
+                ]
+                break
+            except Error:  # if the articles in the page are not reachable
+                self.page.mouse.wheel(0, 500)  # scrolling down to refresh the articles
+                self.page.mouse.wheel(0, -500)  # scrolling back up to restore the position
+
+        return links
+
     def get_video_of_tweet(self, link: str, timeout: Optional[float] = 10000) -> list[tuple[str, str]]:
         video_links: list[str] = []
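For reference, the retry-on-scroll pattern in `_get_tweets_in_current_screen` can be reproduced as a standalone script. This is a minimal sketch, assuming the class uses Playwright's sync API and that `Error` is `playwright.sync_api.Error`; the imports, browser setup, and example profile URL below are assumptions not shown in this diff, and whether the page loads without logging in depends on Twitter's current restrictions.

from playwright.sync_api import Error, sync_playwright

def tweet_links_on_screen(page) -> list[str]:
    # Collect tweet permalinks from the <article> elements currently rendered.
    while True:
        articles = page.locator('article')
        try:
            return [
                'https://twitter.com' +
                (articles.nth(i).locator('div').locator('a').nth(3).get_attribute('href', timeout=500) or '')
                for i in range(articles.count())
            ]
        except Error:
            # Articles detach while the timeline is virtualized; nudging the
            # scroll position forces them to re-render before the next attempt.
            page.mouse.wheel(0, 500)
            page.mouse.wheel(0, -500)

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto('https://twitter.com/jack/likes')  # placeholder profile
    page.wait_for_selector('article')
    print(tweet_links_on_screen(page))
    browser.close()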

@@ -97,30 +116,54 @@ def _goto_liked_tweets(self, username: str) -> None:
         self.page.goto(f'https://twitter.com/{username}/likes')
         self.page.wait_for_selector('article')
 
-    def _get_video_tweets_in_current_screen(self) -> list[str]:
+    def get_all_media_tweets(self, username: str, scroll_timeout: float = 0.8) -> list[str]:
+        """Get all of the username's media tweets.
+        Returns the list of links of media tweets.
+        """
+        return self.get_media_tweets_until(
+            username, 'nothing', scroll_timeout
+        )  # 'nothing' is intentional: that `until_link` will never appear in the links list
+
+    def get_media_tweets_until(self, username: str, until_link: str, scroll_timeout: float = 0.8) -> list[str]:
+        """Scroll down the list of media tweets until the given `until_link` is found.
+        Returns the list of links of media tweets.
+        """
+        self._goto_media_tweets(username)
         links: list[str] = []
 
+        previous_height = self.page_current_height
         while True:
-            articles = self.page.locator('article:has(video)')
-            article_length = articles.count()
-            try:
-                links = [
-                    'https://twitter.com' +
-                    (articles.nth(i).locator('div').locator('a').nth(3).get_attribute('href', timeout=500) or '')
-                    for i in range(article_length)
-                ]
+            # 1. scroll down
+            # 2. collect the tweet links in the current screen (tweets are not reachable unless they are rendered)
+            # 3. break if the page has reached the bottom or the given `until_link` was found
+
+            self.page.mouse.wheel(0, 1500)
+            time.sleep(scroll_timeout)  # wait for the mouse wheel scroll to take effect
+            is_page_bottom = self.page_current_height == previous_height
+            if is_page_bottom:
+                break
+            previous_height = self.page_current_height
+
+            new_links = self._get_video_tweets_in_current_screen()
+            links.extend(new_links)
+            links = list(set(links))
+
+            print(f'Found {len(links)} media tweets.')
+
+            if until_link in links:
                 break
-            except Error:  # if articles in the page are not reachable
-                self.page.mouse.wheel(0, 500)  # scrolling down to refresh the articles
-                self.page.mouse.wheel(0, -500)  # scrolling down to refresh the articles
 
         return links
 
-    def _get_tweets_in_current_screen(self) -> list[str]:
+    def _goto_media_tweets(self, username: str) -> None:
+        self.page.goto(f'https://twitter.com/{username}/media')
+        self.page.wait_for_selector('article')
+
+    def _get_video_tweets_in_current_screen(self) -> list[str]:
         links: list[str] = []
 
         while True:
-            articles = self.page.locator('article')
+            articles = self.page.locator('article:has(video)')
             article_length = articles.count()
             try:
                 links = [
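Note that `get_media_tweets_until` compares successive values of `self.page_current_height` to detect when the timeline has stopped growing, but that attribute is defined elsewhere in `twitter_crawler.py` and is not part of this commit. A property along the following lines would match how it is used here; it is a sketch of an assumption, not the repository's actual implementation.

    @property
    def page_current_height(self) -> int:
        # Hypothetical helper (not shown in this diff): total scrollable height of
        # the document. It stops changing once Twitter has no more tweets to
        # lazy-load, which get_media_tweets_until treats as "the bottom of the page".
        return self.page.evaluate('document.body.scrollHeight')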

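Taken together, the two new public methods could be driven roughly as follows. Only `get_all_media_tweets` and `get_media_tweets_until` are confirmed by this commit; the class name, import path, and constructor below are assumptions.

from twitter_video_tools.twitter_crawler import TwitterCrawler  # assumed name and path

crawler = TwitterCrawler()  # assumed constructor; __init__ is not shown in this commit

# Crawl every media tweet of a user: keeps scrolling until the page height stops growing.
all_links = crawler.get_all_media_tweets('some_username', scroll_timeout=0.8)

# Or stop early once an already-known tweet link shows up, e.g. to resume a previous crawl.
recent_links = crawler.get_media_tweets_until(
    'some_username',
    until_link='https://twitter.com/some_username/status/1234567890',  # placeholder link
)
print(f'{len(recent_links)} media tweet links collected')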