From f9dadc167a4ee29ed62be86c8752ee25813e5c67 Mon Sep 17 00:00:00 2001 From: Douglas Cardoso Date: Sun, 23 Jul 2023 02:28:48 -0300 Subject: [PATCH 1/2] async version --- README_ASYNC.md | 20 +++++++++ requirements.txt | 1 + webscraping_example_async.py | 84 ++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+) create mode 100644 README_ASYNC.md create mode 100644 requirements.txt create mode 100644 webscraping_example_async.py diff --git a/README_ASYNC.md b/README_ASYNC.md new file mode 100644 index 0000000..6918b0a --- /dev/null +++ b/README_ASYNC.md @@ -0,0 +1,20 @@ +# Run the asynchronous crawler +## Simple start +Install the latest version of **Caqui** + +``` +pip install caqui +``` +Start the WebDriver as a server +``` +$ ./chromedriver --port=9999 + +Starting ChromeDriver 94.0.4606.61 (418b78f5838ed0b1c69bb4e51ea0252171854915-refs/branch-heads/4606@{#1204}) on port 9999 +Only local connections are allowed. +Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe. +ChromeDriver was started successfully. 
+``` +Run the crawler +``` +$ python webscraping_example_async.py +``` \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..53ba8e8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +caqui==2.0.0rc3 \ No newline at end of file diff --git a/webscraping_example_async.py b/webscraping_example_async.py new file mode 100644 index 0000000..8e8964e --- /dev/null +++ b/webscraping_example_async.py @@ -0,0 +1,84 @@ +# from selenium import webdriver +# from selenium.webdriver.common.by import By +# from selenium.webdriver.support.ui import WebDriverWait +# from selenium.webdriver.support import expected_conditions as EC +# from selenium.common.exceptions import TimeoutException +import asyncio +from caqui.easy import AsyncDriver +from caqui.by import By + + +async def async_scraping(): + # Specifying incognito mode as you launch your browser[OPTIONAL] + # option = webdriver.ChromeOptions() + # option.add_argument("--incognito") + + # Create new Instance of Chrome in incognito mode + # browser = webdriver.Chrome(executable_path='/Library/Application Support/Google/chromedriver', chrome_options=option) + remote = "http://127.0.0.1:9999" + capabilities = { + "desiredCapabilities": { + "name": "webdriver", + "browserName": "chrome", + "acceptInsecureCerts": True, + # "goog:chromeOptions": {"extensions": [], "args": ["--headless"]}, + } + } + browser = AsyncDriver(remote, capabilities) + + # Wait 20 seconds for page to load + timeout = 20 + await browser.implicitly_wait(timeout) + + # Go to desired website + await browser.get("https://github.com/TheDancerCodes") + + # try: + # # Wait until the final element [Avatar link] is loaded. + # # Assumption: If Avatar link is loaded, the whole page would be relatively loaded because it is among + # # the last things to be loaded. 
+ # WebDriverWait(browser, timeout).until( + # EC.visibility_of_element_located( + # (By.XPATH, "//img[@class='avatar width-full rounded-2']") + # ) + # ) + # except TimeoutException: + # print("Timed out waiting for page to load") + # browser.quit() + + # Get all of the titles for the pinned repositories + # We are not just getting pure titles but we are getting a selenium object + # with selenium elements of the titles. + + # find_elements_by_xpath - Returns an array of selenium objects. + # titles_element = browser.find_elements_by_xpath("//a[@class='text-bold']") + titles_element = await browser.find_elements(By.XPATH, "//a[@class='text-bold']") + + # List Comprehension to get the actual repo titles and not the selenium objects. + titles = [x.text for x in titles_element] + + # print response in terminal + print("TITLES:") + print(titles, "\n") + + # Get all of the pinned repo languages + # language_element = browser.find_elements_by_xpath("//p[@class='mb-0 f6 text-gray']") + language_element = await browser.find_elements( + By.XPATH, "//p[@class='mb-0 f6 text-gray']" + ) + languages = [ + x.text for x in language_element + ] # same concept as for-loop/ list-comprehension above. 
+ + # print response in terminal + print("LANGUAGES:") + print(languages, "\n") + + # Pair each title with its corresponding language using zip function and print each pair + for title, language in zip(titles, languages): + print("RepoName : Language") + print(title + ": " + language, "\n") + + +if __name__ == "__main__": + asyncio.run(async_scraping()) From 713f7cba96fe038b52dce26f1811b75469c2642f Mon Sep 17 00:00:00 2001 From: Douglas Cardoso Date: Sun, 23 Jul 2023 02:30:31 -0300 Subject: [PATCH 2/2] fix caqui version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 53ba8e8..c9dc8c6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -caqui==2.0.0rc3 \ No newline at end of file +caqui \ No newline at end of file