diff --git a/nginx.conf b/nginx.conf index 3b52b274c4..48bf0142ca 100644 --- a/nginx.conf +++ b/nginx.conf @@ -346,11 +346,39 @@ server { # Rename output schema to dataset schema rewrite ^/platform/actors/development/actor-definition/output-schema$ /platform/actors/development/actor-definition/dataset-schema permanent; - rewrite ^academy/deploying-your-code/output-schema$ /academy/deploying-your-code/dataset-schema permanent; + rewrite ^/academy/deploying-your-code/output-schema$ /academy/deploying-your-code/dataset-schema permanent; # Academy restructuring - rewrite ^academy/advanced-web-scraping/scraping-paginated-sites$ /academy/advanced-web-scraping/crawling/crawling-with-search permanent; - rewrite ^academy/php$ /academy/php/use-apify-from-php redirect; # not permanent in case we want to reuse /php in the future + rewrite ^/academy/advanced-web-scraping/scraping-paginated-sites$ /academy/advanced-web-scraping/crawling/crawling-with-search permanent; + rewrite ^/academy/php$ /academy/php/use-apify-from-php redirect; # not permanent in case we want to reuse /php in the future + + # Academy: replacing the 'Web Scraping for Beginners' course + rewrite ^/academy/web-scraping-for-beginners/best-practices$ /academy/scraping-basics-javascript?legacy-js-course=/best-practices permanent; + rewrite ^/academy/web-scraping-for-beginners/introduction$ /academy/scraping-basics-javascript?legacy-js-course=/introduction permanent; + rewrite ^/academy/web-scraping-for-beginners/challenge/initializing-and-setting-up$ /academy/scraping-basics-javascript?legacy-js-course=/challenge/initializing-and-setting-up permanent; + rewrite ^/academy/web-scraping-for-beginners/challenge/modularity$ /academy/scraping-basics-javascript?legacy-js-course=/challenge/modularity permanent; + rewrite ^/academy/web-scraping-for-beginners/challenge/scraping-amazon$ /academy/scraping-basics-javascript?legacy-js-course=/challenge/scraping-amazon permanent; + rewrite ^/academy/web-scraping-for-beginners/challenge$ /academy/scraping-basics-javascript?legacy-js-course=/challenge permanent; + rewrite ^/academy/web-scraping-for-beginners/crawling/exporting-data$ /academy/scraping-basics-javascript/framework?legacy-js-course=/crawling/exporting-data permanent; + rewrite ^/academy/web-scraping-for-beginners/crawling/filtering-links$ /academy/scraping-basics-javascript/getting-links?legacy-js-course=/crawling/filtering-links permanent; + rewrite ^/academy/web-scraping-for-beginners/crawling/finding-links$ /academy/scraping-basics-javascript/getting-links?legacy-js-course=/crawling/finding-links permanent; + rewrite ^/academy/web-scraping-for-beginners/crawling/first-crawl$ /academy/scraping-basics-javascript/crawling?legacy-js-course=/crawling/first-crawl permanent; + rewrite ^/academy/web-scraping-for-beginners/crawling/headless-browser$ /academy/scraping-basics-javascript?legacy-js-course=/crawling/headless-browser permanent; + rewrite ^/academy/web-scraping-for-beginners/crawling/pro-scraping$ /academy/scraping-basics-javascript/framework?legacy-js-course=/crawling/pro-scraping permanent; + rewrite ^/academy/web-scraping-for-beginners/crawling/recap-extraction-basics$ /academy/scraping-basics-javascript/extracting-data?legacy-js-course=/crawling/recap-extraction-basics permanent; + rewrite ^/academy/web-scraping-for-beginners/crawling/relative-urls$ /academy/scraping-basics-javascript/getting-links?legacy-js-course=/crawling/relative-urls permanent; + rewrite ^/academy/web-scraping-for-beginners/crawling/scraping-the-data$ 
/academy/scraping-basics-javascript/scraping-variants?legacy-js-course=/crawling/scraping-the-data permanent; + rewrite ^/academy/web-scraping-for-beginners/crawling$ /academy/scraping-basics-javascript/crawling?legacy-js-course=/crawling permanent; + rewrite ^/academy/web-scraping-for-beginners/data-extraction/browser-devtools$ /academy/scraping-basics-javascript/devtools-inspecting?legacy-js-course=/data-extraction/browser-devtools permanent; + rewrite ^/academy/web-scraping-for-beginners/data-extraction/computer-preparation$ /academy/scraping-basics-javascript/downloading-html?legacy-js-course=/data-extraction/computer-preparation permanent; + rewrite ^/academy/web-scraping-for-beginners/data-extraction/devtools-continued$ /academy/scraping-basics-javascript/devtools-extracting-data?legacy-js-course=/data-extraction/devtools-continued permanent; + rewrite ^/academy/web-scraping-for-beginners/data-extraction/node-continued$ /academy/scraping-basics-javascript/extracting-data?legacy-js-course=/data-extraction/node-continued permanent; + rewrite ^/academy/web-scraping-for-beginners/data-extraction/node-js-scraper$ /academy/scraping-basics-javascript/downloading-html?legacy-js-course=/data-extraction/node-js-scraper permanent; + rewrite ^/academy/web-scraping-for-beginners/data-extraction/project-setup$ /academy/scraping-basics-javascript/downloading-html?legacy-js-course=/data-extraction/project-setup permanent; + rewrite ^/academy/web-scraping-for-beginners/data-extraction/save-to-csv$ /academy/scraping-basics-javascript/saving-data?legacy-js-course=/data-extraction/save-to-csv permanent; + rewrite ^/academy/web-scraping-for-beginners/data-extraction/using-devtools$ /academy/scraping-basics-javascript/devtools-locating-elements?legacy-js-course=/data-extraction/using-devtools permanent; + rewrite ^/academy/web-scraping-for-beginners/data-extraction$ /academy/scraping-basics-javascript/devtools-inspecting?legacy-js-course=/data-extraction permanent; + rewrite ^/academy/web-scraping-for-beginners$ /academy/scraping-basics-javascript?legacy-js-course=/ permanent; # Removed pages # GPT plugins were discontinued April 9th, 2024 - https://help.openai.com/en/articles/8988022-winding-down-the-chatgpt-plugins-beta diff --git a/sources/academy/glossary/concepts/robot_process_automation.md b/sources/academy/glossary/concepts/robot_process_automation.md index 27d61dcdee..3671fe7cb6 100644 --- a/sources/academy/glossary/concepts/robot_process_automation.md +++ b/sources/academy/glossary/concepts/robot_process_automation.md @@ -1,12 +1,10 @@ --- -title: Robotic process automation +title: What is robotic process automation (RPA)? description: Learn the basics of robotic process automation. Make your processes on the web and other software more efficient by automating repetitive tasks. sidebar_position: 8.7 slug: /concepts/robotic-process-automation --- -# What is robotic process automation (RPA)? {#what-is-robotic-process-automation-rpa} - **Learn the basics of robotic process automation. Make your processes on the web and other software more efficient by automating repetitive tasks.** --- @@ -31,7 +29,7 @@ With the advance of [machine learning](https://en.wikipedia.org/wiki/Machine_lea ## Is RPA the same as web scraping? {#is-rpa-the-same-as-web-scraping} -While [web scraping](../../webscraping/scraping_basics_javascript/index.md) is a kind of RPA, it focuses on extracting structured data. RPA focuses on the other tasks in browsers - everything except for extracting information.
+While web scraping is a kind of RPA, it focuses on extracting structured data. RPA focuses on the other tasks in browsers - everything except for extracting information. ## Additional resources {#additional-resources} diff --git a/sources/academy/glossary/tools/apify_cli.md b/sources/academy/glossary/tools/apify_cli.md index 82cb187e77..64c8b11218 100644 --- a/sources/academy/glossary/tools/apify_cli.md +++ b/sources/academy/glossary/tools/apify_cli.md @@ -15,7 +15,7 @@ The [Apify CLI](/cli) helps you create, develop, build and run Apify Actors, and ## Installing {#installing} -To install the Apfiy CLI, you'll first need npm, which comes preinstalled with Node.js. If you haven't yet installed Node, learn how to do that [here](../../webscraping/scraping_basics_javascript/data_extraction/computer_preparation.md). Additionally, make sure you've got an Apify account, as you will need to log in to the CLI to gain access to its full potential. +To install the Apify CLI, you'll first need npm, which comes preinstalled with Node.js. Additionally, make sure you've got an Apify account, as you will need to log in to the CLI to gain access to its full potential. Open up a terminal instance and run the following command: diff --git a/sources/academy/homepage_content.json b/sources/academy/homepage_content.json index 264993d52b..7c436913ce 100644 --- a/sources/academy/homepage_content.json +++ b/sources/academy/homepage_content.json @@ -2,7 +2,7 @@ "Beginner courses": [ { "title": "Web scraping basics with JS", - "link": "/academy/web-scraping-for-beginners", + "link": "/academy/scraping-basics-javascript", "description": "Learn how to use JavaScript to extract information from websites in this practical course, starting from the absolute basics.", "imageUrl": "/img/academy/intro.svg" }, diff --git a/sources/academy/platform/expert_scraping_with_apify/actors_webhooks.md b/sources/academy/platform/expert_scraping_with_apify/actors_webhooks.md index 53814c0033..bbb59ac3bb 100644 --- a/sources/academy/platform/expert_scraping_with_apify/actors_webhooks.md +++ b/sources/academy/platform/expert_scraping_with_apify/actors_webhooks.md @@ -1,21 +1,24 @@ --- -title: I - Webhooks & advanced Actor overview +title: Webhooks & advanced Actor overview description: Learn more advanced details about Actors, how they work, and the default configurations they can take. Also, learn how to integrate your Actor with webhooks. sidebar_position: 6.1 +sidebar_label: I - Webhooks & advanced Actor overview slug: /expert-scraping-with-apify/actors-webhooks --- -# Webhooks & advanced Actor overview {#webhooks-and-advanced-actors} - **Learn more advanced details about Actors, how they work, and the default configurations they can take. Also, learn how to integrate your Actor with webhooks.** +:::caution Updates coming +This lesson is subject to change because it currently relies on code from our archived **Web scraping basics for JavaScript devs** course. For now you can still access the archived course, but we plan to completely retire it in a few months. This lesson will be updated to remove the dependency. +::: + --- Thus far, you've run Actors on the platform and written an Actor of your own, which you published to the platform yourself using the Apify CLI; therefore, it's fair to say that you are becoming more familiar and comfortable with the concept of **Actors**. Within this lesson, we'll take a more in-depth look at Actors and what they can do.
## Advanced Actor overview {#advanced-actors} -In this course, we'll be working out of the Amazon scraper project from the **Web scraping basics for JavaScript devs** course. If you haven't already built that project, you can do it in three short lessons [here](../../webscraping/scraping_basics_javascript/challenge/index.md). We've made a few small modifications to the project with the Apify SDK, but 99% of the code is still the same. +In this course, we'll be working out of the Amazon scraper project from the **Web scraping basics for JavaScript devs** course. If you haven't already built that project, you can do it in [three short lessons](../../webscraping/scraping_basics_legacy/challenge/index.md). We've made a few small modifications to the project with the Apify SDK, but 99% of the code is still the same. Take another look at the files within your Amazon scraper project. You'll notice that there is a **Dockerfile**. Every single Actor has a Dockerfile (the Actor's **Image**) which tells Docker how to spin up a container on the Apify platform which can successfully run the Actor's code. "Apify Actors" is a serverless platform that runs multiple Docker containers. For a deeper understanding of Actor Dockerfiles, refer to the [Apify Actor Dockerfile docs](/sdk/js/docs/guides/docker-images#example-dockerfile). @@ -41,7 +44,7 @@ Prior to moving forward, please read over these resources: ## Our task {#our-task} -In this task, we'll be building on top of what we already created in the [Web scraping basics for JavaScript devs](/academy/web-scraping-for-beginners/challenge) course's final challenge, so keep those files safe! +In this task, we'll be building on top of what we already created in the [Web scraping basics for JavaScript devs](../../webscraping/scraping_basics_legacy/challenge/index.md) course's final challenge, so keep those files safe! Once our Amazon Actor has completed its run, we will, rather than sending an email to ourselves, call an Actor through a webhook. The Actor called will be a new Actor that we will create together, which will take the dataset ID as input, then subsequently filter through all of the results and return only the cheapest one for each product. All of the results of the Actor will be pushed to its default dataset. diff --git a/sources/academy/platform/expert_scraping_with_apify/apify_api_and_client.md b/sources/academy/platform/expert_scraping_with_apify/apify_api_and_client.md index 02e55777f3..15e5e42266 100644 --- a/sources/academy/platform/expert_scraping_with_apify/apify_api_and_client.md +++ b/sources/academy/platform/expert_scraping_with_apify/apify_api_and_client.md @@ -1,12 +1,11 @@ --- -title: IV - Apify API & client +title: Apify API & client description: Gain an in-depth understanding of the two main ways of programmatically interacting with the Apify platform - through the API, and through a client. 
sidebar_position: 6.4 +sidebar_label: IV - Apify API & client slug: /expert-scraping-with-apify/apify-api-and-client --- -# Apify API & client {#api-and-client} - **Gain an in-depth understanding of the two main ways of programmatically interacting with the Apify platform - through the API, and through a client.** --- diff --git a/sources/academy/platform/expert_scraping_with_apify/bypassing_anti_scraping.md b/sources/academy/platform/expert_scraping_with_apify/bypassing_anti_scraping.md index ccc9c62f3e..e5f034e1aa 100644 --- a/sources/academy/platform/expert_scraping_with_apify/bypassing_anti_scraping.md +++ b/sources/academy/platform/expert_scraping_with_apify/bypassing_anti_scraping.md @@ -1,12 +1,11 @@ --- -title: VI - Bypassing anti-scraping methods +title: Bypassing anti-scraping methods description: Learn about bypassing anti-scraping methods using proxies and proxy/session rotation together with Crawlee and the Apify SDK. sidebar_position: 6.6 +sidebar_label: VI - Bypassing anti-scraping methods slug: /expert-scraping-with-apify/bypassing-anti-scraping --- -# Bypassing anti-scraping methods {#bypassing-anti-scraping-methods} - **Learn about bypassing anti-scraping methods using proxies and proxy/session rotation together with Crawlee and the Apify SDK.** --- diff --git a/sources/academy/platform/expert_scraping_with_apify/index.md b/sources/academy/platform/expert_scraping_with_apify/index.md index 95bc0a92c7..7d3689b1da 100644 --- a/sources/academy/platform/expert_scraping_with_apify/index.md +++ b/sources/academy/platform/expert_scraping_with_apify/index.md @@ -20,13 +20,9 @@ Before developing a pro-level Apify scraper, there are some important things you > If you've already gone through the [Web scraping basics for JavaScript devs](../../webscraping/scraping_basics_javascript/index.md) and the first courses of the [Apify platform category](../apify_platform.md), you will be more than well equipped to continue on with the lessons in this course. - - ### Crawlee, Apify SDK, and the Apify CLI {#crawlee-apify-sdk-and-cli} -If you're feeling ambitious, you don't need to have any prior experience with Crawlee to get started with this course; however, at least 5–10 minutes of exposure is recommended. If you haven't yet tried out Crawlee, you can refer to [this lesson](../../webscraping/scraping_basics_javascript/crawling/pro_scraping.md) in the **Web scraping basics for JavaScript devs** course (and ideally follow along). To familiarize yourself with the Apify SDK, you can refer to the [Apify Platform](../apify_platform.md) category. +If you're feeling ambitious, you don't need to have any prior experience with Crawlee to get started with this course; however, at least 5–10 minutes of exposure is recommended. If you haven't yet tried out Crawlee, you can refer to the [Using a scraping framework with Node.js](../../webscraping/scraping_basics_javascript/12_framework.md) lesson of the **Web scraping basics for JavaScript devs** course. To familiarize yourself with the Apify SDK, you can refer to the [Apify Platform](../apify_platform.md) category. The Apify CLI will play a core role in the running and testing of the Actor you will build, so if you haven't gotten it installed already, please refer to [this short lesson](../../glossary/tools/apify_cli.md).
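If you haven't used Crawlee before, the following is a minimal sketch of the kind of crawler the referenced framework lesson builds: a `CheerioCrawler` that logs page titles and follows links. The start URL and the request cap are illustrative placeholders, not code from the course:

```js
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    // Keep the example crawl small; tune or remove the cap as needed.
    maxRequestsPerCrawl: 10,
    async requestHandler({ request, $, enqueueLinks }) {
        // Cheerio's $ comes preloaded with the fetched page's HTML.
        console.log(`${request.url}: ${$('title').text()}`);
        // Enqueue links discovered on the page (same hostname by default).
        await enqueueLinks();
    },
});

await crawler.run(['https://crawlee.dev']); // illustrative start URL
```

Crawlee takes care of the request queue, retries, and concurrency around this single handler, which is what the lesson demonstrates in more depth.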
diff --git a/sources/academy/platform/expert_scraping_with_apify/managing_source_code.md b/sources/academy/platform/expert_scraping_with_apify/managing_source_code.md index 38d3fdaa95..e9cf36d342 100644 --- a/sources/academy/platform/expert_scraping_with_apify/managing_source_code.md +++ b/sources/academy/platform/expert_scraping_with_apify/managing_source_code.md @@ -1,12 +1,11 @@ --- -title: II - Managing source code +title: Managing source code description: Learn how to manage your Actor's source code more efficiently by integrating it with a GitHub repository. This is standard on the Apify platform. sidebar_position: 6.2 +sidebar_label: II - Managing source code slug: /expert-scraping-with-apify/managing-source-code --- -# Managing source code {#managing-source-code} - **Learn how to manage your Actor's source code more efficiently by integrating it with a GitHub repository. This is standard on the Apify platform.** --- diff --git a/sources/academy/platform/expert_scraping_with_apify/migrations_maintaining_state.md b/sources/academy/platform/expert_scraping_with_apify/migrations_maintaining_state.md index c59da80531..707e64fd7d 100644 --- a/sources/academy/platform/expert_scraping_with_apify/migrations_maintaining_state.md +++ b/sources/academy/platform/expert_scraping_with_apify/migrations_maintaining_state.md @@ -1,12 +1,11 @@ --- -title: V - Migrations & maintaining state +title: Migrations & maintaining state description: Learn about what Actor migrations are and how to handle them properly so that the state is not lost and runs can safely be resurrected. sidebar_position: 6.5 +sidebar_label: V - Migrations & maintaining state slug: /expert-scraping-with-apify/migrations-maintaining-state --- -# Migrations & maintaining state {#migrations-maintaining-state} - **Learn about what Actor migrations are and how to handle them properly so that the state is not lost and runs can safely be resurrected.** --- diff --git a/sources/academy/platform/expert_scraping_with_apify/saving_useful_stats.md b/sources/academy/platform/expert_scraping_with_apify/saving_useful_stats.md index bcb7cee71d..6bc13433f1 100644 --- a/sources/academy/platform/expert_scraping_with_apify/saving_useful_stats.md +++ b/sources/academy/platform/expert_scraping_with_apify/saving_useful_stats.md @@ -1,19 +1,18 @@ --- -title: VII - Saving useful run statistics +title: Saving useful run statistics description: Understand how to save statistics about an Actor's run, what types of statistics you can save, and why you might want to save them for a large-scale scraper. sidebar_position: 6.7 +sidebar_label: VII - Saving useful run statistics slug: /expert-scraping-with-apify/saving-useful-stats --- -# Saving useful run statistics {#savings-useful-run-statistics} - **Understand how to save statistics about an Actor's run, what types of statistics you can save, and why you might want to save them for a large-scale scraper.** --- Using Crawlee and the Apify SDK, we are now able to collect and format data coming directly from websites and save it into a Key-Value store or Dataset. This is great, but sometimes, we want to store some extra data about the run itself, or about each request. We might want to store some extra general run information separately from our results or potentially include statistics about each request within its corresponding dataset item. 
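As a rough sketch of the idea above (the storage key, counters, and persistence interval are illustrative assumptions, not the course's actual implementation), an Actor can keep a plain stats object in memory and periodically persist it to the key-value store:

```js
import { Actor } from 'apify';

await Actor.init();

// Illustrative counters; track whatever is useful for your project.
const stats = { itemsSaved: 0, retries: 0, captchasHit: 0 };

// Persist a snapshot every 10 seconds under an arbitrary key so the
// numbers survive crashes and migrations.
const interval = setInterval(() => Actor.setValue('RUN-STATS', stats), 10_000);

// ... the crawler would update the counters while running, e.g.:
stats.itemsSaved += 1;

// Write a final snapshot before exiting.
clearInterval(interval);
await Actor.setValue('RUN-STATS', stats);
await Actor.exit();
```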
-The types of values that are saved are totally up to you, but the most common are error scores, number of total saved items, number of request retries, number of captchas hit, etc. Storing these values is not always necessary, but can be valuable when debugging and maintaining an Actor. As your projects scale, this will become more and more useful and important. +The types of values that are saved are totally up to you, but the most common are error scores, number of total saved items, number of request retries, number of CAPTCHAs hit, etc. Storing these values is not always necessary, but can be valuable when debugging and maintaining an Actor. As your projects scale, this will become more and more useful and important. ## Learning 🧠 {#learning} diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/handling_migrations.md b/sources/academy/platform/expert_scraping_with_apify/solutions/handling_migrations.md index 635971ff65..24399f8df6 100644 --- a/sources/academy/platform/expert_scraping_with_apify/solutions/handling_migrations.md +++ b/sources/academy/platform/expert_scraping_with_apify/solutions/handling_migrations.md @@ -1,12 +1,11 @@ --- -title: V - Handling migrations +title: Handling migrations description: Get real-world experience of maintaining a stateful object stored in memory, which will be persisted through migrations and even graceful aborts. sidebar_position: 5 +sidebar_label: V - Handling migrations slug: /expert-scraping-with-apify/solutions/handling-migrations --- -# Handling migrations {#handling-migrations} - **Get real-world experience of maintaining a stateful object stored in memory, which will be persisted through migrations and even graceful aborts.** --- diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/index.md b/sources/academy/platform/expert_scraping_with_apify/solutions/index.md index e9d71e69db..171cfd8dc3 100644 --- a/sources/academy/platform/expert_scraping_with_apify/solutions/index.md +++ b/sources/academy/platform/expert_scraping_with_apify/solutions/index.md @@ -5,8 +5,6 @@ sidebar_position: 6.7 slug: /expert-scraping-with-apify/solutions --- -# Solutions - **View all of the solutions for all of the activities and tasks of this course. Please try to complete each task on your own before reading the solution!** --- diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/integrating_webhooks.md b/sources/academy/platform/expert_scraping_with_apify/solutions/integrating_webhooks.md index 301eab24d2..2a22f0fc78 100644 --- a/sources/academy/platform/expert_scraping_with_apify/solutions/integrating_webhooks.md +++ b/sources/academy/platform/expert_scraping_with_apify/solutions/integrating_webhooks.md @@ -1,14 +1,17 @@ --- -title: I - Integrating webhooks +title: Integrating webhooks description: Learn how to integrate webhooks into your Actors. Webhooks are a super powerful tool, and can be used to do almost anything! sidebar_position: 1 +sidebar_label: I - Integrating webhooks slug: /expert-scraping-with-apify/solutions/integrating-webhooks --- -# Integrating webhooks {#integrating-webhooks} - **Learn how to integrate webhooks into your Actors. Webhooks are a super powerful tool, and can be used to do almost anything!** +:::caution Updates coming +This lesson is subject to change because it currently relies on code from our archived **Web scraping basics for JavaScript devs** course. For now you can still access the archived course, but we plan to completely retire it in a few months.
This lesson will be updated to remove the dependency. +::: + --- In this lesson we'll be writing a new Actor and integrating it with our beloved Amazon scraping Actor. First, we'll navigate to the same directory where our **demo-actor** folder lives, and run `apify create filter-actor` _(once again, you can name the Actor whatever you want, but for this lesson, we'll be calling the new Actor **filter-actor**)_. When prompted about the programming language, select **JavaScript**: diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/managing_source.md b/sources/academy/platform/expert_scraping_with_apify/solutions/managing_source.md index 2273af81ed..9fed4b5d2d 100644 --- a/sources/academy/platform/expert_scraping_with_apify/solutions/managing_source.md +++ b/sources/academy/platform/expert_scraping_with_apify/solutions/managing_source.md @@ -1,12 +1,11 @@ --- -title: II - Managing source +title: Managing source description: View in-depth answers for all three of the quiz questions that were provided in the corresponding lesson about managing source code. sidebar_position: 2 +sidebar_label: II - Managing source slug: /expert-scraping-with-apify/solutions/managing-source --- -# Managing source - **View in-depth answers for all three of the quiz questions that were provided in the corresponding lesson about managing source code.** --- diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/rotating_proxies.md b/sources/academy/platform/expert_scraping_with_apify/solutions/rotating_proxies.md index 04fdd869d6..88755208eb 100644 --- a/sources/academy/platform/expert_scraping_with_apify/solutions/rotating_proxies.md +++ b/sources/academy/platform/expert_scraping_with_apify/solutions/rotating_proxies.md @@ -1,12 +1,11 @@ --- -title: VI - Rotating proxies/sessions +title: Rotating proxies/sessions description: Learn firsthand how to rotate proxies and sessions in order to avoid the majority of the most common anti-scraping protections. sidebar_position: 6 +sidebar_label: VI - Rotating proxies/sessions slug: /expert-scraping-with-apify/solutions/rotating-proxies --- -# Rotating proxies/sessions {#rotating-proxy-sessions} - **Learn firsthand how to rotate proxies and sessions in order to avoid the majority of the most common anti-scraping protections.** --- diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/saving_stats.md b/sources/academy/platform/expert_scraping_with_apify/solutions/saving_stats.md index a730536c7d..3915dee01c 100644 --- a/sources/academy/platform/expert_scraping_with_apify/solutions/saving_stats.md +++ b/sources/academy/platform/expert_scraping_with_apify/solutions/saving_stats.md @@ -1,12 +1,11 @@ --- -title: VII - Saving run stats +title: Saving run stats description: Implement the saving of general statistics about an Actor's run, as well as adding request-specific statistics to dataset items. 
sidebar_position: 7 +sidebar_label: VII - Saving run stats slug: /expert-scraping-with-apify/solutions/saving-stats --- -# Saving run stats {#saving-stats} - **Implement the saving of general statistics about an Actor's run, as well as adding request-specific statistics to dataset items.** --- diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/using_api_and_client.md b/sources/academy/platform/expert_scraping_with_apify/solutions/using_api_and_client.md index c5633be937..e2cf1cf0d3 100644 --- a/sources/academy/platform/expert_scraping_with_apify/solutions/using_api_and_client.md +++ b/sources/academy/platform/expert_scraping_with_apify/solutions/using_api_and_client.md @@ -1,12 +1,11 @@ --- -title: IV - Using the Apify API & JavaScript client +title: Using the Apify API & JavaScript client description: Learn how to interact with the Apify API directly through the well-documented RESTful routes, or by using the proprietary Apify JavaScript client. sidebar_position: 4 +sidebar_label: IV - Using the Apify API & JavaScript client slug: /expert-scraping-with-apify/solutions/using-api-and-client --- -# Using the Apify API & JavaScript client {#using-api-and-client} - **Learn how to interact with the Apify API directly through the well-documented RESTful routes, or by using the proprietary Apify JavaScript client.** --- diff --git a/sources/academy/platform/expert_scraping_with_apify/solutions/using_storage_creating_tasks.md b/sources/academy/platform/expert_scraping_with_apify/solutions/using_storage_creating_tasks.md index 5c01c45a8f..4df5c3d43f 100644 --- a/sources/academy/platform/expert_scraping_with_apify/solutions/using_storage_creating_tasks.md +++ b/sources/academy/platform/expert_scraping_with_apify/solutions/using_storage_creating_tasks.md @@ -1,12 +1,11 @@ --- -title: III - Using storage & creating tasks +title: Using storage & creating tasks description: Get quiz answers and explanations for the lesson about using storage and creating tasks on the Apify platform. sidebar_position: 3 +sidebar_label: III - Using storage & creating tasks slug: /expert-scraping-with-apify/solutions/using-storage-creating-tasks --- -# Using storage & creating tasks {#using-storage-creating-tasks} - ## Quiz answers π {#quiz-answers} **Q: What is the relationship between Actors and tasks?** diff --git a/sources/academy/platform/expert_scraping_with_apify/tasks_and_storage.md b/sources/academy/platform/expert_scraping_with_apify/tasks_and_storage.md index d18009c241..ea5640c14d 100644 --- a/sources/academy/platform/expert_scraping_with_apify/tasks_and_storage.md +++ b/sources/academy/platform/expert_scraping_with_apify/tasks_and_storage.md @@ -1,12 +1,11 @@ --- -title: III - Tasks & storage +title: Tasks & storage description: Understand how to save the configurations for Actors with Actor tasks. Also, learn about storage and the different types Apify offers. sidebar_position: 6.3 +sidebar_label: III - Tasks & storage slug: /expert-scraping-with-apify/tasks-and-storage --- -# Tasks & storage {#tasks-and-storage} - **Understand how to save the configurations for Actors with Actor tasks. 
Also, learn about storage and the different types Apify offers.** --- diff --git a/sources/academy/tutorials/node_js/analyzing_pages_and_fixing_errors.md b/sources/academy/tutorials/node_js/analyzing_pages_and_fixing_errors.md index 892a3dd59b..d2314a7bd7 100644 --- a/sources/academy/tutorials/node_js/analyzing_pages_and_fixing_errors.md +++ b/sources/academy/tutorials/node_js/analyzing_pages_and_fixing_errors.md @@ -5,8 +5,6 @@ sidebar_position: 14.1 slug: /node-js/analyzing-pages-and-fixing-errors --- -# How to analyze and fix errors when scraping a website {#scraping-with-sitemaps} - **Learn how to deal with random crashes in your web-scraping and automation jobs. Find out the essentials of debugging and fixing problems in your crawlers.** --- @@ -71,8 +69,6 @@ try { } ``` -Read more information about logging and error handling in our developer [best practices](../../webscraping/scraping_basics_javascript/best_practices.md) section. - ### Saving snapshots {#saving-snapshots} By snapshots, we mean **screenshots** if you use a [browser with Puppeteer/Playwright](../../webscraping/puppeteer_playwright/index.md) and HTML saved into a [key-value store](https://crawlee.dev/api/core/class/KeyValueStore) that you can display in your own browser. Snapshots are useful throughout your code but especially important in error handling. diff --git a/sources/academy/tutorials/node_js/dealing_with_dynamic_pages.md b/sources/academy/tutorials/node_js/dealing_with_dynamic_pages.md index 21b8aee9a7..d1c1b60ecc 100644 --- a/sources/academy/tutorials/node_js/dealing_with_dynamic_pages.md +++ b/sources/academy/tutorials/node_js/dealing_with_dynamic_pages.md @@ -7,8 +7,6 @@ slug: /node-js/dealing-with-dynamic-pages import Example from '!!raw-loader!roa-loader!./dealing_with_dynamic_pages.js'; -# How to scrape from dynamic pages {#dealing-with-dynamic-pages} - **Learn about dynamic pages and dynamic content. How can we find out if a page is dynamic? How do we programmatically scrape dynamic content?** --- @@ -43,7 +41,7 @@ If you're in a brand new project, don't forget to initialize your project, then npm init -y && npm i crawlee ``` -Now, let's write some data extraction code to extract each product's data. This should look familiar if you went through the [Data Extraction](../../webscraping/scraping_basics_javascript/data_extraction/index.md) lessons: +Now, let's write some data extraction code to extract each product's data. This should look familiar if you went through the [Web scraping basics for JavaScript devs](/academy/scraping-basics-javascript) course: ```js import { CheerioCrawler } from 'crawlee'; diff --git a/sources/academy/webscraping/advanced_web_scraping/crawling/sitemaps-vs-search.md b/sources/academy/webscraping/advanced_web_scraping/crawling/sitemaps-vs-search.md index f34b24d261..5fea5d36df 100644 --- a/sources/academy/webscraping/advanced_web_scraping/crawling/sitemaps-vs-search.md +++ b/sources/academy/webscraping/advanced_web_scraping/crawling/sitemaps-vs-search.md @@ -5,7 +5,7 @@ sidebar_position: 1 slug: /advanced-web-scraping/crawling/sitemaps-vs-search --- -The core crawling problem comes to down to ensuring that we reliably find all detail pages on the target website or inside its categories. This is trivial for small sites. We just open the home page or category pages and paginate to the end as we did in the [Web scraping basics for JavaScript devs](/academy/web-scraping-for-beginners) course. 
+The core crawling problem comes down to ensuring that we reliably find all detail pages on the target website or inside its categories. This is trivial for small sites. We just open the home page or category pages and paginate to the end. Unfortunately, _most modern websites restrict pagination_ only to somewhere between 1 and 10,000 products. Solving this problem might seem relatively straightforward at first but there are multiple hurdles that we will explore in this lesson. @@ -31,7 +31,7 @@ Sitemap is usually a simple XML file that contains a list of all pages on the we - _Does not directly reflect the website_ - There is no way you can ensure that all pages on the website are in the sitemap. The sitemap also can contain pages that were already removed and will return 404s. This is a major downside of sitemaps which prevents us from using them as the only source of URLs. - _Updated in intervals_ - Sitemaps are usually not updated in real-time. This means that you might miss some pages if you scrape them too soon after they were added to the website. Common update intervals are 1 day or 1 week. - _Hard to find or unavailable_ - Sitemaps are not always trivial to locate. They can be deployed on a CDN with unpredictable URLs. Sometimes they are not available at all. -- _Streamed, compressed, and archived_ - Sitemaps are often streamed and archived with .tgz extensions and compressed with gzip. This means that you cannot use default HTTP client settings and must handle these cases with extra code or use a scraping framework. +- _Streamed, compressed, and archived_ - Sitemaps are often streamed and archived with .tgz extensions and compressed with Gzip. This means that you cannot use default HTTP client settings and must handle these cases with extra code or use a scraping framework. ## Pros and cons of categories, search, and filters diff --git a/sources/academy/webscraping/advanced_web_scraping/index.md b/sources/academy/webscraping/advanced_web_scraping/index.md index fe58884117..33ffc603b0 100644 --- a/sources/academy/webscraping/advanced_web_scraping/index.md +++ b/sources/academy/webscraping/advanced_web_scraping/index.md @@ -6,7 +6,7 @@ category: web scraping & automation slug: /advanced-web-scraping --- -In the [Web scraping basics for JavaScript devs](/academy/web-scraping-for-beginners) course, we have learned the necessary basics required to create a scraper. In the following courses, we learned more about specific practices and techniques that will help us to solve most of the problems we will face. +In the [Web scraping basics for JavaScript devs](/academy/scraping-basics-javascript) course, we have learned the necessary basics required to create a scraper. In the following courses, we learned more about specific practices and techniques that will help us to solve most of the problems we will face. In this course, we will take all of that knowledge, add a few more advanced concepts, and apply them to learn how to build a production-ready web scraper.
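To make the compressed-sitemap caveat above concrete, here is a minimal sketch of fetching and unpacking a gzipped sitemap in plain Node.js. The URL is a made-up placeholder, and production code would likely need streaming and a proper XML parser:

```js
import { gunzipSync } from 'node:zlib';

// Made-up URL; a real sitemap's location must be discovered first.
const response = await fetch('https://example.com/sitemap.xml.gz');
const compressed = Buffer.from(await response.arrayBuffer());

// Plain HTTP clients won't unpack a .gz file for you.
const xml = gunzipSync(compressed).toString('utf8');

// Naive <loc> extraction; production code should use an XML parser.
const urls = [...xml.matchAll(/<loc>([^<]+)<\/loc>/g)].map((m) => m[1]);
console.log(`Sitemap lists ${urls.length} URLs`);
```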
diff --git a/sources/academy/webscraping/anti_scraping/mitigation/using_proxies.md b/sources/academy/webscraping/anti_scraping/mitigation/using_proxies.md index 819a50c6fc..fe57c53323 100644 --- a/sources/academy/webscraping/anti_scraping/mitigation/using_proxies.md +++ b/sources/academy/webscraping/anti_scraping/mitigation/using_proxies.md @@ -5,19 +5,21 @@ sidebar_position: 2 slug: /anti-scraping/mitigation/using-proxies --- -# Using proxies {#using-proxies} - **Learn how to use and automagically rotate proxies in your scrapers by using Crawlee, and a bit about how to obtain pools of proxies.** +:::caution Updates coming +This lesson is subject to change because it currently relies on code from our archived **Web scraping basics for JavaScript devs** course. For now you can still access the archived course, but we plan to completely retire it in a few months. This lesson will be updated to remove the dependency. +::: + --- -In the [**Web scraping basics for JavaScript devs**](../../scraping_basics_javascript/crawling/pro_scraping.md) course, we learned about the power of Crawlee, and how it can streamline the development process of web crawlers. You've already seen how powerful the `crawlee` package is; however, what you've been exposed to thus far is only the tip of the iceberg. +In the [**Web scraping basics for JavaScript devs**](../../scraping_basics_legacy/crawling/pro_scraping.md) course, we learned about the power of Crawlee, and how it can streamline the development process of web crawlers. You've already seen how powerful the `crawlee` package is; however, what you've been exposed to thus far is only the tip of the iceberg. Because proxies are so widely used in the scraping world, Crawlee has built-in features for implementing them in an effective way. One of the main functionalities that comes baked into Crawlee is proxy rotation, which is when each request is sent through a different proxy from a proxy pool. ## Implementing proxies in a scraper {#implementing-proxies} -Let's borrow some scraper code from the end of the [pro-scraping](../../scraping_basics_javascript/crawling/pro_scraping.md) lesson in our **Web scraping basics for JavaScript devs** course and paste it into a new file called **proxies.js**. This code enqueues all of the product links on [demo-webstore.apify.org](https://demo-webstore.apify.org)'s on-sale page, then makes a request to each product page and scrapes data about each one: +Let's borrow some scraper code from the end of the [pro-scraping](../../scraping_basics_legacy/crawling/pro_scraping.md) lesson in our **Web scraping basics for JavaScript devs** course and paste it into a new file called **proxies.js**. 
This code enqueues all of the product links on [demo-webstore.apify.org](https://demo-webstore.apify.org)'s on-sale page, then makes a request to each product page and scrapes data about each one: ```js // crawlee.js diff --git a/sources/academy/webscraping/puppeteer_playwright/executing_scripts/extracting_data.md b/sources/academy/webscraping/puppeteer_playwright/executing_scripts/extracting_data.md index 4fb52aa83a..f2ebab8b8d 100644 --- a/sources/academy/webscraping/puppeteer_playwright/executing_scripts/extracting_data.md +++ b/sources/academy/webscraping/puppeteer_playwright/executing_scripts/extracting_data.md @@ -8,13 +8,11 @@ slug: /puppeteer-playwright/executing-scripts/collecting-data import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Extracting data {#extracting-data} - **Learn how to extract data from a page with evaluate functions, then how to parse it by using a second library called Cheerio.** --- -Now that we know how to execute scripts on a page, we're ready to learn a bit about [data extraction](../../scraping_basics_javascript/data_extraction/index.md). In this lesson, we'll be scraping all the on-sale products from our [Fakestore](https://demo-webstore.apify.org/search/on-sale) website. Playwright & Puppeteer offer two main methods for data extraction: +Now that we know how to execute scripts on a page, we're ready to learn a bit about data extraction. In this lesson, we'll be scraping all the on-sale products from our [Fakestore](https://demo-webstore.apify.org/search/on-sale) website. Playwright & Puppeteer offer two main methods for data extraction: 1. Directly in `page.evaluate()` and other evaluate functions such as `page.$$eval()`. 2. In the Node.js context using a parsing library such as [Cheerio](https://www.npmjs.com/package/cheerio) diff --git a/sources/academy/webscraping/puppeteer_playwright/index.md b/sources/academy/webscraping/puppeteer_playwright/index.md index 77f8781993..6b40e6ddeb 100644 --- a/sources/academy/webscraping/puppeteer_playwright/index.md +++ b/sources/academy/webscraping/puppeteer_playwright/index.md @@ -1,5 +1,5 @@ --- -title: Puppeteer & Playwright +title: Puppeteer & Playwright course description: Learn in-depth how to use two of the most popular Node.js libraries for controlling a headless browser - Puppeteer and Playwright. sidebar_position: 3 category: web scraping & automation @@ -9,8 +9,6 @@ slug: /puppeteer-playwright import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Puppeteer & Playwright course {#puppeteer-playwright-course} - **Learn in-depth how to use two of the most popular Node.js libraries for controlling a headless browser - Puppeteer and Playwright.** --- @@ -63,8 +61,6 @@ npm install puppeteer -> For a more in-depth guide on how to set up the basic environment we'll be using in this tutorial, check out the [**Computer preparation**](../scraping_basics_javascript/data_extraction/computer_preparation.md) lesson in the **Web scraping basics for JavaScript devs** course - ## Course overview {#course-overview} 1. 
[Launching a browser](./browser.md) diff --git a/sources/academy/webscraping/puppeteer_playwright/page/interacting_with_a_page.md b/sources/academy/webscraping/puppeteer_playwright/page/interacting_with_a_page.md index ec1c5d0db7..ce247e84e6 100644 --- a/sources/academy/webscraping/puppeteer_playwright/page/interacting_with_a_page.md +++ b/sources/academy/webscraping/puppeteer_playwright/page/interacting_with_a_page.md @@ -8,15 +8,13 @@ slug: /puppeteer-playwright/page/interacting-with-a-page import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Interacting with a page {#interacting-with-a-page} - **Learn how to programmatically do actions on a page such as clicking, typing, and pressing keys. Also, discover a common roadblock that comes up when automating.** --- The **Page** object has a whole boat-load of functions which can be used to interact with the loaded page. We're not going to go over every single one of them right now, but we _will_ use a few of the most common ones to add some functionality to our current project. -Let's say that we want to automate searching for **hello world** on Google, then click on the first result and log the title of the page to the console, then take a screenshot and write it it to the filesystem. In order to understand how we're going to automate this, let's break down how we would do it manually: +Let's say that we want to automate searching for **hello world** on Google, then click on the first result and log the title of the page to the console, then take a screenshot and write it to the filesystem. In order to understand how we're going to automate this, let's break down how we would do it manually: 1. Click on the button which accepts Google's cookies policy (To see how it looks, open Google in an anonymous window.) 2. Type **hello world** into the search bar @@ -55,7 +53,7 @@ With `page.click()`, Puppeteer and Playwright actually drag the mouse and click, Notice that in the Playwright example, we are using a different selector than in the Puppeteer example. This is because Playwright supports [many custom CSS selectors](https://playwright.dev/docs/other-locators#css-elements-matching-one-of-the-conditions), such as the **has-text** pseudo class. As a rule of thumb, using text selectors is much more preferable to using regular selectors, as they are much less likely to break. If Google makes the sibling above the **Accept all** button a `