Skip to content

Commit 75c4208

Browse files
committed
Add books example spider
1 parent b7c7374 commit 75c4208

File tree

2 files changed

+64
-0
lines changed

2 files changed

+64
-0
lines changed

examples/books.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import hashlib
2+
from pathlib import Path
3+
from typing import Generator
4+
5+
from scrapy import Spider
6+
from scrapy.crawler import CrawlerProcess
7+
from scrapy.http.response import Response
8+
9+
10+
class BooksSpider(Spider):
    """Crawl books.toscrape.com, yielding one item per book and saving a
    full-page screenshot of each book page via scrapy-playwright.

    Items are exported to ``books.json`` (see ``FEEDS``); screenshots are
    written to a ``books`` directory next to this file, named by the MD5
    of the page URL.
    """

    name = "books"
    start_urls = ["http://books.toscrape.com"]
    custom_settings = {
        "CLOSESPIDER_ITEMCOUNT": 100,
        "CONCURRENT_REQUESTS": 32,
        "FEEDS": {
            "books.json": {"format": "json", "encoding": "utf-8", "indent": 4},
        },
    }

    def parse(self, response: Response) -> Generator:
        """Schedule the remaining listing pages and follow every book link.

        The first listing page shows "Page 1 of N"; pages 2..N are requested
        directly. Each book detail page is fetched through Playwright
        (``playwright_include_page``) so ``parse_book`` can take a screenshot.
        """
        # Raw string: "\d" in a plain literal is an invalid escape sequence
        # (SyntaxWarning from Python 3.12, error in future versions).
        page_count = response.css(".pager .current::text").re_first(
            r"Page \d+ of (\d+)"
        )
        # Guard: the pager may be absent (single page / layout change);
        # the original int(None) would raise TypeError and kill the callback.
        if page_count is not None:
            for page in range(2, int(page_count) + 1):
                yield response.follow(f"/catalogue/page-{page}.html")

        for book in response.css("article.product_pod a"):
            yield response.follow(
                book,
                callback=self.parse_book,
                meta={"playwright": True, "playwright_include_page": True},
            )

    async def parse_book(self, response: Response) -> dict:
        """Extract one book's fields and save a full-page screenshot.

        Returns a plain dict item; the screenshot filename is the MD5 of the
        page URL, so re-crawling a page overwrites the same file.
        """
        url_md5 = hashlib.md5(response.url.encode("utf-8")).hexdigest()
        page = response.meta["playwright_page"]
        try:
            await page.screenshot(
                path=Path(__file__).parent / "books" / f"{url_md5}.png",
                full_page=True,
            )
        finally:
            # Always release the Playwright page, even if the screenshot
            # fails — otherwise pages leak for the rest of the crawl.
            await page.close()
        return {
            "url": response.url,
            "title": response.css("h1::text").get(),
            "price": response.css("p.price_color::text").get(),
            "breadcrumbs": response.css(".breadcrumb a::text").getall(),
            "image": f"books/{url_md5}.png",
        }
50+
51+
52+
if __name__ == "__main__":
    # scrapy-playwright requires the asyncio Twisted reactor; only plain-HTTP
    # downloads are routed through Playwright here (the target site is http).
    settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            # "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }
    crawler = CrawlerProcess(settings=settings)
    crawler.crawl(BooksSpider)
    crawler.start()

examples/books/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.png

0 commit comments

Comments
 (0)