
Commit 241eff0

working shopify image scraping

1 parent: 83988a6

File tree: 9 files changed, +429 -4 lines

.gitignore

Lines changed: 2 additions & 0 deletions

```diff
@@ -31,3 +31,5 @@ yarn-error.log*
 
 # vercel
 .vercel
+
+shopify-scraper/scraped-images/
```

.nvmrc

Lines changed: 1 addition & 1 deletion

```diff
@@ -1 +1 @@
-12
+14
```

explorer/package.json

Lines changed: 2 additions & 1 deletion

```diff
@@ -1,6 +1,7 @@
 {
   "name": "explodingdog-explorer",
-  "version": "0.4.0",
+  "version": "2.0.0",
+  "private": true,
   "description": "explodingdog explorer. An alternate explodingdog experience.",
   "author": "Patrick Fisher <patrick@pwfisher.com>",
   "license": "MIT",
```

scraper/package.json

Lines changed: 3 additions & 2 deletions

```diff
@@ -1,9 +1,10 @@
 {
   "name": "explodingdog-scraper",
   "version": "1.0.0",
+  "private": true,
   "description": "Scrape explodingdog.com.",
-  "main": "server.js",
-  "author": "pwfisher",
+  "author": "Patrick Fisher <patrick@pwfisher.com>",
+  "license": "MIT",
   "dependencies": {
     "express": "*",
     "node-fetch": "*",
```

shopify-scraper/package.json

Lines changed: 25 additions & 0 deletions (new file)

```json
{
  "name": "explodingdog-shopify-scraper",
  "version": "1.0.0",
  "private": true,
  "description": "Scrape buildingaworld.com",
  "author": "Patrick Fisher <patrick@pwfisher.com>",
  "license": "MIT",
  "type": "module",
  "bin": {
    "scraper": "scraper.ts"
  },
  "scripts": {
    "scrape": "node --loader ts-node/esm.mjs --experimental-top-level-await scraper.ts"
  },
  "dependencies": {
    "got": "^11.8.1",
    "ts-node": "^9.1.1",
    "typescript": "^4.1.3",
    "xml2js": "^0.4.23"
  },
  "devDependencies": {
    "@types/node": "^14.14.22",
    "@types/xml2js": "^0.4.8"
  }
}
```

shopify-scraper/readme.md

Lines changed: 9 additions & 0 deletions (new file)

````markdown
# buildingaworld.com scraper

Run with

```zsh
yarn scrape
```

Script will update `./scraped-products` and `./scraped-images`.
````
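The scraper in the next file builds per-year lists of `Drawing` records and imports the `Drawing` type from `../explorer/types`, which is not part of this commit. Purely as a reading aid, here is a hypothetical sketch of that shape, inferred from the object literal constructed in `scraper.ts`; the canonical definition lives in the explorer package and may differ.

```ts
// Hypothetical sketch only: inferred from the object literal built in
// scraper.ts below. The real type is exported from ../explorer/types.
export type Drawing = {
  id: string     // `${year}.${number}`, e.g. '2020.1'
  year: string   // four-digit year sliced from the sitemap lastmod
  number: string // 1-based position within that year's list, as a string
  date: string   // 'YYYY-MM-DD' slice of lastmod
  title: string  // <image:title> from the product sitemap
  slug: string   // path segment after /products/
  image: string  // 'shopify/<image file name>'
}
```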

shopify-scraper/scraper.ts

Lines changed: 76 additions & 0 deletions (new file)

```ts
#!/usr/bin/env ts-node-script
import got from 'got'
import { parseStringPromise } from 'xml2js'
import { Drawing } from '../explorer/types'
import fs from 'fs'
import stream from 'stream'
import { promisify } from 'util'

const pipeline = promisify(stream.pipeline)

const baseSitemapUrl = 'https://www.buildingaworld.com/sitemap.xml'
const yearDrawingLists: Record<string, Drawing[]> = {}

type LocUrl = {
  loc: string // e.g. 'https://www.buildingaworld.com/products/crazy-monster-takes-a-walk'
  lastmod: string // e.g. '2020-12-16T15:31:48-05:00'
  changeFreq: string[]
  'image:image': LocImage[]
}

type LocImage = {
  'image:loc': string[]
  'image:title': string[]
}

try {
  const response = await got(baseSitemapUrl)
  const data = await parseStringPromise(response.body)
  const productSitemapUrls: string[] = data.sitemapindex.sitemap
    .map((o: any) => o.loc[0])
    .filter((s: string) => s.match(/sitemap_products_/))

  await productSitemapUrls.forEach(async (url) => {
    const response = await got(url)
    const data = await parseStringPromise(response.body)

    console.log('First 5 product locations:', data.urlset.url.slice(0, 5))

    data.urlset.url
      .filter((o: any) => o['image:image'] && !o.loc.includes('/products/any-drawing'))
      .forEach(async (o: LocUrl) => {
        const year = o.lastmod.slice(0, 4)
        if (!yearDrawingLists[year]) yearDrawingLists[year] = []

        const number: string = (yearDrawingLists[year].length + 1).toString()
        const title = o['image:image'][0]['image:title'][0]
        const slugMatch = o.loc[0].match(/\/products\/(.*)/)
        const slug = slugMatch?.[1] ?? ''

        const src = o['image:image'][0]['image:loc'][0]
        const srcMatch = src.match(/([^/]+)\?/)
        const srcBase = srcMatch?.[1]
        const srcPath = `./scraped-images/${srcBase}`;

        if (fs.existsSync(srcPath)) {
          console.log(`Skipping ${srcBase}, found ${srcPath}`);
        } else {
          await pipeline(got.stream(src), fs.createWriteStream(srcPath))
        }

        const drawing: Drawing = {
          id: `${year}.${number}`,
          year,
          number,
          date: o.lastmod.slice(0, 10),
          title,
          slug,
          image: `shopify/${srcBase}`,
        }
        yearDrawingLists[year].push(drawing)
      })
  })
  console.log('yearDrawingLists:', yearDrawingLists)
} catch (error) {
  console.log(error?.response?.body || error);
}
```
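One thing worth flagging when reading this script: `await productSitemapUrls.forEach(...)` does not actually wait, because `Array.prototype.forEach` returns `undefined`, so the final `yearDrawingLists` log can fire before the sitemap pages are parsed and the images written. Below is a hedged sketch, not part of the commit, of how the outer loop could be made sequential with `for...of` while keeping the same `got` and `parseStringPromise` calls; it also indexes `loc[0]` in the exclusion filter, on the assumption (consistent with `o.loc[0].match(...)` a few lines later) that xml2js wraps each element in an array.

```ts
// Possible follow-up (not in this commit): process product sitemaps
// sequentially so yearDrawingLists is only logged once everything is done.
for (const url of productSitemapUrls) {
  const sitemapResponse = await got(url)
  const sitemap = await parseStringPromise(sitemapResponse.body)

  const productUrls: LocUrl[] = sitemap.urlset.url.filter(
    // loc is an array under xml2js, so compare against its first element
    (o: any) => o['image:image'] && !o.loc[0].includes('/products/any-drawing'),
  )

  for (const o of productUrls) {
    // ...same per-product handling as in the forEach callback above...
  }
}
console.log('yearDrawingLists:', yearDrawingLists)
```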

shopify-scraper/tsconfig.json

Lines changed: 22 additions & 0 deletions (new file)

```json
{
  "compilerOptions": {
    "alwaysStrict": true,
    "esModuleInterop": true,
    "forceConsistentCasingInFileNames": true,
    "isolatedModules": true,
    "jsx": "preserve",
    "lib": ["es2020"],
    "module": "esnext",
    "moduleResolution": "node",
    "noEmit": true,
    "noFallthroughCasesInSwitch": true,
    "noUnusedLocals": true,
    "noUnusedParameters": true,
    "resolveJsonModule": true,
    "skipLibCheck": true,
    "strict": true,
    "target": "es2020"
  },
  "exclude": ["node_modules"],
  "include": ["**/*.ts"]
}
```
