Skip to content

Commit 43616f1

Browse files
committed
Adding html scraper block.
1 parent 348759f commit 43616f1

File tree

5 files changed

+70
-1
lines changed

5 files changed

+70
-1
lines changed

blocks/scrape-url-html/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Scrape HTML
2+
3+
Requires a free account with
4+
[Scraper API](https://www.scraperapi.com/?fp_ref=hudson) (includes 1000
5+
requests/mo, sessions, and an unbelievable affiliate program), and setting the
6+
following two `.env` variables: `SCRAPER_API_KEY` and `SCRAPER_API_ORIGIN`.

blocks/scrape-url-html/handler.js

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
const axios = require('axios')
2+
3+
const { SCRAPER_API_ORIGIN, SCRAPER_API_KEY } = process.env
4+
5+
/**
 * Fetch a page through Scraper API and return its response body.
 *
 * This function never rejects: every failure (bad input, missing
 * configuration, network/HTTP error) is reported through the `error` field of
 * the resolved value, so callers only have to inspect the result shape.
 *
 * @param {object} inputs
 * @param {string} inputs.url - Address of the page to scrape.
 * @param {number} [inputs.session] - Forwarded to the API as `session_number`.
 * @returns {Promise<{html: *}|{error: {message: string, stack: string}}>}
 *   `html` is the response body exactly as axios delivers it (`res.data`).
 */
const handler = async inputs => {
  try {
    // Destructure inside the try so a nullish `inputs` is reported as an
    // `error` result instead of escaping as a rejected promise.
    const { url, session } = inputs

    if (typeof url !== 'string' || url.trim() === '') {
      throw new Error('A non-empty `url` input is required.')
    }
    if (!SCRAPER_API_ORIGIN || !SCRAPER_API_KEY) {
      throw new Error(
        'SCRAPER_API_ORIGIN and SCRAPER_API_KEY must both be set.'
      )
    }

    const res = await axios.get(SCRAPER_API_ORIGIN, {
      params: {
        api_key: SCRAPER_API_KEY,
        url,
        session_number: session
      }
    })

    // Only an exact 200 counts as success; the status is kept as the error
    // message, matching what callers have seen so far.
    if (res.status !== 200) {
      throw new Error(String(res.status))
    }

    return { html: res.data }
  } catch (err) {
    const { message, stack } = err
    return {
      error: { message, stack }
    }
  }
}
31+
32+
module.exports = handler

blocks/scrape-url-html/index.js

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
const { Block } = require('node-webpipe')
2+
const handler = require('./handler')
3+
4+
// Registers the "Scrape URL HTML" block: declares its inputs/outputs and wires
// the scraping handler into the block's callback-style `handle` hook.
new Block()
  .name('Scrape URL HTML')
  .description('Retrieve the fully rendered HTML content via Scraper API.')
  .input('url', 'string', 'A url to scrape.')
  .input(
    'session',
    'number',
    'Any random integer. Allows you to continue using the same proxy for each request with that session number. Send a new integer to create a new session.'
  )
  .output('html', 'string', 'The raw HTML from the requested URL.')
  .handle(async (inputs, cb) => {
    // @TODO: This handler should be the main responsibility in this file,
    // handle definition in export like lambda-api, or elsewhere
    let failure = null
    let result

    try {
      result = await handler(inputs)
    } catch (err) {
      // Guard against a rejected handler so `cb` is always invoked.
      failure = err
    }

    // The handler reports scrape failures as `{ error: { message, stack } }`;
    // surface them instead of answering with `html: undefined`.
    if (!failure && result && result.error) {
      failure = new Error(result.error.message)
      failure.stack = result.error.stack
    }

    if (failure) {
      // NOTE(review): assumes `cb` follows the Node error-first convention
      // (suggested by the `cb(null, outputs)` success call) — confirm against
      // node-webpipe.
      cb(failure)
      return
    }

    cb(null, { html: result.html })
  })
  .listen()
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"name": "scrape-url-html",
3+
"version": "1.0.0",
4+
"dependencies": {
5+
"axios": "^0.19.0"
6+
}
7+
}

now.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33
"version": 2,
44
"builds": [
55
{ "src": "blocks/calculate-square-root/*.js", "use": "@now/node" },
6+
{ "src": "blocks/parse-url/*.js", "use": "@now/node" },
67
{ "src": "blocks/parse-rss/*.js", "use": "@now/node" },
78
{ "src": "blocks/convert-markdown-to-html/*.js", "use": "@now/node" },
89
{ "src": "blocks/pkg-news-api/*.js", "use": "@now/node" },
10+
{ "src": "blocks/scrape-url-html/*.js", "use": "@now/node" },
911
{ "src": "blocks/upload-url-to-s3/*.js", "use": "@now/node" },
1012
{ "src": "blocks/inspect-hostname-dns/*.js", "use": "@now/node" }
1113
],
@@ -20,6 +22,8 @@
2022
"AWS_S3_ACCESS_KEY": "@aws_s3_access_key",
2123
"AWS_S3_SECRET_KEY": "@aws_s3_secret_key",
2224
"AWS_S3_BUCKET": "@aws_s3_bucket",
23-
"API_KEY_NEWSAPI": "api_key_newsapi"
25+
"API_KEY_NEWSAPI": "@api_key_newsapi",
26+
"SCRAPER_API_ORIGIN": "@scraper_api_origin",
27+
"SCRAPER_API_KEY": "@scraper_api_key"
2428
}
2529
}

0 commit comments

Comments
 (0)