Commit ccd6e16

add: new practice, 09 needs to be fixed
1 parent 454510b commit ccd6e16

8 files changed: +1410 -0 lines changed

practice/02_file_crawling.ipynb

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exercise\n",
    "\n",
    "- Inspect http://exam.lib.ntu.edu.tw/graduate and write a crawler for it\n",
    "- Send the request with a User-Agent header\n",
    "- Download every past-exam PDF file on the page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "import re\n",
    "import os\n",
    "\n",
    "from PIL import Image\n",
    "from bs4 import BeautifulSoup\n",
    "from fake_useragent import UserAgent\n",
    "from urllib.parse import urljoin\n",
    "from pprint import pprint\n",
    "\n",
    "url = 'http://exam.lib.ntu.edu.tw/graduate'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "fu = UserAgent()\n",
    "headers = {'User-Agent': fu.random}\n",
    "resp = requests.get(url, headers=headers)\n",
    "soup = BeautifulSoup(resp.text, 'lxml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1/30) catch the filename 106_graduate_4.pdf\n",
      "(1/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_4.pdf\n",
      "(2/30) catch the filename 106_graduate_6.pdf\n",
      "(2/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_6.pdf\n",
      "(3/30) catch the filename 106_graduate_3.pdf\n",
      "(3/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_3.pdf\n",
      "(4/30) catch the filename 106_graduate_1.pdf\n",
      "(4/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_1.pdf\n",
      "(5/30) catch the filename 106_graduate_2.pdf\n",
      "(5/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_2.pdf\n",
      "(6/30) catch the filename 106_graduate_8.pdf\n",
      "(6/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n",
      "(7/30) catch the filename 106_graduate_5.pdf\n",
      "(7/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_5.pdf\n",
      "(8/30) catch the filename 106_graduate_10.pdf\n",
      "(8/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_10.pdf\n",
      "(9/30) catch the filename 106_graduate_7.pdf\n",
      "(9/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_7.pdf\n",
      "(10/30) catch the filename 106_graduate_11.pdf\n",
      "(10/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_11.pdf\n",
      "(11/30) catch the filename 106_graduate_13.pdf\n",
      "(11/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_13.pdf\n",
      "(12/30) catch the filename 106_graduate_15.pdf\n",
      "(12/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_15.pdf\n",
      "(13/30) catch the filename 106_graduate_14.pdf\n",
      "(13/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_14.pdf\n",
      "(14/30) catch the filename 106_graduate_8.pdf\n",
      "(14/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n",
      "(15/30) catch the filename 106_graduate_5.pdf\n",
      "(15/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_5.pdf\n",
      "(16/30) catch the filename 106_graduate_16.pdf\n",
      "(16/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_16.pdf\n",
      "(17/30) catch the filename 106_graduate_17.pdf\n",
      "(17/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_17.pdf\n",
      "(18/30) catch the filename 106_graduate_18.pdf\n",
      "(18/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_18.pdf\n",
      "(19/30) catch the filename 106_graduate_19.pdf\n",
      "(19/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_19.pdf\n",
      "(20/30) catch the filename 106_graduate_17.pdf\n",
      "(20/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_17.pdf\n",
      "(21/30) catch the filename 106_graduate_20.pdf\n",
      "(21/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_20.pdf\n",
      "(22/30) catch the filename 106_graduate_22.pdf\n",
      "(22/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_22.pdf\n",
      "(23/30) catch the filename 106_graduate_21.pdf\n",
      "(23/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_21.pdf\n",
      "(24/30) catch the filename 106_graduate_8.pdf\n",
      "(24/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n",
      "(25/30) catch the filename 106_graduate_25.pdf\n",
      "(25/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_25.pdf\n",
      "(26/30) catch the filename 106_graduate_23.pdf\n",
      "(26/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_23.pdf\n",
      "(27/30) catch the filename 106_graduate_24.pdf\n",
      "(27/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_24.pdf\n",
      "(28/30) catch the filename 106_graduate_8.pdf\n",
      "(28/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n",
      "(29/30) catch the filename 106_graduate_26.pdf\n",
      "(29/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_26.pdf\n",
      "(30/30) catch the filename 106_graduate_28.pdf\n",
      "(30/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_28.pdf\n"
     ]
    }
   ],
   "source": [
    "results = os.path.abspath('../results')\n",
    "if not os.path.exists(results):\n",
    "    os.makedirs(results)\n",
    "\n",
    "pdfs = soup.find_all('img', class_=re.compile('.*field-icon-application-pdf$'))\n",
    "for i, pdf in enumerate(pdfs):\n",
    "    href = pdf.parent['href']\n",
    "    abs_href = urljoin(resp.url, href)\n",
    "    file_resp = requests.get(abs_href, headers=headers, stream=True)\n",
    "\n",
    "    filename = os.path.basename(abs_href)\n",
    "    filename = filename.split('&')[0]\n",
    "    print('({}/{}) catch the filename {}'.format(i + 1, len(pdfs), filename))\n",
    "    filename = os.path.join(results, filename)\n",
    "\n",
    "    with open(filename, 'wb') as f:\n",
    "        for chunk in file_resp.iter_content(2048):\n",
    "            f.write(chunk)\n",
    "    print('({}/{}) save file {}'.format(i + 1, len(pdfs), filename))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
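The run log above shows a few filenames (for example 106_graduate_8.pdf) being fetched and written more than once, because several links on the page resolve to the same PDF. A minimal standalone sketch of the same download approach that skips files it has already saved might look like the following; the static User-Agent string and the output folder are assumptions standing in for the notebook's fake_useragent and ../results setup, not part of the original solution.

```python
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

URL = 'http://exam.lib.ntu.edu.tw/graduate'
RESULTS = os.path.abspath('../results')     # assumed output folder, same as the notebook
HEADERS = {'User-Agent': 'Mozilla/5.0'}     # assumed static UA; the notebook uses fake_useragent

os.makedirs(RESULTS, exist_ok=True)

resp = requests.get(URL, headers=HEADERS)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'lxml')

# Same selector as the notebook: each PDF link wraps an <img> whose
# class ends with "field-icon-application-pdf".
pdf_icons = soup.find_all('img', class_=re.compile(r'.*field-icon-application-pdf$'))

seen = set()
for i, icon in enumerate(pdf_icons, start=1):
    href = urljoin(resp.url, icon.parent['href'])
    filename = os.path.basename(href).split('&')[0]
    if filename in seen:                    # several links point to the same file
        print('({}/{}) skip duplicate {}'.format(i, len(pdf_icons), filename))
        continue
    seen.add(filename)

    file_resp = requests.get(href, headers=HEADERS, stream=True)
    file_resp.raise_for_status()
    path = os.path.join(RESULTS, filename)
    with open(path, 'wb') as f:
        for chunk in file_resp.iter_content(2048):
            f.write(chunk)
    print('({}/{}) save file {}'.format(i, len(pdf_icons), path))
```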

practice/03_website_crawling.ipynb

Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exercise\n",
    "\n",
    "- Inspect https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html and write a crawler for it\n",
    "- Send the request with a User-Agent header\n",
    "- Collect the title of every page on the site"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "import re\n",
    "import os\n",
    "\n",
    "from PIL import Image\n",
    "from bs4 import BeautifulSoup\n",
    "from fake_useragent import UserAgent\n",
    "from urllib.parse import urljoin\n",
    "from pprint import pprint\n",
    "\n",
    "url = 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "fu = UserAgent()\n",
    "headers = {'User-Agent': fu.random}\n",
    "resp = requests.get(url, headers=headers)\n",
    "soup = BeautifulSoup(resp.text, 'lxml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "wait_list = []\n",
    "view_list = []\n",
    "links = soup.find_all('a')\n",
    "links = [link['href'] for link in links]\n",
    "links = [urljoin(resp.url, link) for link in links]\n",
    "links = list(set(links))\n",
    "wait_list += links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html\n",
      "wait list:\n",
      "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html']\n",
      "view list:\n",
      "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html']\n",
      "all text:\n",
      "['Man must explore, and this is exploration at its greatest']\n",
      "=======================================================================================\n",
      "https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html\n",
      "wait list:\n",
      "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html']\n",
      "view list:\n",
      "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html']\n",
      "all text:\n",
      "['Man must explore, and this is exploration at its greatest', 'About Me']\n",
      "=======================================================================================\n",
      "https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html\n",
      "wait list:\n",
      "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html']\n",
      "view list:\n",
      "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html']\n",
      "all text:\n",
      "['Man must explore, and this is exploration at its greatest',\n",
      " 'About Me',\n",
      " 'Contact Me']\n",
      "=======================================================================================\n",
      "https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html\n",
      "wait list:\n",
      "[]\n",
      "view list:\n",
      "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html']\n",
      "all text:\n",
      "['Man must explore, and this is exploration at its greatest',\n",
      " 'About Me',\n",
      " 'Contact Me',\n",
      " 'Clean Blog']\n",
      "=======================================================================================\n"
     ]
    }
   ],
   "source": [
    "all_h1_text = []\n",
    "\n",
    "while wait_list:\n",
    "\n",
    "    link = wait_list.pop()\n",
    "    if link in view_list:\n",
    "        continue\n",
    "\n",
    "    print(link)\n",
    "    view_list.append(link)\n",
    "\n",
    "    page_resp = requests.get(link, headers=headers)\n",
    "    page_soup = BeautifulSoup(page_resp.text, 'lxml')\n",
    "\n",
    "    # get the h1 tags on the current page\n",
    "    h1s = page_soup.find_all('h1')\n",
    "    h1s = [h1.text for h1 in h1s]\n",
    "    all_h1_text += h1s\n",
    "\n",
    "    # search for new links on the current page\n",
    "    links = page_soup.find_all('a')\n",
    "    links = [link['href'] for link in links]\n",
    "    links = [urljoin(page_resp.url, link) for link in links]\n",
    "    links = list(filter(lambda x: x not in view_list, links))\n",
    "    wait_list += links\n",
    "    wait_list = list(set(wait_list))\n",
    "    print('wait list:')\n",
    "    pprint(wait_list)\n",
    "    print('view list:')\n",
    "    pprint(view_list)\n",
    "    print('all text:')\n",
    "    pprint(all_h1_text)\n",
    "    print('=' * 87)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
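The crawl above keeps two plain Python lists, wait_list and view_list, and deduplicates with list(set(...)) on every pass. A minimal sketch of the same traversal using a deque for the frontier and a set for visited pages is shown below; the static User-Agent string and the same-site guard are assumptions added for illustration, not taken from the notebook.

```python
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

START = 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html'
HEADERS = {'User-Agent': 'Mozilla/5.0'}  # assumed static UA; the notebook draws a random one from fake_useragent

queue = deque([START])   # frontier of pages to crawl (the notebook's wait_list)
visited = set()          # pages already crawled (the notebook's view_list)
all_h1_text = []

while queue:
    link = queue.popleft()
    if link in visited:
        continue
    visited.add(link)

    resp = requests.get(link, headers=HEADERS)
    soup = BeautifulSoup(resp.text, 'lxml')

    # collect every <h1> on the current page
    all_h1_text += [h1.get_text(strip=True) for h1 in soup.find_all('h1')]

    # enqueue unseen links; the same-site check is an extra guard not in the notebook
    for a in soup.find_all('a', href=True):
        target = urljoin(resp.url, a['href'])
        if urlparse(target).netloc == urlparse(START).netloc and target not in visited:
            queue.append(target)

print(all_h1_text)
```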

0 commit comments