Commit ccd6e16

add: new practice, 09 needs to be fixed
1 parent 454510b commit ccd6e16

8 files changed: +1410 -0 lines changed

practice/02_file_crawling.ipynb

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exercise\n",
    "\n",
    "- Inspect http://exam.lib.ntu.edu.tw/graduate and write a crawler for it\n",
    "- Send the request with a User-Agent header\n",
    "- Download every past-exam PDF file on the page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "import re\n",
    "import os\n",
    "\n",
    "from PIL import Image\n",
    "from bs4 import BeautifulSoup\n",
    "from fake_useragent import UserAgent\n",
    "from urllib.parse import urljoin\n",
    "from pprint import pprint\n",
    "\n",
    "url = 'http://exam.lib.ntu.edu.tw/graduate'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "fu = UserAgent()\n",
    "headers = {'User-Agent': fu.random}\n",
    "resp = requests.get(url, headers=headers)\n",
    "soup = BeautifulSoup(resp.text, 'lxml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1/30) catch the filename 106_graduate_4.pdf\n",
      "(1/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_4.pdf\n",
      "(2/30) catch the filename 106_graduate_6.pdf\n",
      "(2/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_6.pdf\n",
      "(3/30) catch the filename 106_graduate_3.pdf\n",
      "(3/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_3.pdf\n",
      "(4/30) catch the filename 106_graduate_1.pdf\n",
      "(4/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_1.pdf\n",
      "(5/30) catch the filename 106_graduate_2.pdf\n",
      "(5/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_2.pdf\n",
      "(6/30) catch the filename 106_graduate_8.pdf\n",
      "(6/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n",
      "(7/30) catch the filename 106_graduate_5.pdf\n",
      "(7/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_5.pdf\n",
      "(8/30) catch the filename 106_graduate_10.pdf\n",
      "(8/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_10.pdf\n",
      "(9/30) catch the filename 106_graduate_7.pdf\n",
      "(9/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_7.pdf\n",
      "(10/30) catch the filename 106_graduate_11.pdf\n",
      "(10/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_11.pdf\n",
      "(11/30) catch the filename 106_graduate_13.pdf\n",
      "(11/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_13.pdf\n",
      "(12/30) catch the filename 106_graduate_15.pdf\n",
      "(12/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_15.pdf\n",
      "(13/30) catch the filename 106_graduate_14.pdf\n",
      "(13/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_14.pdf\n",
      "(14/30) catch the filename 106_graduate_8.pdf\n",
      "(14/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n",
      "(15/30) catch the filename 106_graduate_5.pdf\n",
      "(15/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_5.pdf\n",
      "(16/30) catch the filename 106_graduate_16.pdf\n",
      "(16/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_16.pdf\n",
      "(17/30) catch the filename 106_graduate_17.pdf\n",
      "(17/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_17.pdf\n",
      "(18/30) catch the filename 106_graduate_18.pdf\n",
      "(18/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_18.pdf\n",
      "(19/30) catch the filename 106_graduate_19.pdf\n",
      "(19/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_19.pdf\n",
      "(20/30) catch the filename 106_graduate_17.pdf\n",
      "(20/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_17.pdf\n",
      "(21/30) catch the filename 106_graduate_20.pdf\n",
      "(21/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_20.pdf\n",
      "(22/30) catch the filename 106_graduate_22.pdf\n",
      "(22/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_22.pdf\n",
      "(23/30) catch the filename 106_graduate_21.pdf\n",
      "(23/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_21.pdf\n",
      "(24/30) catch the filename 106_graduate_8.pdf\n",
      "(24/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n",
      "(25/30) catch the filename 106_graduate_25.pdf\n",
      "(25/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_25.pdf\n",
      "(26/30) catch the filename 106_graduate_23.pdf\n",
      "(26/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_23.pdf\n",
      "(27/30) catch the filename 106_graduate_24.pdf\n",
      "(27/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_24.pdf\n",
      "(28/30) catch the filename 106_graduate_8.pdf\n",
      "(28/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n",
      "(29/30) catch the filename 106_graduate_26.pdf\n",
      "(29/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_26.pdf\n",
      "(30/30) catch the filename 106_graduate_28.pdf\n",
      "(30/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_28.pdf\n"
     ]
    }
   ],
   "source": [
    "results = os.path.abspath('../results')\n",
    "if not os.path.exists(results):\n",
    "    os.makedirs(results)\n",
    "\n",
    "pdfs = soup.find_all('img', class_=re.compile('.*field-icon-application-pdf$'))\n",
    "for i, pdf in enumerate(pdfs):\n",
    "    href = pdf.parent['href']\n",
    "    abs_href = urljoin(resp.url, href)\n",
    "    file_resp = requests.get(abs_href, headers=headers, stream=True)\n",
    "\n",
    "    filename = os.path.basename(abs_href)\n",
    "    filename = filename.split('&')[0]\n",
    "    print('({}/{}) catch the filename {}'.format(i + 1, len(pdfs), filename))\n",
    "    filename = os.path.join(results, filename)\n",
    "\n",
    "    with open(filename, 'wb') as f:\n",
    "        for chunk in file_resp.iter_content(2048):\n",
    "            f.write(chunk)\n",
    "    print('({}/{}) save file {}'.format(i + 1, len(pdfs), filename))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
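The run log above shows a few filenames (for example 106_graduate_8.pdf) being fetched and written more than once, because several links on the page resolve to the same PDF. A minimal standalone sketch of the same download approach that skips files it has already saved might look like the following; the static User-Agent string and the output folder are assumptions standing in for the notebook's fake_useragent and ../results setup, not part of the original solution.

```python
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

URL = 'http://exam.lib.ntu.edu.tw/graduate'
RESULTS = os.path.abspath('../results')     # assumed output folder, same as the notebook
HEADERS = {'User-Agent': 'Mozilla/5.0'}     # assumed static UA; the notebook uses fake_useragent

os.makedirs(RESULTS, exist_ok=True)

resp = requests.get(URL, headers=HEADERS)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'lxml')

# Same selector as the notebook: each PDF link wraps an <img> whose
# class ends with "field-icon-application-pdf".
pdf_icons = soup.find_all('img', class_=re.compile(r'.*field-icon-application-pdf$'))

seen = set()
for i, icon in enumerate(pdf_icons, start=1):
    href = urljoin(resp.url, icon.parent['href'])
    filename = os.path.basename(href).split('&')[0]
    if filename in seen:                    # several links point to the same file
        print('({}/{}) skip duplicate {}'.format(i, len(pdf_icons), filename))
        continue
    seen.add(filename)

    file_resp = requests.get(href, headers=HEADERS, stream=True)
    file_resp.raise_for_status()
    path = os.path.join(RESULTS, filename)
    with open(path, 'wb') as f:
        for chunk in file_resp.iter_content(2048):
            f.write(chunk)
    print('({}/{}) save file {}'.format(i, len(pdf_icons), path))
```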

practice/03_website_crawling.ipynb

Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exercise\n",
    "\n",
    "- Inspect https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html and write a crawler for it\n",
    "- Send the request with a User-Agent header\n",
    "- Collect the title of every page on the site"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "import re\n",
    "import os\n",
    "\n",
    "from PIL import Image\n",
    "from bs4 import BeautifulSoup\n",
    "from fake_useragent import UserAgent\n",
    "from urllib.parse import urljoin\n",
    "from pprint import pprint\n",
    "\n",
    "url = 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "fu = UserAgent()\n",
    "headers = {'User-Agent': fu.random}\n",
    "resp = requests.get(url, headers=headers)\n",
    "soup = BeautifulSoup(resp.text, 'lxml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "wait_list = []\n",
    "view_list = []\n",
    "links = soup.find_all('a')\n",
    "links = [link['href'] for link in links]\n",
    "links = [urljoin(resp.url, link) for link in links]\n",
    "links = list(set(links))\n",
    "wait_list += links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html\n",
      "wait list:\n",
      "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html']\n",
      "view list:\n",
      "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html']\n",
      "all text:\n",
      "['Man must explore, and this is exploration at its greatest']\n",
      "=======================================================================================\n",
      "https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html\n",
      "wait list:\n",
      "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html']\n",
      "view list:\n",
      "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html']\n",
      "all text:\n",
      "['Man must explore, and this is exploration at its greatest', 'About Me']\n",
      "=======================================================================================\n",
      "https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html\n",
      "wait list:\n",
      "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html']\n",
      "view list:\n",
      "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html']\n",
      "all text:\n",
      "['Man must explore, and this is exploration at its greatest',\n",
      " 'About Me',\n",
      " 'Contact Me']\n",
      "=======================================================================================\n",
      "https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html\n",
      "wait list:\n",
      "[]\n",
      "view list:\n",
      "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html',\n",
      " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html']\n",
      "all text:\n",
      "['Man must explore, and this is exploration at its greatest',\n",
      " 'About Me',\n",
      " 'Contact Me',\n",
      " 'Clean Blog']\n",
      "=======================================================================================\n"
     ]
    }
   ],
   "source": [
    "all_h1_text = []\n",
    "\n",
    "while wait_list:\n",
    "\n",
    "    link = wait_list.pop()\n",
    "    if link in view_list:\n",
    "        continue\n",
    "\n",
    "    print(link)\n",
    "    view_list.append(link)\n",
    "\n",
    "    page_resp = requests.get(link, headers=headers)\n",
    "    page_soup = BeautifulSoup(page_resp.text, 'lxml')\n",
    "\n",
    "    # get the h1 tags on the current page\n",
    "    h1s = page_soup.find_all('h1')\n",
    "    h1s = [h1.text for h1 in h1s]\n",
    "    all_h1_text += h1s\n",
    "\n",
    "    # search for new links on the current page\n",
    "    links = page_soup.find_all('a')\n",
    "    links = [link['href'] for link in links]\n",
    "    links = [urljoin(page_resp.url, link) for link in links]\n",
    "    links = list(filter(lambda x: x not in view_list, links))\n",
    "    wait_list += links\n",
    "    wait_list = list(set(wait_list))\n",
    "    print('wait list:')\n",
    "    pprint(wait_list)\n",
    "    print('view list:')\n",
    "    pprint(view_list)\n",
    "    print('all text:')\n",
    "    pprint(all_h1_text)\n",
    "    print('=' * 87)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
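The crawl above keeps two plain Python lists, wait_list and view_list, and deduplicates with list(set(...)) on every pass. A minimal sketch of the same traversal using a deque for the frontier and a set for visited pages is shown below; the static User-Agent string and the same-site guard are assumptions added for illustration, not taken from the notebook.

```python
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

START = 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html'
HEADERS = {'User-Agent': 'Mozilla/5.0'}  # assumed static UA; the notebook draws a random one from fake_useragent

queue = deque([START])   # frontier of pages to crawl (the notebook's wait_list)
visited = set()          # pages already crawled (the notebook's view_list)
all_h1_text = []

while queue:
    link = queue.popleft()
    if link in visited:
        continue
    visited.add(link)

    resp = requests.get(link, headers=HEADERS)
    soup = BeautifulSoup(resp.text, 'lxml')

    # collect every <h1> on the current page
    all_h1_text += [h1.get_text(strip=True) for h1 in soup.find_all('h1')]

    # enqueue unseen links; the same-site check is an extra guard not in the notebook
    for a in soup.find_all('a', href=True):
        target = urljoin(resp.url, a['href'])
        if urlparse(target).netloc == urlparse(START).netloc and target not in visited:
            queue.append(target)

print(all_h1_text)
```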

0 commit comments