|
16 | 16 | { |
17 | 17 | "cell_type": "code", |
18 | 18 | "execution_count": 1, |
19 | | - "metadata": { |
20 | | - "collapsed": true |
21 | | - }, |
| 19 | + "metadata": {}, |
22 | 20 | "outputs": [], |
23 | 21 | "source": [ |
24 | 22 | "import os\n", |
|
34 | 32 | { |
35 | 33 | "cell_type": "code", |
36 | 34 | "execution_count": 2, |
37 | | - "metadata": { |
38 | | - "collapsed": false |
39 | | - }, |
| 35 | + "metadata": {}, |
40 | 36 | "outputs": [ |
41 | 37 | { |
42 | 38 | "name": "stdout", |
43 | 39 | "output_type": "stream", |
44 | 40 | "text": [ |
45 | | - "The date after one week - 2018/02/09\n" |
| 41 | + "The date after one week - 2018/02/28\n" |
46 | 42 | ] |
47 | 43 | } |
48 | 44 | ], |
|
65 | 61 | { |
66 | 62 | "cell_type": "code", |
67 | 63 | "execution_count": 3, |
68 | | - "metadata": { |
69 | | - "collapsed": true |
70 | | - }, |
| 64 | + "metadata": {}, |
71 | 65 | "outputs": [], |
72 | 66 | "source": [ |
73 | 67 | "resp = requests.post(url, data=form_data)\n", |
|
78 | 72 | { |
79 | 73 | "cell_type": "code", |
80 | 74 | "execution_count": 4, |
81 | | - "metadata": { |
82 | | - "collapsed": false |
83 | | - }, |
| 75 | + "metadata": {}, |
84 | 76 | "outputs": [ |
85 | 77 | { |
86 | 78 | "data": { |
87 | 79 | "text/html": [ |
88 | 80 | "<div>\n", |
| 81 | + "<style scoped>\n", |
| 82 | + " .dataframe tbody tr th:only-of-type {\n", |
| 83 | + " vertical-align: middle;\n", |
| 84 | + " }\n", |
| 85 | + "\n", |
| 86 | + " .dataframe tbody tr th {\n", |
| 87 | + " vertical-align: top;\n", |
| 88 | + " }\n", |
| 89 | + "\n", |
| 90 | + " .dataframe thead th {\n", |
| 91 | + " text-align: right;\n", |
| 92 | + " }\n", |
| 93 | + "</style>\n", |
89 | 94 | "<table border=\"1\" class=\"dataframe\">\n", |
90 | 95 | " <thead>\n", |
91 | 96 | " <tr style=\"text-align: right;\">\n", |
|
104 | 109 | " <td>14:11</td>\n", |
105 | 110 | " <td>16:11</td>\n", |
106 | 111 | " <td>02:00</td>\n", |
107 | | - " <td>65折起</td>\n", |
108 | | - " </tr>\n", |
109 | | - " <tr>\n", |
110 | | - " <th>1</th>\n", |
111 | | - " <td>0833</td>\n", |
112 | | - " <td>14:11</td>\n", |
113 | | - " <td>16:11</td>\n", |
114 | | - " <td>02:00</td>\n", |
115 | | - " <td>65折起</td>\n", |
116 | | - " </tr>\n", |
117 | | - " <tr>\n", |
118 | | - " <th>2</th>\n", |
119 | | - " <td>1649</td>\n", |
120 | | - " <td>14:21</td>\n", |
121 | | - " <td>16:06</td>\n", |
122 | | - " <td>01:45</td>\n", |
123 | 112 | " <td>8折起</td>\n", |
124 | 113 | " </tr>\n", |
125 | 114 | " <tr>\n", |
126 | | - " <th>3</th>\n", |
127 | | - " <td>1649</td>\n", |
128 | | - " <td>14:21</td>\n", |
129 | | - " <td>16:06</td>\n", |
130 | | - " <td>01:45</td>\n", |
131 | | - " <td>8折起</td>\n", |
132 | | - " </tr>\n", |
133 | | - " <tr>\n", |
134 | | - " <th>4</th>\n", |
135 | | - " <td>0651</td>\n", |
136 | | - " <td>14:46</td>\n", |
137 | | - " <td>16:32</td>\n", |
138 | | - " <td>01:46</td>\n", |
139 | | - " <td></td>\n", |
140 | | - " </tr>\n", |
141 | | - " <tr>\n", |
142 | | - " <th>5</th>\n", |
| 115 | + " <th>1</th>\n", |
143 | 116 | " <td>0651</td>\n", |
144 | 117 | " <td>14:46</td>\n", |
145 | 118 | " <td>16:32</td>\n", |
146 | 119 | " <td>01:46</td>\n", |
147 | 120 | " <td></td>\n", |
148 | 121 | " </tr>\n", |
149 | 122 | " <tr>\n", |
150 | | - " <th>6</th>\n", |
151 | | - " <td>0837</td>\n", |
152 | | - " <td>15:11</td>\n", |
153 | | - " <td>17:11</td>\n", |
154 | | - " <td>02:00</td>\n", |
155 | | - " <td>65折起</td>\n", |
156 | | - " </tr>\n", |
157 | | - " <tr>\n", |
158 | | - " <th>7</th>\n", |
| 123 | + " <th>2</th>\n", |
159 | 124 | " <td>0837</td>\n", |
160 | 125 | " <td>15:11</td>\n", |
161 | 126 | " <td>17:11</td>\n", |
162 | 127 | " <td>02:00</td>\n", |
163 | | - " <td>65折起</td>\n", |
164 | | - " </tr>\n", |
165 | | - " <tr>\n", |
166 | | - " <th>8</th>\n", |
167 | | - " <td>1655</td>\n", |
168 | | - " <td>15:21</td>\n", |
169 | | - " <td>17:06</td>\n", |
170 | | - " <td>01:45</td>\n", |
171 | | - " <td>8折起</td>\n", |
172 | | - " </tr>\n", |
173 | | - " <tr>\n", |
174 | | - " <th>9</th>\n", |
175 | | - " <td>1655</td>\n", |
176 | | - " <td>15:21</td>\n", |
177 | | - " <td>17:06</td>\n", |
178 | | - " <td>01:45</td>\n", |
179 | | - " <td>8折起</td>\n", |
180 | | - " </tr>\n", |
181 | | - " <tr>\n", |
182 | | - " <th>10</th>\n", |
183 | | - " <td>0657</td>\n", |
184 | | - " <td>15:46</td>\n", |
185 | | - " <td>17:32</td>\n", |
186 | | - " <td>01:46</td>\n", |
187 | 128 | " <td>8折起</td>\n", |
188 | 129 | " </tr>\n", |
189 | 130 | " <tr>\n", |
190 | | - " <th>11</th>\n", |
| 131 | + " <th>3</th>\n", |
191 | 132 | " <td>0657</td>\n", |
192 | 133 | " <td>15:46</td>\n", |
193 | 134 | " <td>17:32</td>\n", |
194 | 135 | " <td>01:46</td>\n", |
195 | | - " <td>8折起</td>\n", |
196 | | - " </tr>\n", |
197 | | - " <tr>\n", |
198 | | - " <th>12</th>\n", |
199 | | - " <td>1237</td>\n", |
200 | | - " <td>15:51</td>\n", |
201 | | - " <td>17:17</td>\n", |
202 | | - " <td>01:26</td>\n", |
203 | | - " <td></td>\n", |
204 | | - " </tr>\n", |
205 | | - " <tr>\n", |
206 | | - " <th>13</th>\n", |
207 | | - " <td>1237</td>\n", |
208 | | - " <td>15:51</td>\n", |
209 | | - " <td>17:17</td>\n", |
210 | | - " <td>01:26</td>\n", |
211 | 136 | " <td></td>\n", |
212 | 137 | " </tr>\n", |
213 | 138 | " <tr>\n", |
214 | | - " <th>14</th>\n", |
215 | | - " <td>0841</td>\n", |
216 | | - " <td>16:11</td>\n", |
217 | | - " <td>18:11</td>\n", |
218 | | - " <td>02:00</td>\n", |
219 | | - " <td>8折起</td>\n", |
220 | | - " </tr>\n", |
221 | | - " <tr>\n", |
222 | | - " <th>15</th>\n", |
| 139 | + " <th>4</th>\n", |
223 | 140 | " <td>0841</td>\n", |
224 | 141 | " <td>16:11</td>\n", |
225 | 142 | " <td>18:11</td>\n", |
226 | 143 | " <td>02:00</td>\n", |
227 | | - " <td>8折起</td>\n", |
228 | | - " </tr>\n", |
229 | | - " <tr>\n", |
230 | | - " <th>16</th>\n", |
231 | | - " <td>0661</td>\n", |
232 | | - " <td>16:21</td>\n", |
233 | | - " <td>18:06</td>\n", |
234 | | - " <td>01:45</td>\n", |
235 | | - " <td></td>\n", |
| 144 | + " <td>65折起</td>\n", |
236 | 145 | " </tr>\n", |
237 | 146 | " <tr>\n", |
238 | | - " <th>17</th>\n", |
| 147 | + " <th>5</th>\n", |
239 | 148 | " <td>0661</td>\n", |
240 | 149 | " <td>16:21</td>\n", |
241 | 150 | " <td>18:06</td>\n", |
242 | 151 | " <td>01:45</td>\n", |
243 | | - " <td></td>\n", |
| 152 | + " <td>8折起</td>\n", |
244 | 153 | " </tr>\n", |
245 | 154 | " <tr>\n", |
246 | | - " <th>18</th>\n", |
| 155 | + " <th>6</th>\n", |
247 | 156 | " <td>0663</td>\n", |
248 | 157 | " <td>16:46</td>\n", |
249 | 158 | " <td>18:32</td>\n", |
250 | 159 | " <td>01:46</td>\n", |
251 | 160 | " <td></td>\n", |
252 | 161 | " </tr>\n", |
253 | 162 | " <tr>\n", |
254 | | - " <th>19</th>\n", |
255 | | - " <td>0663</td>\n", |
256 | | - " <td>16:46</td>\n", |
257 | | - " <td>18:32</td>\n", |
| 163 | + " <th>7</th>\n", |
| 164 | + " <td>0845</td>\n", |
| 165 | + " <td>17:11</td>\n", |
| 166 | + " <td>19:11</td>\n", |
| 167 | + " <td>02:00</td>\n", |
| 168 | + " <td>65折起</td>\n", |
| 169 | + " </tr>\n", |
| 170 | + " <tr>\n", |
| 171 | + " <th>8</th>\n", |
| 172 | + " <td>0667</td>\n", |
| 173 | + " <td>17:21</td>\n", |
| 174 | + " <td>19:06</td>\n", |
| 175 | + " <td>01:45</td>\n", |
| 176 | + " <td>8折起</td>\n", |
| 177 | + " </tr>\n", |
| 178 | + " <tr>\n", |
| 179 | + " <th>9</th>\n", |
| 180 | + " <td>0669</td>\n", |
| 181 | + " <td>17:46</td>\n", |
| 182 | + " <td>19:32</td>\n", |
258 | 183 | " <td>01:46</td>\n", |
259 | 184 | " <td></td>\n", |
260 | 185 | " </tr>\n", |
|
263 | 188 | "</div>" |
264 | 189 | ], |
265 | 190 | "text/plain": [ |
266 | | - " 車次 出發時間 抵達時間 行車時間 早鳥\n", |
267 | | - "0 0833 14:11 16:11 02:00 65折起\n", |
268 | | - "1 0833 14:11 16:11 02:00 65折起\n", |
269 | | - "2 1649 14:21 16:06 01:45 8折起\n", |
270 | | - "3 1649 14:21 16:06 01:45 8折起\n", |
271 | | - "4 0651 14:46 16:32 01:46 \n", |
272 | | - "5 0651 14:46 16:32 01:46 \n", |
273 | | - "6 0837 15:11 17:11 02:00 65折起\n", |
274 | | - "7 0837 15:11 17:11 02:00 65折起\n", |
275 | | - "8 1655 15:21 17:06 01:45 8折起\n", |
276 | | - "9 1655 15:21 17:06 01:45 8折起\n", |
277 | | - "10 0657 15:46 17:32 01:46 8折起\n", |
278 | | - "11 0657 15:46 17:32 01:46 8折起\n", |
279 | | - "12 1237 15:51 17:17 01:26 \n", |
280 | | - "13 1237 15:51 17:17 01:26 \n", |
281 | | - "14 0841 16:11 18:11 02:00 8折起\n", |
282 | | - "15 0841 16:11 18:11 02:00 8折起\n", |
283 | | - "16 0661 16:21 18:06 01:45 \n", |
284 | | - "17 0661 16:21 18:06 01:45 \n", |
285 | | - "18 0663 16:46 18:32 01:46 \n", |
286 | | - "19 0663 16:46 18:32 01:46 " |
| 191 | + " 車次 出發時間 抵達時間 行車時間 早鳥\n", |
| 192 | + "0 0833 14:11 16:11 02:00 8折起\n", |
| 193 | + "1 0651 14:46 16:32 01:46 \n", |
| 194 | + "2 0837 15:11 17:11 02:00 8折起\n", |
| 195 | + "3 0657 15:46 17:32 01:46 \n", |
| 196 | + "4 0841 16:11 18:11 02:00 65折起\n", |
| 197 | + "5 0661 16:21 18:06 01:45 8折起\n", |
| 198 | + "6 0663 16:46 18:32 01:46 \n", |
| 199 | + "7 0845 17:11 19:11 02:00 65折起\n", |
| 200 | + "8 0667 17:21 19:06 01:45 8折起\n", |
| 201 | + "9 0669 17:46 19:32 01:46 " |
287 | 202 | ] |
288 | 203 | }, |
289 | 204 | "execution_count": 4, |
|
292 | 207 | } |
293 | 208 | ], |
294 | 209 | "source": [ |
295 | | - "rows = soup.table.find_all('tr')\n", |
| 210 | + "rows = soup.table.find_all('tr', recursive=False)\n", |
296 | 211 | "\n", |
297 | 212 | "colname, rows = rows[1], rows[2:]\n", |
298 | 213 | "colname = list(colname.stripped_strings)\n", |
|
312 | 227 | " early_ticket = early_ticket[0] if early_ticket else ''\n", |
313 | 228 | " \n", |
314 | 229 | " rows[i] = [trips, t_departure, t_arrive, duration, early_ticket]\n", |
315 | | - " \n", |
| 230 | + "\n", |
316 | 231 | "df = pd.DataFrame(rows, columns=colname)\n", |
317 | 232 | "df" |
318 | 233 | ] |
319 | 234 | }, |
320 | 235 | { |
321 | 236 | "cell_type": "code", |
322 | 237 | "execution_count": 5, |
323 | | - "metadata": { |
324 | | - "collapsed": false |
325 | | - }, |
| 238 | + "metadata": {}, |
326 | 239 | "outputs": [ |
327 | 240 | { |
328 | 241 | "name": "stdout", |
329 | 242 | "output_type": "stream", |
330 | 243 | "text": [ |
331 | | - "Save csv to /home/afun/github/Python-Crawling-Tutorial/results/thsrc_20180209.csv\n" |
| 244 | + "Save csv to /home/afun/github/Python-Crawling-Tutorial/results/thsrc_20180228.csv\n" |
332 | 245 | ] |
333 | 246 | } |
334 | 247 | ], |
|
0 commit comments