|
16 | 16 | {
|
17 | 17 | "cell_type": "code",
|
18 | 18 | "execution_count": 1,
|
19 |
| - "metadata": { |
20 |
| - "collapsed": true |
21 |
| - }, |
| 19 | + "metadata": {}, |
22 | 20 | "outputs": [],
|
23 | 21 | "source": [
|
24 | 22 | "import os\n",
|
|
34 | 32 | {
|
35 | 33 | "cell_type": "code",
|
36 | 34 | "execution_count": 2,
|
37 |
| - "metadata": { |
38 |
| - "collapsed": false |
39 |
| - }, |
| 35 | + "metadata": {}, |
40 | 36 | "outputs": [
|
41 | 37 | {
|
42 | 38 | "name": "stdout",
|
43 | 39 | "output_type": "stream",
|
44 | 40 | "text": [
|
45 |
| - "The date after one week - 2018/02/09\n" |
| 41 | + "The date after one week - 2018/02/28\n" |
46 | 42 | ]
|
47 | 43 | }
|
48 | 44 | ],
|
|
65 | 61 | {
|
66 | 62 | "cell_type": "code",
|
67 | 63 | "execution_count": 3,
|
68 |
| - "metadata": { |
69 |
| - "collapsed": true |
70 |
| - }, |
| 64 | + "metadata": {}, |
71 | 65 | "outputs": [],
|
72 | 66 | "source": [
|
73 | 67 | "resp = requests.post(url, data=form_data)\n",
|
|
78 | 72 | {
|
79 | 73 | "cell_type": "code",
|
80 | 74 | "execution_count": 4,
|
81 |
| - "metadata": { |
82 |
| - "collapsed": false |
83 |
| - }, |
| 75 | + "metadata": {}, |
84 | 76 | "outputs": [
|
85 | 77 | {
|
86 | 78 | "data": {
|
87 | 79 | "text/html": [
|
88 | 80 | "<div>\n",
|
| 81 | + "<style scoped>\n", |
| 82 | + " .dataframe tbody tr th:only-of-type {\n", |
| 83 | + " vertical-align: middle;\n", |
| 84 | + " }\n", |
| 85 | + "\n", |
| 86 | + " .dataframe tbody tr th {\n", |
| 87 | + " vertical-align: top;\n", |
| 88 | + " }\n", |
| 89 | + "\n", |
| 90 | + " .dataframe thead th {\n", |
| 91 | + " text-align: right;\n", |
| 92 | + " }\n", |
| 93 | + "</style>\n", |
89 | 94 | "<table border=\"1\" class=\"dataframe\">\n",
|
90 | 95 | " <thead>\n",
|
91 | 96 | " <tr style=\"text-align: right;\">\n",
|
|
104 | 109 | " <td>14:11</td>\n",
|
105 | 110 | " <td>16:11</td>\n",
|
106 | 111 | " <td>02:00</td>\n",
|
107 |
| - " <td>65折起</td>\n", |
108 |
| - " </tr>\n", |
109 |
| - " <tr>\n", |
110 |
| - " <th>1</th>\n", |
111 |
| - " <td>0833</td>\n", |
112 |
| - " <td>14:11</td>\n", |
113 |
| - " <td>16:11</td>\n", |
114 |
| - " <td>02:00</td>\n", |
115 |
| - " <td>65折起</td>\n", |
116 |
| - " </tr>\n", |
117 |
| - " <tr>\n", |
118 |
| - " <th>2</th>\n", |
119 |
| - " <td>1649</td>\n", |
120 |
| - " <td>14:21</td>\n", |
121 |
| - " <td>16:06</td>\n", |
122 |
| - " <td>01:45</td>\n", |
123 | 112 | " <td>8折起</td>\n",
|
124 | 113 | " </tr>\n",
|
125 | 114 | " <tr>\n",
|
126 |
| - " <th>3</th>\n", |
127 |
| - " <td>1649</td>\n", |
128 |
| - " <td>14:21</td>\n", |
129 |
| - " <td>16:06</td>\n", |
130 |
| - " <td>01:45</td>\n", |
131 |
| - " <td>8折起</td>\n", |
132 |
| - " </tr>\n", |
133 |
| - " <tr>\n", |
134 |
| - " <th>4</th>\n", |
135 |
| - " <td>0651</td>\n", |
136 |
| - " <td>14:46</td>\n", |
137 |
| - " <td>16:32</td>\n", |
138 |
| - " <td>01:46</td>\n", |
139 |
| - " <td></td>\n", |
140 |
| - " </tr>\n", |
141 |
| - " <tr>\n", |
142 |
| - " <th>5</th>\n", |
| 115 | + " <th>1</th>\n", |
143 | 116 | " <td>0651</td>\n",
|
144 | 117 | " <td>14:46</td>\n",
|
145 | 118 | " <td>16:32</td>\n",
|
146 | 119 | " <td>01:46</td>\n",
|
147 | 120 | " <td></td>\n",
|
148 | 121 | " </tr>\n",
|
149 | 122 | " <tr>\n",
|
150 |
| - " <th>6</th>\n", |
151 |
| - " <td>0837</td>\n", |
152 |
| - " <td>15:11</td>\n", |
153 |
| - " <td>17:11</td>\n", |
154 |
| - " <td>02:00</td>\n", |
155 |
| - " <td>65折起</td>\n", |
156 |
| - " </tr>\n", |
157 |
| - " <tr>\n", |
158 |
| - " <th>7</th>\n", |
| 123 | + " <th>2</th>\n", |
159 | 124 | " <td>0837</td>\n",
|
160 | 125 | " <td>15:11</td>\n",
|
161 | 126 | " <td>17:11</td>\n",
|
162 | 127 | " <td>02:00</td>\n",
|
163 |
| - " <td>65折起</td>\n", |
164 |
| - " </tr>\n", |
165 |
| - " <tr>\n", |
166 |
| - " <th>8</th>\n", |
167 |
| - " <td>1655</td>\n", |
168 |
| - " <td>15:21</td>\n", |
169 |
| - " <td>17:06</td>\n", |
170 |
| - " <td>01:45</td>\n", |
171 |
| - " <td>8折起</td>\n", |
172 |
| - " </tr>\n", |
173 |
| - " <tr>\n", |
174 |
| - " <th>9</th>\n", |
175 |
| - " <td>1655</td>\n", |
176 |
| - " <td>15:21</td>\n", |
177 |
| - " <td>17:06</td>\n", |
178 |
| - " <td>01:45</td>\n", |
179 |
| - " <td>8折起</td>\n", |
180 |
| - " </tr>\n", |
181 |
| - " <tr>\n", |
182 |
| - " <th>10</th>\n", |
183 |
| - " <td>0657</td>\n", |
184 |
| - " <td>15:46</td>\n", |
185 |
| - " <td>17:32</td>\n", |
186 |
| - " <td>01:46</td>\n", |
187 | 128 | " <td>8折起</td>\n",
|
188 | 129 | " </tr>\n",
|
189 | 130 | " <tr>\n",
|
190 |
| - " <th>11</th>\n", |
| 131 | + " <th>3</th>\n", |
191 | 132 | " <td>0657</td>\n",
|
192 | 133 | " <td>15:46</td>\n",
|
193 | 134 | " <td>17:32</td>\n",
|
194 | 135 | " <td>01:46</td>\n",
|
195 |
| - " <td>8折起</td>\n", |
196 |
| - " </tr>\n", |
197 |
| - " <tr>\n", |
198 |
| - " <th>12</th>\n", |
199 |
| - " <td>1237</td>\n", |
200 |
| - " <td>15:51</td>\n", |
201 |
| - " <td>17:17</td>\n", |
202 |
| - " <td>01:26</td>\n", |
203 |
| - " <td></td>\n", |
204 |
| - " </tr>\n", |
205 |
| - " <tr>\n", |
206 |
| - " <th>13</th>\n", |
207 |
| - " <td>1237</td>\n", |
208 |
| - " <td>15:51</td>\n", |
209 |
| - " <td>17:17</td>\n", |
210 |
| - " <td>01:26</td>\n", |
211 | 136 | " <td></td>\n",
|
212 | 137 | " </tr>\n",
|
213 | 138 | " <tr>\n",
|
214 |
| - " <th>14</th>\n", |
215 |
| - " <td>0841</td>\n", |
216 |
| - " <td>16:11</td>\n", |
217 |
| - " <td>18:11</td>\n", |
218 |
| - " <td>02:00</td>\n", |
219 |
| - " <td>8折起</td>\n", |
220 |
| - " </tr>\n", |
221 |
| - " <tr>\n", |
222 |
| - " <th>15</th>\n", |
| 139 | + " <th>4</th>\n", |
223 | 140 | " <td>0841</td>\n",
|
224 | 141 | " <td>16:11</td>\n",
|
225 | 142 | " <td>18:11</td>\n",
|
226 | 143 | " <td>02:00</td>\n",
|
227 |
| - " <td>8折起</td>\n", |
228 |
| - " </tr>\n", |
229 |
| - " <tr>\n", |
230 |
| - " <th>16</th>\n", |
231 |
| - " <td>0661</td>\n", |
232 |
| - " <td>16:21</td>\n", |
233 |
| - " <td>18:06</td>\n", |
234 |
| - " <td>01:45</td>\n", |
235 |
| - " <td></td>\n", |
| 144 | + " <td>65折起</td>\n", |
236 | 145 | " </tr>\n",
|
237 | 146 | " <tr>\n",
|
238 |
| - " <th>17</th>\n", |
| 147 | + " <th>5</th>\n", |
239 | 148 | " <td>0661</td>\n",
|
240 | 149 | " <td>16:21</td>\n",
|
241 | 150 | " <td>18:06</td>\n",
|
242 | 151 | " <td>01:45</td>\n",
|
243 |
| - " <td></td>\n", |
| 152 | + " <td>8折起</td>\n", |
244 | 153 | " </tr>\n",
|
245 | 154 | " <tr>\n",
|
246 |
| - " <th>18</th>\n", |
| 155 | + " <th>6</th>\n", |
247 | 156 | " <td>0663</td>\n",
|
248 | 157 | " <td>16:46</td>\n",
|
249 | 158 | " <td>18:32</td>\n",
|
250 | 159 | " <td>01:46</td>\n",
|
251 | 160 | " <td></td>\n",
|
252 | 161 | " </tr>\n",
|
253 | 162 | " <tr>\n",
|
254 |
| - " <th>19</th>\n", |
255 |
| - " <td>0663</td>\n", |
256 |
| - " <td>16:46</td>\n", |
257 |
| - " <td>18:32</td>\n", |
| 163 | + " <th>7</th>\n", |
| 164 | + " <td>0845</td>\n", |
| 165 | + " <td>17:11</td>\n", |
| 166 | + " <td>19:11</td>\n", |
| 167 | + " <td>02:00</td>\n", |
| 168 | + " <td>65折起</td>\n", |
| 169 | + " </tr>\n", |
| 170 | + " <tr>\n", |
| 171 | + " <th>8</th>\n", |
| 172 | + " <td>0667</td>\n", |
| 173 | + " <td>17:21</td>\n", |
| 174 | + " <td>19:06</td>\n", |
| 175 | + " <td>01:45</td>\n", |
| 176 | + " <td>8折起</td>\n", |
| 177 | + " </tr>\n", |
| 178 | + " <tr>\n", |
| 179 | + " <th>9</th>\n", |
| 180 | + " <td>0669</td>\n", |
| 181 | + " <td>17:46</td>\n", |
| 182 | + " <td>19:32</td>\n", |
258 | 183 | " <td>01:46</td>\n",
|
259 | 184 | " <td></td>\n",
|
260 | 185 | " </tr>\n",
|
|
263 | 188 | "</div>"
|
264 | 189 | ],
|
265 | 190 | "text/plain": [
|
266 |
| - " 車次 出發時間 抵達時間 行車時間 早鳥\n", |
267 |
| - "0 0833 14:11 16:11 02:00 65折起\n", |
268 |
| - "1 0833 14:11 16:11 02:00 65折起\n", |
269 |
| - "2 1649 14:21 16:06 01:45 8折起\n", |
270 |
| - "3 1649 14:21 16:06 01:45 8折起\n", |
271 |
| - "4 0651 14:46 16:32 01:46 \n", |
272 |
| - "5 0651 14:46 16:32 01:46 \n", |
273 |
| - "6 0837 15:11 17:11 02:00 65折起\n", |
274 |
| - "7 0837 15:11 17:11 02:00 65折起\n", |
275 |
| - "8 1655 15:21 17:06 01:45 8折起\n", |
276 |
| - "9 1655 15:21 17:06 01:45 8折起\n", |
277 |
| - "10 0657 15:46 17:32 01:46 8折起\n", |
278 |
| - "11 0657 15:46 17:32 01:46 8折起\n", |
279 |
| - "12 1237 15:51 17:17 01:26 \n", |
280 |
| - "13 1237 15:51 17:17 01:26 \n", |
281 |
| - "14 0841 16:11 18:11 02:00 8折起\n", |
282 |
| - "15 0841 16:11 18:11 02:00 8折起\n", |
283 |
| - "16 0661 16:21 18:06 01:45 \n", |
284 |
| - "17 0661 16:21 18:06 01:45 \n", |
285 |
| - "18 0663 16:46 18:32 01:46 \n", |
286 |
| - "19 0663 16:46 18:32 01:46 " |
| 191 | + " 車次 出發時間 抵達時間 行車時間 早鳥\n", |
| 192 | + "0 0833 14:11 16:11 02:00 8折起\n", |
| 193 | + "1 0651 14:46 16:32 01:46 \n", |
| 194 | + "2 0837 15:11 17:11 02:00 8折起\n", |
| 195 | + "3 0657 15:46 17:32 01:46 \n", |
| 196 | + "4 0841 16:11 18:11 02:00 65折起\n", |
| 197 | + "5 0661 16:21 18:06 01:45 8折起\n", |
| 198 | + "6 0663 16:46 18:32 01:46 \n", |
| 199 | + "7 0845 17:11 19:11 02:00 65折起\n", |
| 200 | + "8 0667 17:21 19:06 01:45 8折起\n", |
| 201 | + "9 0669 17:46 19:32 01:46 " |
287 | 202 | ]
|
288 | 203 | },
|
289 | 204 | "execution_count": 4,
|
|
292 | 207 | }
|
293 | 208 | ],
|
294 | 209 | "source": [
|
295 |
| - "rows = soup.table.find_all('tr')\n", |
| 210 | + "rows = soup.table.find_all('tr', recursive=False)\n", |
296 | 211 | "\n",
|
297 | 212 | "colname, rows = rows[1], rows[2:]\n",
|
298 | 213 | "colname = list(colname.stripped_strings)\n",
|
|
312 | 227 | " early_ticket = early_ticket[0] if early_ticket else ''\n",
|
313 | 228 | " \n",
|
314 | 229 | " rows[i] = [trips, t_departure, t_arrive, duration, early_ticket]\n",
|
315 |
| - " \n", |
| 230 | + "\n", |
316 | 231 | "df = pd.DataFrame(rows, columns=colname)\n",
|
317 | 232 | "df"
|
318 | 233 | ]
|
319 | 234 | },
|
320 | 235 | {
|
321 | 236 | "cell_type": "code",
|
322 | 237 | "execution_count": 5,
|
323 |
| - "metadata": { |
324 |
| - "collapsed": false |
325 |
| - }, |
| 238 | + "metadata": {}, |
326 | 239 | "outputs": [
|
327 | 240 | {
|
328 | 241 | "name": "stdout",
|
329 | 242 | "output_type": "stream",
|
330 | 243 | "text": [
|
331 |
| - "Save csv to /home/afun/github/Python-Crawling-Tutorial/results/thsrc_20180209.csv\n" |
| 244 | + "Save csv to /home/afun/github/Python-Crawling-Tutorial/results/thsrc_20180228.csv\n" |
332 | 245 | ]
|
333 | 246 | }
|
334 | 247 | ],
|
|
0 commit comments