|
4 | 4 | "cell_type": "markdown",
|
5 | 5 | "metadata": {},
|
6 | 6 | "source": [
|
7 |
| - "# 爬取昨天的所有文章\n", |
| 7 | + "# 爬取今天到目前為止的所有文章\n", |
8 | 8 | "\n",
|
9 | 9 | "https://www.ptt.cc/bbs/Gossiping/index.html"
|
10 | 10 | ]
|
|
20 | 20 | "import json\n",
|
21 | 21 | "\n",
|
22 | 22 | "from bs4 import BeautifulSoup, NavigableString\n",
|
23 |
| - "from datetime import datetime, timedelta\n", |
| 23 | + "from datetime import datetime\n", |
24 | 24 | "from pprint import pprint\n",
|
25 | 25 | "from urllib.parse import urljoin"
|
26 | 26 | ]
|
|
34 | 34 | "name": "stdout",
|
35 | 35 | "output_type": "stream",
|
36 | 36 | "text": [
|
37 |
| - "09/25\n" |
| 37 | + "09/27\n" |
38 | 38 | ]
|
39 | 39 | }
|
40 | 40 | ],
|
41 | 41 | "source": [
|
42 | 42 | "base_url = 'https://www.ptt.cc/bbs/Gossiping/index.html'\n",
|
43 |
| - "ptt_yesterday = datetime.now() - timedelta(days=1)\n", |
44 |
| - "ptt_yesterday_str = ptt_yesterday.strftime('%m/%d')\n", |
45 |
| - "print(ptt_yesterday_str)" |
| 43 | + "ptt_today = datetime.now()\n", |
| 44 | + "ptt_today_str = ptt_today.strftime('%m/%d')\n", |
| 45 | + "print(ptt_today_str)" |
46 | 46 | ]
|
47 | 47 | },
|
48 | 48 | {
|
|
74 | 74 | "name": "stdout",
|
75 | 75 | "output_type": "stream",
|
76 | 76 | "text": [
|
77 |
| - "total page = 39219\n" |
| 77 | + "total page = 39228\n" |
78 | 78 | ]
|
79 | 79 | }
|
80 | 80 | ],
|
|
210 | 210 | "name": "stdout",
|
211 | 211 | "output_type": "stream",
|
212 | 212 | "text": [
|
213 |
| - "https://www.ptt.cc/bbs/Gossiping/index39219.html - date 9/26 result 1\n" |
| 213 | + "https://www.ptt.cc/bbs/Gossiping/index39228.html - date 9/27 result 0\n", |
| 214 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978608.A.325.html\n", |
| 215 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978662.A.45A.html\n", |
| 216 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978695.A.9A7.html\n", |
| 217 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978699.A.194.html\n", |
| 218 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978724.A.356.html\n", |
| 219 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978750.A.39A.html\n", |
| 220 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978768.A.08B.html\n", |
| 221 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978815.A.5B2.html\n", |
| 222 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978820.A.119.html\n", |
| 223 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978934.A.F8E.html\n", |
| 224 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978941.A.754.html\n", |
| 225 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978960.A.779.html\n", |
| 226 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978973.A.B90.html\n", |
| 227 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978993.A.F88.html\n", |
| 228 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537979013.A.67C.html\n", |
| 229 | + "https://www.ptt.cc/bbs/Gossiping/index39227.html - date 9/27 result 0\n", |
| 230 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977913.A.4EE.html\n", |
| 231 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977930.A.01B.html\n", |
| 232 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977933.A.013.html\n", |
| 233 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977952.A.904.html\n", |
| 234 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977959.A.A7B.html\n", |
| 235 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977966.A.77C.html\n", |
| 236 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978043.A.03E.html\n", |
| 237 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978060.A.9DF.html\n", |
| 238 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978098.A.D36.html\n", |
| 239 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978140.A.C44.html\n", |
| 240 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978152.A.31C.html\n", |
| 241 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978156.A.B1A.html\n", |
| 242 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978179.A.844.html\n", |
| 243 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978195.A.D33.html\n", |
| 244 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978272.A.533.html\n", |
| 245 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978295.A.B6A.html\n", |
| 246 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978350.A.D02.html\n", |
| 247 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978378.A.746.html\n", |
| 248 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978494.A.B6B.html\n", |
| 249 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978521.A.06B.html\n", |
| 250 | + "https://www.ptt.cc/bbs/Gossiping/index39226.html - date 9/26 result -1\n", |
| 251 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977639.A.3F8.html\n", |
| 252 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977693.A.A67.html\n", |
| 253 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977700.A.FD6.html\n", |
| 254 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977711.A.493.html\n", |
| 255 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977729.A.BE4.html\n", |
| 256 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977740.A.534.html\n", |
| 257 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977827.A.B50.html\n", |
| 258 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977851.A.17A.html\n", |
| 259 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977857.A.B1D.html\n", |
| 260 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977877.A.292.html\n", |
| 261 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977878.A.13E.html\n", |
| 262 | + "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977910.A.566.html\n", |
| 263 | + "Save - today_articles.json\n" |
214 | 264 | ]
|
215 | 265 | }
|
216 | 266 | ],
|
217 | 267 | "source": [
|
| 268 | + "data = []\n", |
218 | 269 | "for page in range(total_page, 1, -1):\n",
|
219 | 270 | " current_url = 'https://www.ptt.cc/bbs/Gossiping/index{}.html'.format(page)\n",
|
220 | 271 | " resp_page = requests.get(current_url, cookies={'over18': '1'})\n",
|
|
228 | 279 | " container_tag = soup_page.find('div', class_='r-list-container')\n",
|
229 | 280 | " first_article = container_tag.find('div', class_='r-ent')\n",
|
230 | 281 | " first_article_date = first_article.find('div', class_='date').text.strip()\n",
|
231 |
| - " compare_datetime = compare_timestamp_md(ptt_yesterday_str, first_article_date)\n", |
| 282 | + " compare_datetime = compare_timestamp_md(ptt_today_str, first_article_date)\n", |
232 | 283 | " print('{} - date {} result {}'.format(current_url, first_article_date, compare_datetime))\n",
|
233 |
| - " break" |
| 284 | + " \n", |
| 285 | + " if compare_datetime == 1:\n", |
| 286 | + " continue\n", |
| 287 | + " else:\n", |
| 288 | + " # only crawl today's articles listed before the r-list-sep line\n", |
| 289 | + " for article_row_tag in container_tag.findChildren('div', recursive=False):\n", |
| 290 | + " if 'r-list-sep' in article_row_tag['class']:\n", |
| 291 | + " break\n", |
| 292 | + " if 'r-ent' in article_row_tag['class']:\n", |
| 293 | + " article_date = article_row_tag.find('div', class_='date').text.strip()\n", |
| 294 | + " article_date_compare = compare_timestamp_md(ptt_today_str, article_date)\n", |
| 295 | + " if article_date_compare != 0:\n", |
| 296 | + " continue\n", |
| 297 | + " article_tag = article_row_tag.find('a', href=True)\n", |
| 298 | + " article_url = urljoin(base_url, article_tag['href'])\n", |
| 299 | + " article_data = crawl_article(article_url)\n", |
| 300 | + " data.append(article_data)\n", |
| 301 | + "\n", |
| 302 | + " # stop iterating once the first article on the page is dated earlier than today\n", |
| 303 | + " if compare_datetime == -1:\n", |
| 304 | + " break\n", |
| 305 | + "\n", |
| 306 | + "with open('today_articles.json', 'w+', encoding='utf-8') as f:\n", |
| 307 | + " json.dump(data, f, indent=2, ensure_ascii=False)\n", |
| 308 | + " print('Save - today_articles.json')" |
234 | 309 | ]
|
235 |
| - }, |
236 |
| - { |
237 |
| - "cell_type": "code", |
238 |
| - "execution_count": null, |
239 |
| - "metadata": {}, |
240 |
| - "outputs": [], |
241 |
| - "source": [] |
242 | 310 | }
|
243 | 311 | ],
|
244 | 312 | "metadata": {
|
|
0 commit comments