Skip to content

Commit f981bc2

Browse files
committed
modified to crawl today's articles
1 parent c1cb332 commit f981bc2

File tree

1 file changed

+85
-17
lines changed

1 file changed

+85
-17
lines changed

appendix_ptt/02_yesterday_articles.ipynb renamed to appendix_ptt/02_today_articles.ipynb

Lines changed: 85 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"cell_type": "markdown",
55
"metadata": {},
66
"source": [
7-
"# 爬取昨天的所有文章\n",
7+
"# 爬取今天到目前為止的所有文章\n",
88
"\n",
99
"https://www.ptt.cc/bbs/Gossiping/index.html"
1010
]
@@ -20,7 +20,7 @@
2020
"import json\n",
2121
"\n",
2222
"from bs4 import BeautifulSoup, NavigableString\n",
23-
"from datetime import datetime, timedelta\n",
23+
"from datetime import datetime\n",
2424
"from pprint import pprint\n",
2525
"from urllib.parse import urljoin"
2626
]
@@ -34,15 +34,15 @@
3434
"name": "stdout",
3535
"output_type": "stream",
3636
"text": [
37-
"09/25\n"
37+
"09/27\n"
3838
]
3939
}
4040
],
4141
"source": [
4242
"base_url = 'https://www.ptt.cc/bbs/Gossiping/index.html'\n",
43-
"ptt_yesterday = datetime.now() - timedelta(days=1)\n",
44-
"ptt_yesterday_str = ptt_yesterday.strftime('%m/%d')\n",
45-
"print(ptt_yesterday_str)"
43+
"ptt_today = datetime.now()\n",
44+
"ptt_today_str = ptt_today.strftime('%m/%d')\n",
45+
"print(ptt_today_str)"
4646
]
4747
},
4848
{
@@ -74,7 +74,7 @@
7474
"name": "stdout",
7575
"output_type": "stream",
7676
"text": [
77-
"total page = 39219\n"
77+
"total page = 39228\n"
7878
]
7979
}
8080
],
@@ -210,11 +210,62 @@
210210
"name": "stdout",
211211
"output_type": "stream",
212212
"text": [
213-
"https://www.ptt.cc/bbs/Gossiping/index39219.html - date 9/26 result 1\n"
213+
"https://www.ptt.cc/bbs/Gossiping/index39228.html - date 9/27 result 0\n",
214+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978608.A.325.html\n",
215+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978662.A.45A.html\n",
216+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978695.A.9A7.html\n",
217+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978699.A.194.html\n",
218+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978724.A.356.html\n",
219+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978750.A.39A.html\n",
220+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978768.A.08B.html\n",
221+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978815.A.5B2.html\n",
222+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978820.A.119.html\n",
223+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978934.A.F8E.html\n",
224+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978941.A.754.html\n",
225+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978960.A.779.html\n",
226+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978973.A.B90.html\n",
227+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978993.A.F88.html\n",
228+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537979013.A.67C.html\n",
229+
"https://www.ptt.cc/bbs/Gossiping/index39227.html - date 9/27 result 0\n",
230+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977913.A.4EE.html\n",
231+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977930.A.01B.html\n",
232+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977933.A.013.html\n",
233+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977952.A.904.html\n",
234+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977959.A.A7B.html\n",
235+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977966.A.77C.html\n",
236+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978043.A.03E.html\n",
237+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978060.A.9DF.html\n",
238+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978098.A.D36.html\n",
239+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978140.A.C44.html\n",
240+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978152.A.31C.html\n",
241+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978156.A.B1A.html\n",
242+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978179.A.844.html\n",
243+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978195.A.D33.html\n",
244+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978272.A.533.html\n",
245+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978295.A.B6A.html\n",
246+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978350.A.D02.html\n",
247+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978378.A.746.html\n",
248+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978494.A.B6B.html\n",
249+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978521.A.06B.html\n",
250+
"https://www.ptt.cc/bbs/Gossiping/index39226.html - date 9/26 result -1\n",
251+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977639.A.3F8.html\n",
252+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977693.A.A67.html\n",
253+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977700.A.FD6.html\n",
254+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977711.A.493.html\n",
255+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977729.A.BE4.html\n",
256+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977740.A.534.html\n",
257+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977827.A.B50.html\n",
258+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977851.A.17A.html\n",
259+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977857.A.B1D.html\n",
260+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977877.A.292.html\n",
261+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977878.A.13E.html\n",
262+
"Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977910.A.566.html\n",
263+
"Save - today_articles.json\n"
214264
]
215265
}
216266
],
217267
"source": [
268+
"data = []\n",
218269
"for page in range(total_page, 1, -1):\n",
219270
" current_url = 'https://www.ptt.cc/bbs/Gossiping/index{}.html'.format(page)\n",
220271
" resp_page = requests.get(current_url, cookies={'over18': '1'})\n",
@@ -228,17 +279,34 @@
228279
" container_tag = soup_page.find('div', class_='r-list-container')\n",
229280
" first_article = container_tag.find('div', class_='r-ent')\n",
230281
" first_article_date = first_article.find('div', class_='date').text.strip()\n",
231-
" compare_datetime = compare_timestamp_md(ptt_yesterday_str, first_article_date)\n",
282+
" compare_datetime = compare_timestamp_md(ptt_today_str, first_article_date)\n",
232283
" print('{} - date {} result {}'.format(current_url, first_article_date, compare_datetime))\n",
233-
" break"
284+
" \n",
285+
" if compare_datetime == 1:\n",
286+
" continue\n",
287+
" else:\n",
288+
"        # only crawl today's articles that come before the r-list-sep line\n",
289+
" for article_row_tag in container_tag.findChildren('div', recursive=False):\n",
290+
" if 'r-list-sep' in article_row_tag['class']:\n",
291+
" break\n",
292+
" if 'r-ent' in article_row_tag['class']:\n",
293+
" article_date = article_row_tag.find('div', class_='date').text.strip()\n",
294+
" article_date_compare = compare_timestamp_md(ptt_today_str, article_date)\n",
295+
" if article_date_compare != 0:\n",
296+
" continue\n",
297+
" article_tag = article_row_tag.find('a', href=True)\n",
298+
" article_url = urljoin(base_url, article_tag['href'])\n",
299+
" article_data = crawl_article(article_url)\n",
300+
" data.append(article_data)\n",
301+
"\n",
302+
"    # if this page's first article is dated earlier than today, stop iterating over pages\n",
303+
" if compare_datetime == -1:\n",
304+
" break\n",
305+
"\n",
306+
"with open('today_articles.json', 'w+', encoding='utf-8') as f:\n",
307+
" json.dump(data, f, indent=2, ensure_ascii=False)\n",
308+
" print('Save - today_articles.json')"
234309
]
235-
},
236-
{
237-
"cell_type": "code",
238-
"execution_count": null,
239-
"metadata": {},
240-
"outputs": [],
241-
"source": []
242310
}
243311
],
244312
"metadata": {

0 commit comments

Comments
 (0)