
Commit c1cb332

add: crawl all yesterday articles
1 parent f3604b1 commit c1cb332

File tree: 1 file changed, +265 −0 lines changed
@@ -0,0 +1,265 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Crawl all of yesterday's articles\n",
    "\n",
    "https://www.ptt.cc/bbs/Gossiping/index.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import re\n",
    "import json\n",
    "\n",
    "from bs4 import BeautifulSoup, NavigableString\n",
    "from datetime import datetime, timedelta\n",
    "from pprint import pprint\n",
    "from urllib.parse import urljoin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "09/25\n"
     ]
    }
   ],
   "source": [
    "base_url = 'https://www.ptt.cc/bbs/Gossiping/index.html'\n",
    "ptt_yesterday = datetime.now() - timedelta(days=1)\n",
    "ptt_yesterday_str = ptt_yesterday.strftime('%m/%d')\n",
    "print(ptt_yesterday_str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get the total page number\n",
    "\n",
    "Read the page number of page n-1 from the '‹ 上頁' (previous page) button in the HTML, then add one to that number to get the total page count."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "resp_base = requests.get(base_url, cookies={'over18': '1'})\n",
    "assert resp_base.status_code == 200\n",
    "soup_base = BeautifulSoup(resp_base.text, 'lxml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total page = 39219\n"
     ]
    }
   ],
   "source": [
    "paging_tag = soup_base.find(class_='btn-group-paging')\n",
    "total_page = None\n",
    "for btn_tag in paging_tag.findAll('a'):\n",
    "    if btn_tag.text == '‹ 上頁':\n",
    "        compile_page = re.search('(\\d+)', btn_tag['href'])\n",
    "        if compile_page:\n",
    "            total_page = int(compile_page.group(0)) + 1\n",
    "print('total page =', total_page)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Walk back through the dates and crawl the articles\n",
    "\n",
    "The oldest article page has page number 1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def crawl_article(url):\n",
    "    resp = requests.get(url, cookies={'over18': '1'})\n",
    "    if resp.status_code != 200:\n",
    "        return\n",
    "    soup = BeautifulSoup(resp.text, 'lxml')\n",
    "    print('Start crawling', url)\n",
    "\n",
    "    # ##############################\n",
    "    # crawl article\n",
    "    # ##############################\n",
    "    article = {\n",
    "        'author_id': '',\n",
    "        'author_nickname': '',\n",
    "        'title': '',\n",
    "        'timestamp': '',\n",
    "        'contents': '',\n",
    "        'ip': ''\n",
    "    }\n",
    "    article_body = soup.find(id='main-content')\n",
    "\n",
    "    # article header\n",
    "    article_head = article_body.findAll('div', class_='article-metaline')\n",
    "    for metaline in article_head:\n",
    "        meta_tag = metaline.find(class_='article-meta-tag').text\n",
    "        meta_value = metaline.find(class_='article-meta-value').text\n",
    "        if meta_tag == '作者':\n",
    "            compile_nickname = re.compile('\\((.*)\\)').search(meta_value)\n",
    "            article['author_id'] = meta_value.split('(')[0].strip(' ')\n",
    "            article['author_nickname'] = compile_nickname.group(1) if compile_nickname else ''\n",
    "        elif meta_tag == '標題':\n",
    "            article['title'] = meta_value\n",
    "        elif meta_tag == '時間':\n",
    "            article['timestamp'] = meta_value\n",
    "\n",
    "    # article content\n",
    "    contents = [expr for expr in article_body.contents if isinstance(expr, NavigableString)]\n",
    "    contents = [re.sub('\\n', '', expr) for expr in contents]\n",
    "    contents = [i for i in contents if i]\n",
    "    contents = '\\n'.join(contents)\n",
    "    article['contents'] = contents\n",
    "\n",
    "    # article publish ip\n",
    "    article_ip = article_body.find(class_='f2').text\n",
    "    compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(article_ip)\n",
    "    article['ip'] = compile_ip.group(0) if compile_ip else ''\n",
    "\n",
    "    # ##############################\n",
    "    # crawl comments\n",
    "    # ##############################\n",
    "    comments = []\n",
    "    for comment in article_body.findAll('div', class_='push'):\n",
    "        tag = comment.find(class_='push-tag').text\n",
    "        guest_id = comment.find(class_='push-userid').text\n",
    "        guest_content = comment.find(class_='push-content').text\n",
    "        guest_ipdatetime = comment.find(class_='push-ipdatetime').text\n",
    "        compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(guest_ipdatetime)\n",
    "        guest_ip = compile_ip.group(0) if compile_ip else ''\n",
    "        guest_timestamp = re.sub(guest_ip, '', guest_ipdatetime).strip()\n",
    "        comments.append({\n",
    "            'tag': tag,\n",
    "            'id': guest_id,\n",
    "            'content': guest_content,\n",
    "            'ip': guest_ip,\n",
    "            'timestamp': guest_timestamp\n",
    "        })\n",
    "\n",
    "    article['comments'] = comments\n",
    "    article['url'] = url\n",
    "    return article"
   ]
  },
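  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal usage sketch for `crawl_article`: the URL below is a hypothetical placeholder (a real PTT article URL would go there), and `pprint`, imported above, just makes the nested result readable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Usage sketch: the URL below is a made-up placeholder, not a real article.\n",
    "sample_url = 'https://www.ptt.cc/bbs/Gossiping/M.0000000000.A.000.html'\n",
    "sample_article = crawl_article(sample_url)\n",
    "if sample_article:\n",
    "    pprint(sample_article)"
   ]
  },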
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATE_GREATER = 1\n",
    "DATE_EQUAL = 0\n",
    "DATE_LESS = -1\n",
    "\n",
    "def compare_timestamp_md(src, dest):\n",
    "    \"\"\"\n",
    "    Compare two '%m/%d' date strings.\n",
    "\n",
    "    dest later than src:   1 (DATE_GREATER)\n",
    "    dest equal to src:     0 (DATE_EQUAL)\n",
    "    dest earlier than src: -1 (DATE_LESS)\n",
    "    \"\"\"\n",
    "    date_src = datetime.strptime(src, '%m/%d')\n",
    "    date_dest = datetime.strptime(dest, '%m/%d')\n",
    "    if date_dest > date_src:\n",
    "        return DATE_GREATER\n",
    "    elif date_dest == date_src:\n",
    "        return DATE_EQUAL\n",
    "    else:\n",
    "        return DATE_LESS"
   ]
  },
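  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of `compare_timestamp_md`: the hard-coded dates below are illustrative only and follow the return values documented above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustration only: compare a few hard-coded %m/%d strings against '09/25'.\n",
    "assert compare_timestamp_md('09/25', '09/26') == DATE_GREATER\n",
    "assert compare_timestamp_md('09/25', '09/25') == DATE_EQUAL\n",
    "assert compare_timestamp_md('09/25', '09/24') == DATE_LESS"
   ]
  },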
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://www.ptt.cc/bbs/Gossiping/index39219.html - date 9/26 result 1\n"
     ]
    }
   ],
   "source": [
    "for page in range(total_page, 1, -1):\n",
    "    current_url = 'https://www.ptt.cc/bbs/Gossiping/index{}.html'.format(page)\n",
    "    resp_page = requests.get(current_url, cookies={'over18': '1'})\n",
    "    if resp_page.status_code != 200:\n",
    "        continue\n",
    "    soup_page = BeautifulSoup(resp_page.text, 'lxml')\n",
    "\n",
    "    # ##############################\n",
    "    # check the first article date\n",
    "    # ##############################\n",
    "    container_tag = soup_page.find('div', class_='r-list-container')\n",
    "    first_article = container_tag.find('div', class_='r-ent')\n",
    "    first_article_date = first_article.find('div', class_='date').text.strip()\n",
    "    compare_datetime = compare_timestamp_md(ptt_yesterday_str, first_article_date)\n",
    "    print('{} - date {} result {}'.format(current_url, first_article_date, compare_datetime))\n",
    "    break"
   ]
  },
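  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A sketch of how the cells above could be combined to collect every article from yesterday: walk the index pages backwards, keep links whose list date equals `ptt_yesterday_str`, stop once a whole page is older than yesterday, then crawl each link and dump the results with `json` and `urljoin` (both imported above). The `div.title` selector and the output filename are assumptions; treat this as an outline rather than tested code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch only: gather yesterday's article links page by page and crawl them.\n",
    "yesterday_articles = []\n",
    "for page in range(total_page, 1, -1):\n",
    "    current_url = 'https://www.ptt.cc/bbs/Gossiping/index{}.html'.format(page)\n",
    "    resp_page = requests.get(current_url, cookies={'over18': '1'})\n",
    "    if resp_page.status_code != 200:\n",
    "        continue\n",
    "    soup_page = BeautifulSoup(resp_page.text, 'lxml')\n",
    "    container_tag = soup_page.find('div', class_='r-list-container')\n",
    "\n",
    "    page_dates = []\n",
    "    for entry in container_tag.findAll('div', class_='r-ent'):\n",
    "        entry_date = entry.find('div', class_='date').text.strip()\n",
    "        page_dates.append(entry_date)\n",
    "        title_tag = entry.find('div', class_='title').find('a')\n",
    "        # deleted articles have no <a> tag in the title block\n",
    "        if title_tag is None:\n",
    "            continue\n",
    "        if compare_timestamp_md(ptt_yesterday_str, entry_date) == DATE_EQUAL:\n",
    "            article_url = urljoin(current_url, title_tag['href'])\n",
    "            article = crawl_article(article_url)\n",
    "            if article:\n",
    "                yesterday_articles.append(article)\n",
    "\n",
    "    # once every entry on a page is older than yesterday, stop walking back\n",
    "    if page_dates and all(compare_timestamp_md(ptt_yesterday_str, d) == DATE_LESS for d in page_dates):\n",
    "        break\n",
    "\n",
    "# hypothetical output filename, used here only for illustration\n",
    "with open('gossiping_yesterday.json', 'w', encoding='utf-8') as f:\n",
    "    json.dump(yesterday_articles, f, ensure_ascii=False, indent=2)\n",
    "\n",
    "print('crawled', len(yesterday_articles), 'articles')"
   ]
  },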
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
