Commit 63d2e50

committed
add ptt search api
1 parent 2d0d460 commit 63d2e50

1 file changed: +215 -0 lines changed (215 additions & 0 deletions)
@@ -0,0 +1,215 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Given an article title, crawl all related articles on PTT\n",
"\n",
"- title: [新聞] 2噸水晶球沿街滾 撞壞5輛汽機車和民宅\n",
"- URL encoding (UTF-8)\n",
"- combine URL paths"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import re\n",
"import json\n",
"\n",
"from bs4 import BeautifulSoup, NavigableString\n",
"from pprint import pprint\n",
"from urllib.parse import urlencode, urljoin"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"QUERY_TITLE = '[新聞] 2噸水晶球沿街滾 撞壞5輛汽機車和民宅'\n",
"cookies = {'over18': '1'}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## URL encoding\n",
"\n",
"Get the list of articles with the same title"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"https://www.ptt.cc/bbs/Gossiping/search?q=%5B%E6%96%B0%E8%81%9E%5D+2%E5%99%B8%E6%B0%B4%E6%99%B6%E7%90%83%E6%B2%BF%E8%A1%97%E6%BB%BE+%E6%92%9E%E5%A3%9E5%E8%BC%9B%E6%B1%BD%E6%A9%9F%E8%BB%8A%E5%92%8C%E6%B0%91%E5%AE%85\n"
]
}
],
"source": [
"encoding_title = urlencode({'q': QUERY_TITLE})\n",
"query = 'https://www.ptt.cc/bbs/Gossiping/search?{}'.format(encoding_title)\n",
"print(query)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"resp_article_list = requests.get(query, cookies=cookies)\n",
"soup_article_list = BeautifulSoup(resp_article_list.text, 'lxml')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## List all matching articles and crawl them"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def crawl_article(url):\n",
"    resp = requests.get(url, cookies={'over18': '1'})\n",
"    if resp.status_code != 200:\n",
"        return\n",
"    soup = BeautifulSoup(resp.text, 'lxml')\n",
"    print('Start crawling', url)\n",
"\n",
"    # ##############################\n",
"    # crawl article\n",
"    # ##############################\n",
"    article = {\n",
"        'author_id': '',\n",
"        'author_nickname': '',\n",
"        'title': '',\n",
"        'timestamp': '',\n",
"        'contents': '',\n",
"        'ip': ''\n",
"    }\n",
"    article_body = soup.find(id='main-content')\n",
"\n",
"    # article header\n",
"    article_head = article_body.findAll('div', class_='article-metaline')\n",
"    for metaline in article_head:\n",
"        meta_tag = metaline.find(class_='article-meta-tag').text\n",
"        meta_value = metaline.find(class_='article-meta-value').text\n",
"        if meta_tag == '作者':\n",
"            compile_nickname = re.compile('\\((.*)\\)').search(meta_value)\n",
"            article['author_id'] = meta_value.split('(')[0].strip(' ')\n",
"            article['author_nickname'] = compile_nickname.group(1) if compile_nickname else ''\n",
"        elif meta_tag == '標題':\n",
"            article['title'] = meta_value\n",
"        elif meta_tag == '時間':\n",
"            article['timestamp'] = meta_value\n",
"\n",
"    # article content\n",
"    contents = [expr for expr in article_body.contents if isinstance(expr, NavigableString)]\n",
"    contents = [re.sub('\\n', '', expr) for expr in contents]\n",
"    contents = [i for i in contents if i]\n",
"    contents = '\\n'.join(contents)\n",
"    article['contents'] = contents\n",
"\n",
"    # article publish ip\n",
"    article_ip = article_body.find(class_='f2').text\n",
"    compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(article_ip)\n",
"    article['ip'] = compile_ip.group(0) if compile_ip else ''\n",
"\n",
"    # ##############################\n",
"    # crawl comments\n",
"    # ##############################\n",
"    comments = []\n",
"    for comment in article_body.findAll('div', class_='push'):\n",
"        tag = comment.find(class_='push-tag').text\n",
"        guest_id = comment.find(class_='push-userid').text\n",
"        guest_content = comment.find(class_='push-content').text\n",
"        guest_ipdatetime = comment.find(class_='push-ipdatetime').text\n",
"        compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(guest_ipdatetime)\n",
"        guest_ip = compile_ip.group(0) if compile_ip else ''\n",
"        guest_timestamp = re.sub(guest_ip, '', guest_ipdatetime).strip()\n",
"        comments.append({\n",
"            'tag': tag,\n",
"            'id': guest_id,\n",
"            'content': guest_content,\n",
"            'ip': guest_ip,\n",
"            'timestamp': guest_timestamp\n",
"        })\n",
"\n",
"    article['comments'] = comments\n",
"    article['url'] = url\n",
"    return article"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start crawling https://www.ptt.cc/bbs/Gossiping/M.1537874850.A.20D.html\n",
"Start crawling https://www.ptt.cc/bbs/Gossiping/M.1537868945.A.8A9.html\n",
"Start crawling https://www.ptt.cc/bbs/Gossiping/M.1537861382.A.154.html\n",
"Start crawling https://www.ptt.cc/bbs/Gossiping/M.1537859788.A.BE2.html\n",
"Start crawling https://www.ptt.cc/bbs/Gossiping/M.1537859045.A.287.html\n",
"Save - search_api_by_title.json\n"
]
}
],
"source": [
"data = []\n",
"for article_line in soup_article_list.findAll('div', class_='r-ent'):\n",
"    title_tag = article_line.find('div', class_='title')\n",
"    article_url = title_tag.find('a')['href']\n",
"    article_url = urljoin(resp_article_list.url, article_url)\n",
"    article_data = crawl_article(article_url)\n",
"    data.append(article_data)\n",
"\n",
"with open('search_api_by_title.json', 'w+', encoding='utf-8') as f:\n",
"    json.dump(data, f, indent=2, ensure_ascii=False)\n",
"    print('Save - search_api_by_title.json')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
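
As a side note on the two URL-handling steps the notebook's first cell mentions (UTF-8 query encoding and combining URL paths), here is a minimal standalone sketch; the relative href used below is an assumed example of the kind of link found in the search-result page, shown only for illustration:

from urllib.parse import urlencode, urljoin

# Build the search URL: urlencode percent-encodes the UTF-8 title as the `q` parameter.
QUERY_TITLE = '[新聞] 2噸水晶球沿街滾 撞壞5輛汽機車和民宅'
search_url = 'https://www.ptt.cc/bbs/Gossiping/search?{}'.format(urlencode({'q': QUERY_TITLE}))
print(search_url)

# Combine URL paths: search results link to articles via relative hrefs
# (e.g. '/bbs/Gossiping/M.1537874850.A.20D.html'); urljoin resolves such a
# path against the page URL into an absolute article URL.
relative_href = '/bbs/Gossiping/M.1537874850.A.20D.html'  # illustrative value
print(urljoin(search_url, relative_href))
# -> https://www.ptt.cc/bbs/Gossiping/M.1537874850.A.20D.html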

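For orientation, this is roughly the shape of each record that crawl_article builds and that the final cell dumps into search_api_by_title.json; the field values below are placeholders, not real crawled data:

article = {
    'author_id': 'someuser',            # PTT user id, parsed from the 作者 metaline
    'author_nickname': 'somenickname',  # text inside the parentheses of the 作者 metaline
    'title': '[新聞] ...',               # 標題 metaline
    'timestamp': '...',                  # 時間 metaline, kept as the raw string
    'contents': '...',                   # post body, NavigableString pieces joined with newlines
    'ip': '127.0.0.1',                   # posting IP parsed from the f2 footer line
    'comments': [                        # one entry per push comment
        {'tag': '...', 'id': '...', 'content': '...', 'ip': '...', 'timestamp': '...'}
    ],
    'url': 'https://www.ptt.cc/bbs/Gossiping/M.1537874850.A.20D.html'
}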