afunTW
diff --git a/‎appendix_ptt/01_search_api_by_title.ipynb‎
Lines changed: 215 additions & 0 deletions b/‎appendix_ptt/01_search_api_by_title.ipynb‎
Lines changed: 215 additions & 0 deletions
@@ -0,0 +1,215 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 給定文章標題，爬取 PTT 上的所有相關文章\n",
+ "\n",
+ "- title: [新聞] 2噸水晶球沿街滾 撞壞5輛汽機車和民宅\n",
+ "- URL encoing (UTF-8)\n",
+ "- combine URL path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "import re\n",
+ "import json\n",
+ "\n",
+ "from bs4 import BeautifulSoup, NavigableString\n",
+ "from pprint import pprint\n",
+ "from urllib.parse import urlencode, urljoin"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "QUERY_TITLE = '[新聞] 2噸水晶球沿街滾 撞壞5輛汽機車和民宅'\n",
+ "cookies = {'over18': '1'}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## URL encoding\n",
+ "\n",
+ "取得相同文章標題的列表"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "https://www.ptt.cc/bbs/Gossiping/search?q=%5B%E6%96%B0%E8%81%9E%5D+2%E5%99%B8%E6%B0%B4%E6%99%B6%E7%90%83%E6%B2%BF%E8%A1%97%E6%BB%BE+%E6%92%9E%E5%A3%9E5%E8%BC%9B%E6%B1%BD%E6%A9%9F%E8%BB%8A%E5%92%8C%E6%B0%91%E5%AE%85\n"
+ ]
+ }
+ ],
+ "source": [
+ "encoding_title = urlencode({'q': QUERY_TITLE})\n",
+ "query = 'https://www.ptt.cc/bbs/Gossiping/search?{}'.format(encoding_title)\n",
+ "print(query)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "resp_article_list = requests.get(query, cookies=cookies)\n",
+ "soup_article_list = BeautifulSoup(resp_article_list.text, 'lxml')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 列出所有文章並爬取"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def crawl_article(url):\n",
+ " resp = requests.get(url, cookies={'over18': '1'})\n",
+ " if resp.status_code != 200:\n",
+ " return\n",
+ " soup = BeautifulSoup(resp.text, 'lxml')\n",
+ " print('Start to Crawling', url)\n",
+ "\n",
+ " # ##############################\n",
+ " # crawl article\n",
+ " # ##############################\n",
+ " article = {\n",
+ " 'author_id': '',\n",
+ " 'author_nickname': '',\n",
+ " 'title': '',\n",
+ " 'timestamp': '',\n",
+ " 'contents': '',\n",
+ " 'ip': ''\n",
+ " }\n",
+ " article_body = soup.find(id='main-content')\n",
+ "\n",
+ " # article header\n",
+ " article_head = article_body.findAll('div', class_='article-metaline')\n",
+ " for metaline in article_head:\n",
+ " meta_tag = metaline.find(class_='article-meta-tag').text\n",
+ " meta_value = metaline.find(class_='article-meta-value').text\n",
+ " if meta_tag == '作者':\n",
+ " compile_nickname = re.compile('\\((.*)\\)').search(meta_value)\n",
+ " article['author_id'] = meta_value.split('(')[0].strip(' ')\n",
+ " article['author_nickname'] = compile_nickname.group(1) if compile_nickname else ''\n",
+ " elif meta_tag == '標題':\n",
+ " article['title'] = meta_value\n",
+ " elif meta_tag == '時間':\n",
+ " article['timestamp'] = meta_value\n",
+ "\n",
+ " # article content\n",
+ " contents = [expr for expr in article_body.contents if isinstance(expr, NavigableString)]\n",
+ " contents = [re.sub('\\n', '', expr) for expr in contents]\n",
+ " contents = [i for i in contents if i]\n",
+ " contents = '\\n'.join(contents)\n",
+ " article['contents'] = contents\n",
+ "\n",
+ " # article publish ip\n",
+ " article_ip = article_body.find(class_='f2').text\n",
+ " compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(article_ip)\n",
+ " article['ip'] = compile_ip.group(0) if compile_ip else ''\n",
+ "\n",
+ " # ##############################\n",
+ " # crawl comments\n",
+ " # ##############################\n",
+ " comments = []\n",
+ " for comment in article_body.findAll('div', class_='push'):\n",
+ " tag = comment.find(class_='push-tag').text\n",
+ " guest_id = comment.find(class_='push-userid').text\n",
+ " guest_content = comment.find(class_='push-content').text\n",
+ " guest_ipdatetime = comment.find(class_='push-ipdatetime').text\n",
+ " compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(guest_ipdatetime)\n",
+ " guest_ip = compile_ip.group(0) if compile_ip else ''\n",
+ " guest_timestamp = re.sub(guest_ip, '', guest_ipdatetime).strip()\n",
+ " comments.append({\n",
+ " 'tag': tag,\n",
+ " 'id': guest_id,\n",
+ " 'content': guest_content,\n",
+ " 'ip': guest_ip,\n",
+ " 'timestamp': guest_timestamp\n",
+ " })\n",
+ " \n",
+ " article['comments'] = comments\n",
+ " article['url'] = url\n",
+ " return article"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537874850.A.20D.html\n",
+ "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537868945.A.8A9.html\n",
+ "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537861382.A.154.html\n",
+ "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537859788.A.BE2.html\n",
+ "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537859045.A.287.html\n",
+ "Save - search_api_by_title.json\n"
+ ]
+ }
+ ],
+ "source": [
+ "data = []\n",
+ "for article_line in soup_article_list.findAll('div', class_='r-ent'):\n",
+ " title_tag = article_line.find('div', class_='title')\n",
+ " article_url = title_tag.find('a')['href']\n",
+ " article_url = urljoin(resp_article_list.url, article_url)\n",
+ " article_data = crawl_article(article_url)\n",
+ " data.append(article_data)\n",
+ "\n",
+ "with open('search_api_by_title.json', 'w+', encoding='utf-8') as f:\n",
+ " json.dump(data, f, indent=2, ensure_ascii=False)\n",
+ " print('Save - search_api_by_title.json')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}