{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Crawl all of yesterday's articles\n",
    "\n",
    "https://www.ptt.cc/bbs/Gossiping/index.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import re\n",
    "import json\n",
    "\n",
    "from bs4 import BeautifulSoup, NavigableString\n",
    "from datetime import datetime, timedelta\n",
    "from pprint import pprint\n",
    "from urllib.parse import urljoin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "09/25\n"
     ]
    }
   ],
   "source": [
    "# PTT index pages show article dates as MM/DD\n",
    "base_url = 'https://www.ptt.cc/bbs/Gossiping/index.html'\n",
    "ptt_yesterday = datetime.now() - timedelta(days=1)\n",
    "ptt_yesterday_str = ptt_yesterday.strftime('%m/%d')\n",
    "print(ptt_yesterday_str)"
   ]
  },
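  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`datetime.now()` uses the machine's local clock, so a notebook running outside Taiwan can compute a different yesterday than PTT (UTC+8) would. A minimal sketch that pins the calculation to UTC+8 instead:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import timezone\n",
    "\n",
    "# compute yesterday in Taipei time (UTC+8) regardless of the local clock\n",
    "taipei_yesterday = datetime.now(timezone(timedelta(hours=8))) - timedelta(days=1)\n",
    "print(taipei_yesterday.strftime('%m/%d'))"
   ]
  },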
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get the total page count\n",
    "\n",
    "Read the page number of page n-1 from the previous-page button in the HTML, then add one to it to get the total page count."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "resp_base = requests.get(base_url, cookies={'over18': '1'})\n",
    "assert resp_base.status_code == 200\n",
    "soup_base = BeautifulSoup(resp_base.text, 'lxml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total page = 39219\n"
     ]
    }
   ],
   "source": [
    "# the '‹ 上頁' (previous page) button links to index{n-1}.html,\n",
    "# so the newest page number is that value plus one\n",
    "paging_tag = soup_base.find(class_='btn-group-paging')\n",
    "total_page = None\n",
    "for btn_tag in paging_tag.findAll('a'):\n",
    "    if btn_tag.text == '‹ 上頁':\n",
    "        compile_page = re.search('(\\\\d+)', btn_tag['href'])\n",
    "        if compile_page:\n",
    "            total_page = int(compile_page.group(0)) + 1\n",
    "print('total page =', total_page)"
   ]
  },
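  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If PTT ever changes its markup, the loop above silently leaves `total_page` as `None`, so it is worth failing fast before starting a long crawl:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# stop early with a clear message instead of a TypeError later on\n",
    "assert total_page is not None, 'could not parse the page number from the previous-page button'"
   ]
  },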
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Walk back through the pages, check dates, and crawl articles\n",
    "\n",
    "The page with the oldest articles has page number 1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def crawl_article(url):\n",
    "    resp = requests.get(url, cookies={'over18': '1'})\n",
    "    if resp.status_code != 200:\n",
    "        return\n",
    "    soup = BeautifulSoup(resp.text, 'lxml')\n",
    "    print('Start crawling', url)\n",
    "\n",
    "    # ##############################\n",
    "    # crawl article\n",
    "    # ##############################\n",
    "    article = {\n",
    "        'author_id': '',\n",
    "        'author_nickname': '',\n",
    "        'title': '',\n",
    "        'timestamp': '',\n",
    "        'contents': '',\n",
    "        'ip': ''\n",
    "    }\n",
    "    article_body = soup.find(id='main-content')\n",
    "\n",
    "    # article header\n",
    "    article_head = article_body.findAll('div', class_='article-metaline')\n",
    "    for metaline in article_head:\n",
    "        meta_tag = metaline.find(class_='article-meta-tag').text\n",
    "        meta_value = metaline.find(class_='article-meta-value').text\n",
    "        if meta_tag == '作者':\n",
    "            compile_nickname = re.compile('\\\\((.*)\\\\)').search(meta_value)\n",
    "            article['author_id'] = meta_value.split('(')[0].strip(' ')\n",
    "            article['author_nickname'] = compile_nickname.group(1) if compile_nickname else ''\n",
    "        elif meta_tag == '標題':\n",
    "            article['title'] = meta_value\n",
    "        elif meta_tag == '時間':\n",
    "            article['timestamp'] = meta_value\n",
    "\n",
    "    # article content: keep only the bare text nodes of main-content,\n",
    "    # which skips the metadata lines and push comments\n",
    "    contents = [expr for expr in article_body.contents if isinstance(expr, NavigableString)]\n",
    "    contents = [re.sub('\\\\n', '', expr) for expr in contents]\n",
    "    contents = [i for i in contents if i]\n",
    "    contents = '\\\\n'.join(contents)\n",
    "    article['contents'] = contents\n",
    "\n",
    "    # article publish ip\n",
    "    article_ip = article_body.find(class_='f2').text\n",
    "    compile_ip = re.compile('[0-9]{1,3}\\\\.[0-9]{1,3}\\\\.[0-9]{1,3}\\\\.[0-9]{1,3}').search(article_ip)\n",
    "    article['ip'] = compile_ip.group(0) if compile_ip else ''\n",
    "\n",
    "    # ##############################\n",
    "    # crawl comments\n",
    "    # ##############################\n",
    "    comments = []\n",
    "    for comment in article_body.findAll('div', class_='push'):\n",
    "        tag = comment.find(class_='push-tag').text\n",
    "        guest_id = comment.find(class_='push-userid').text\n",
    "        guest_content = comment.find(class_='push-content').text\n",
    "        guest_ipdatetime = comment.find(class_='push-ipdatetime').text\n",
    "        compile_ip = re.compile('[0-9]{1,3}\\\\.[0-9]{1,3}\\\\.[0-9]{1,3}\\\\.[0-9]{1,3}').search(guest_ipdatetime)\n",
    "        guest_ip = compile_ip.group(0) if compile_ip else ''\n",
    "        # str.replace instead of re.sub: the ip contains dots, which a regex\n",
    "        # would treat as wildcards, and it may be an empty string\n",
    "        guest_timestamp = guest_ipdatetime.replace(guest_ip, '').strip()\n",
    "        comments.append({\n",
    "            'tag': tag,\n",
    "            'id': guest_id,\n",
    "            'content': guest_content,\n",
    "            'ip': guest_ip,\n",
    "            'timestamp': guest_timestamp\n",
    "        })\n",
    "\n",
    "    article['comments'] = comments\n",
    "    article['url'] = url\n",
    "    return article"
   ]
  },
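  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before wiring `crawl_article` into the page loop, it can be smoke-tested on a single post. The URL below is only a placeholder; swap in any live article link from the index page before running."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# placeholder URL: replace with a live article link before running\n",
    "sample_url = 'https://www.ptt.cc/bbs/Gossiping/M.1537850000.A.000.html'\n",
    "sample_article = crawl_article(sample_url)\n",
    "if sample_article:\n",
    "    pprint(sample_article)\n",
    "else:\n",
    "    print('could not fetch the sample article (deleted or bad URL?)')"
   ]
  },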
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATE_GREATER = 1\n",
    "DATE_EQUAL = 0\n",
    "DATE_LESS = -1\n",
    "\n",
    "def compare_timestamp_md(src, dest):\n",
    "    \"\"\"Compare two MM/DD date strings (assumed to be in the same year).\n",
    "\n",
    "    Returns DATE_GREATER (1) if dest is later than src,\n",
    "    DATE_EQUAL (0) if they fall on the same day,\n",
    "    and DATE_LESS (-1) if dest is earlier than src.\n",
    "    \"\"\"\n",
    "    date_src = datetime.strptime(src, '%m/%d')\n",
    "    date_dest = datetime.strptime(dest, '%m/%d')\n",
    "    if date_dest > date_src:\n",
    "        return DATE_GREATER\n",
    "    elif date_dest == date_src:\n",
    "        return DATE_EQUAL\n",
    "    else:\n",
    "        return DATE_LESS"
   ]
  },
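  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of the helper. One caveat: `strptime('%m/%d')` parses both dates into the same default year, so a comparison across a year boundary (12/31 vs. 01/01) gives the wrong answer; for a yesterday-only crawl that only matters on January 1st."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(compare_timestamp_md('09/25', '09/26'))  # 1: dest is a day later\n",
    "print(compare_timestamp_md('09/25', '09/25'))  # 0: same day\n",
    "print(compare_timestamp_md('09/25', '09/24'))  # -1: dest is a day earlier"
   ]
  },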
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://www.ptt.cc/bbs/Gossiping/index39219.html - date 9/26 result 1\n"
     ]
    }
   ],
   "source": [
    "# walk from the newest page (total_page) down to the oldest (page 1)\n",
    "for page in range(total_page, 0, -1):\n",
    "    current_url = 'https://www.ptt.cc/bbs/Gossiping/index{}.html'.format(page)\n",
    "    resp_page = requests.get(current_url, cookies={'over18': '1'})\n",
    "    if resp_page.status_code != 200:\n",
    "        continue\n",
    "    soup_page = BeautifulSoup(resp_page.text, 'lxml')\n",
    "\n",
    "    # ##############################\n",
    "    # check the first article date\n",
    "    # ##############################\n",
    "    container_tag = soup_page.find('div', class_='r-list-container')\n",
    "    first_article = container_tag.find('div', class_='r-ent')\n",
    "    first_article_date = first_article.find('div', class_='date').text.strip()\n",
    "    compare_datetime = compare_timestamp_md(ptt_yesterday_str, first_article_date)\n",
    "    print('{} - date {} result {}'.format(current_url, first_article_date, compare_datetime))\n",
    "    break  # only inspect the newest page for now"
   ]
  },
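  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The pieces above can now be combined into the full crawl: walk the pages from newest to oldest, collect every article dated yesterday, and stop once a page only holds older posts. This is a minimal sketch that also puts the `json` and `urljoin` imports to use, with a few assumptions spelled out in the comments: deleted posts (title without a link) are skipped, pinned announcements below the `r-list-sep` divider are ignored so their old dates do not end the crawl early, and the output filename `gossiping_yesterday.json` is an arbitrary choice."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "yesterday_articles = []\n",
    "done = False\n",
    "\n",
    "for page in range(total_page, 0, -1):\n",
    "    current_url = 'https://www.ptt.cc/bbs/Gossiping/index{}.html'.format(page)\n",
    "    resp_page = requests.get(current_url, cookies={'over18': '1'})\n",
    "    if resp_page.status_code != 200:\n",
    "        continue\n",
    "    soup_page = BeautifulSoup(resp_page.text, 'lxml')\n",
    "    container_tag = soup_page.find('div', class_='r-list-container')\n",
    "\n",
    "    for entry in container_tag.findAll('div', class_=['r-ent', 'r-list-sep']):\n",
    "        if 'r-list-sep' in entry.get('class', []):\n",
    "            break  # everything below the separator is a pinned announcement\n",
    "        entry_date = entry.find('div', class_='date').text.strip()\n",
    "        result = compare_timestamp_md(ptt_yesterday_str, entry_date)\n",
    "        if result == DATE_GREATER:\n",
    "            continue  # newer than yesterday (today's posts): keep walking back\n",
    "        if result == DATE_LESS:\n",
    "            done = True  # older than yesterday: earlier pages are older still\n",
    "            continue\n",
    "        title_tag = entry.find('div', class_='title').find('a')\n",
    "        if title_tag is None:\n",
    "            continue  # deleted article: no link is left in the list\n",
    "        article = crawl_article(urljoin(current_url, title_tag['href']))\n",
    "        if article:\n",
    "            yesterday_articles.append(article)\n",
    "\n",
    "    if done:\n",
    "        break\n",
    "\n",
    "with open('gossiping_yesterday.json', 'w', encoding='utf-8') as f:\n",
    "    json.dump(yesterday_articles, f, ensure_ascii=False, indent=2)\n",
    "\n",
    "print('crawled {} articles'.format(len(yesterday_articles)))"
   ]
  },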
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}