afunTW
diff --git a/‎practice/00_image_crawling.ipynb‎
Lines changed: 131 additions & 0 deletions b/‎practice/00_image_crawling.ipynb‎
Lines changed: 131 additions & 0 deletions
diff --git a/‎practice/01_image_crawling_and_check_format.ipynb‎
Lines changed: 125 additions & 0 deletions b/‎practice/01_image_crawling_and_check_format.ipynb‎
Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,131 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 練習\n",
+ "\n",
+ "- 觀察 https://www.pexels.com/ 並撰寫爬蟲程式\n",
+ "- 下載 5 張桌布圖"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "import re\n",
+ "import os\n",
+ "\n",
+ "from bs4 import BeautifulSoup\n",
+ "from pprint import pprint\n",
+ "\n",
+ "url = 'https://www.pexels.com/'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "resp = requests.get(url)\n",
+ "soup = BeautifulSoup(resp.text, 'lxml')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['https://images.pexels.com/photos/106606/pexels-photo-106606.jpeg?h=350&auto=compress&cs=tinysrgb',\n",
+ " 'https://images.pexels.com/photos/405041/pexels-photo-405041.jpeg?h=350&auto=compress&cs=tinysrgb',\n",
+ " 'https://images.pexels.com/photos/102170/pexels-photo-102170.jpeg?h=350&auto=compress&cs=tinysrgb',\n",
+ " 'https://images.pexels.com/photos/583399/pexels-photo-583399.jpeg?h=350&auto=compress&cs=tinysrgb',\n",
+ " 'https://images.pexels.com/photos/398533/pexels-photo-398533.jpeg?h=350&auto=compress&cs=tinysrgb']\n"
+ ]
+ }
+ ],
+ "source": [
+ "article = soup.find('div', class_='photos').find_all('article', class_='photo-item')\n",
+ "imgs = [a.find('a').find('img')['src'] for a in article]\n",
+ "target = imgs[:5]\n",
+ "\n",
+ "pprint(target)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "regex catch the name pexels-photo-106606.jpeg\n",
+ "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-106606.jpeg\n",
+ "regex catch the name pexels-photo-405041.jpeg\n",
+ "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-405041.jpeg\n",
+ "regex catch the name pexels-photo-102170.jpeg\n",
+ "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-102170.jpeg\n",
+ "regex catch the name pexels-photo-583399.jpeg\n",
+ "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-583399.jpeg\n",
+ "regex catch the name pexels-photo-398533.jpeg\n",
+ "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-398533.jpeg\n"
+ ]
+ }
+ ],
+ "source": [
+ "results = os.path.abspath('../results')\n",
+ "\n",
+ "if not os.path.exists(results):\n",
+ " os.makedirs(results)\n",
+ "\n",
+ "for i in target:\n",
+ " img_resp = requests.get(i, stream=True) \n",
+ " filename = re.match(r\".*(pexels-photo-([0-9]{6})\\.jpeg).*\", i).group(1)\n",
+ " print('regex catch the name {}'.format(filename))\n",
+ " \n",
+ " filename = os.path.join(results, filename)\n",
+ "\n",
+ " with open(filename, 'wb') as f:\n",
+ " for chunk in img_resp.iter_content(2048):\n",
+ " f.write(chunk)\n",
+ " print('Save the img at {}'.format(filename))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.5.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,125 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 練習\n",
+ "\n",
+ "- 觀察 https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html 並撰寫爬蟲程式\n",
+ "- 下載 5 張圖片\n",
+ "- 以正確的圖片格式存檔"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "import os\n",
+ "\n",
+ "from PIL import Image\n",
+ "from bs4 import BeautifulSoup\n",
+ "from pprint import pprint\n",
+ "\n",
+ "url = 'https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "resp = requests.get(url)\n",
+ "soup = BeautifulSoup(resp.text, 'lxml')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "imgs = soup.find_all('img')\n",
+ "imgs = [i['src'] for i in imgs]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "catch the filename XgXT3Va.png and the real format is JPEG\n",
+ "catch the real filename XgXT3Va.jpeg\n",
+ "save image at /home/dirl/github/Python-Crawling-Tutorial/results/XgXT3Va.jpeg\n",
+ "catch the filename Q3bkStv.png and the real format is PNG\n",
+ "catch the real filename Q3bkStv.png\n",
+ "save image at /home/dirl/github/Python-Crawling-Tutorial/results/Q3bkStv.png\n",
+ "catch the filename IDPxvSl.jpg and the real format is PNG\n",
+ "catch the real filename IDPxvSl.png\n",
+ "save image at /home/dirl/github/Python-Crawling-Tutorial/results/IDPxvSl.png\n",
+ "catch the filename ZEhBDs6.png and the real format is PNG\n",
+ "catch the real filename ZEhBDs6.png\n",
+ "save image at /home/dirl/github/Python-Crawling-Tutorial/results/ZEhBDs6.png\n",
+ "catch the filename UKxK6FZ.gif and the real format is PNG\n",
+ "catch the real filename UKxK6FZ.png\n",
+ "save image at /home/dirl/github/Python-Crawling-Tutorial/results/UKxK6FZ.png\n"
+ ]
+ }
+ ],
+ "source": [
+ "results = os.path.abspath('../results')\n",
+ "if not os.path.exists(results):\n",
+ " os.makedirs(results)\n",
+ "\n",
+ "for i in imgs:\n",
+ " img_resp = requests.get(i, stream=True)\n",
+ " image = Image.open(img_resp.raw)\n",
+ " filename = os.path.basename(i)\n",
+ " print('catch the filename {} and the real format is {}'.format(filename, image.format))\n",
+ " \n",
+ " real_filename = '{}.{}'.format(\n",
+ " filename.split('.')[0],\n",
+ " image.format.lower()\n",
+ " )\n",
+ " save_filename = os.path.join(results, real_filename)\n",
+ " print('catch the real filename {}'.format(real_filename))\n",
+ " \n",
+ " image.save(save_filename)\n",
+ " print('save image at {}'.format(save_filename))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.5.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}