Skip to content

Commit 454510b

Browse files
committed
add: practice answer
1 parent acf3dfc commit 454510b

File tree

2 files changed

+256
-0
lines changed

2 files changed

+256
-0
lines changed

practice/00_image_crawling.ipynb

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# 練習\n",
8+
"\n",
9+
"- 觀察 https://www.pexels.com/ 並撰寫爬蟲程式\n",
10+
"- 下載 5 張桌布圖"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": 1,
16+
"metadata": {
17+
"collapsed": true
18+
},
19+
"outputs": [],
20+
"source": [
21+
"import requests\n",
22+
"import re\n",
23+
"import os\n",
24+
"\n",
25+
"from bs4 import BeautifulSoup\n",
26+
"from pprint import pprint\n",
27+
"\n",
28+
"url = 'https://www.pexels.com/'"
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": 2,
34+
"metadata": {
35+
"collapsed": true
36+
},
37+
"outputs": [],
38+
"source": [
39+
"resp = requests.get(url)\n",
40+
"soup = BeautifulSoup(resp.text, 'lxml')"
41+
]
42+
},
43+
{
44+
"cell_type": "code",
45+
"execution_count": 3,
46+
"metadata": {},
47+
"outputs": [
48+
{
49+
"name": "stdout",
50+
"output_type": "stream",
51+
"text": [
52+
"['https://images.pexels.com/photos/106606/pexels-photo-106606.jpeg?h=350&auto=compress&cs=tinysrgb',\n",
53+
" 'https://images.pexels.com/photos/405041/pexels-photo-405041.jpeg?h=350&auto=compress&cs=tinysrgb',\n",
54+
" 'https://images.pexels.com/photos/102170/pexels-photo-102170.jpeg?h=350&auto=compress&cs=tinysrgb',\n",
55+
" 'https://images.pexels.com/photos/583399/pexels-photo-583399.jpeg?h=350&auto=compress&cs=tinysrgb',\n",
56+
" 'https://images.pexels.com/photos/398533/pexels-photo-398533.jpeg?h=350&auto=compress&cs=tinysrgb']\n"
57+
]
58+
}
59+
],
60+
"source": [
61+
"article = soup.find('div', class_='photos').find_all('article', class_='photo-item')\n",
62+
"imgs = [a.find('a').find('img')['src'] for a in article]\n",
63+
"target = imgs[:5]\n",
64+
"\n",
65+
"pprint(target)"
66+
]
67+
},
68+
{
69+
"cell_type": "code",
70+
"execution_count": 4,
71+
"metadata": {},
72+
"outputs": [
73+
{
74+
"name": "stdout",
75+
"output_type": "stream",
76+
"text": [
77+
"regex catch the name pexels-photo-106606.jpeg\n",
78+
"Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-106606.jpeg\n",
79+
"regex catch the name pexels-photo-405041.jpeg\n",
80+
"Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-405041.jpeg\n",
81+
"regex catch the name pexels-photo-102170.jpeg\n",
82+
"Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-102170.jpeg\n",
83+
"regex catch the name pexels-photo-583399.jpeg\n",
84+
"Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-583399.jpeg\n",
85+
"regex catch the name pexels-photo-398533.jpeg\n",
86+
"Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-398533.jpeg\n"
87+
]
88+
}
89+
],
90+
"source": [
91+
"results = os.path.abspath('../results')\n",
92+
"\n",
93+
"if not os.path.exists(results):\n",
94+
" os.makedirs(results)\n",
95+
"\n",
96+
"for i in target:\n",
97+
" img_resp = requests.get(i, stream=True) \n",
98+
" filename = re.match(r\".*(pexels-photo-([0-9]{6})\\.jpeg).*\", i).group(1)\n",
99+
" print('regex catch the name {}'.format(filename))\n",
100+
" \n",
101+
" filename = os.path.join(results, filename)\n",
102+
"\n",
103+
" with open(filename, 'wb') as f:\n",
104+
" for chunk in img_resp.iter_content(2048):\n",
105+
" f.write(chunk)\n",
106+
" print('Save the img at {}'.format(filename))"
107+
]
108+
}
109+
],
110+
"metadata": {
111+
"kernelspec": {
112+
"display_name": "Python 3",
113+
"language": "python",
114+
"name": "python3"
115+
},
116+
"language_info": {
117+
"codemirror_mode": {
118+
"name": "ipython",
119+
"version": 3
120+
},
121+
"file_extension": ".py",
122+
"mimetype": "text/x-python",
123+
"name": "python",
124+
"nbconvert_exporter": "python",
125+
"pygments_lexer": "ipython3",
126+
"version": "3.5.2"
127+
}
128+
},
129+
"nbformat": 4,
130+
"nbformat_minor": 2
131+
}
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# 練習\n",
8+
"\n",
9+
"- 觀察 https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html 並撰寫爬蟲程式\n",
10+
"- 下載 5 張圖片\n",
11+
"- 以正確的圖片格式存檔"
12+
]
13+
},
14+
{
15+
"cell_type": "code",
16+
"execution_count": 1,
17+
"metadata": {
18+
"collapsed": true
19+
},
20+
"outputs": [],
21+
"source": [
22+
"import requests\n",
23+
"import os\n",
24+
"\n",
25+
"from PIL import Image\n",
26+
"from bs4 import BeautifulSoup\n",
27+
"from pprint import pprint\n",
28+
"\n",
29+
"url = 'https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html'"
30+
]
31+
},
32+
{
33+
"cell_type": "code",
34+
"execution_count": 2,
35+
"metadata": {
36+
"collapsed": true
37+
},
38+
"outputs": [],
39+
"source": [
40+
"resp = requests.get(url)\n",
41+
"soup = BeautifulSoup(resp.text, 'lxml')"
42+
]
43+
},
44+
{
45+
"cell_type": "code",
46+
"execution_count": 3,
47+
"metadata": {},
48+
"outputs": [],
49+
"source": [
50+
"imgs = soup.find_all('img')\n",
51+
"imgs = [i['src'] for i in imgs]"
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": 4,
57+
"metadata": {},
58+
"outputs": [
59+
{
60+
"name": "stdout",
61+
"output_type": "stream",
62+
"text": [
63+
"catch the filename XgXT3Va.png and the real format is JPEG\n",
64+
"catch the real filename XgXT3Va.jpeg\n",
65+
"save image at /home/dirl/github/Python-Crawling-Tutorial/results/XgXT3Va.jpeg\n",
66+
"catch the filename Q3bkStv.png and the real format is PNG\n",
67+
"catch the real filename Q3bkStv.png\n",
68+
"save image at /home/dirl/github/Python-Crawling-Tutorial/results/Q3bkStv.png\n",
69+
"catch the filename IDPxvSl.jpg and the real format is PNG\n",
70+
"catch the real filename IDPxvSl.png\n",
71+
"save image at /home/dirl/github/Python-Crawling-Tutorial/results/IDPxvSl.png\n",
72+
"catch the filename ZEhBDs6.png and the real format is PNG\n",
73+
"catch the real filename ZEhBDs6.png\n",
74+
"save image at /home/dirl/github/Python-Crawling-Tutorial/results/ZEhBDs6.png\n",
75+
"catch the filename UKxK6FZ.gif and the real format is PNG\n",
76+
"catch the real filename UKxK6FZ.png\n",
77+
"save image at /home/dirl/github/Python-Crawling-Tutorial/results/UKxK6FZ.png\n"
78+
]
79+
}
80+
],
81+
"source": [
82+
"results = os.path.abspath('../results')\n",
83+
"if not os.path.exists(results):\n",
84+
" os.makedirs(results)\n",
85+
"\n",
86+
"for i in imgs:\n",
87+
" img_resp = requests.get(i, stream=True)\n",
88+
" image = Image.open(img_resp.raw)\n",
89+
" filename = os.path.basename(i)\n",
90+
" print('catch the filename {} and the real format is {}'.format(filename, image.format))\n",
91+
" \n",
92+
" real_filename = '{}.{}'.format(\n",
93+
" filename.split('.')[0],\n",
94+
" image.format.lower()\n",
95+
" )\n",
96+
" save_filename = os.path.join(results, real_filename)\n",
97+
" print('catch the real filename {}'.format(real_filename))\n",
98+
" \n",
99+
" image.save(save_filename)\n",
100+
" print('save image at {}'.format(save_filename))"
101+
]
102+
}
103+
],
104+
"metadata": {
105+
"kernelspec": {
106+
"display_name": "Python 3",
107+
"language": "python",
108+
"name": "python3"
109+
},
110+
"language_info": {
111+
"codemirror_mode": {
112+
"name": "ipython",
113+
"version": 3
114+
},
115+
"file_extension": ".py",
116+
"mimetype": "text/x-python",
117+
"name": "python",
118+
"nbconvert_exporter": "python",
119+
"pygments_lexer": "ipython3",
120+
"version": "3.5.2"
121+
}
122+
},
123+
"nbformat": 4,
124+
"nbformat_minor": 2
125+
}

0 commit comments

Comments
 (0)