Skip to content

Commit 7f288ee

Browse files
committed
一小时入门网络爬虫系列代码
1 parent 5e197a9 commit 7f288ee

File tree

3 files changed

+244
-0
lines changed

3 files changed

+244
-0
lines changed

one_hour_spider/biqukan.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# -*- coding:UTF-8 -*-
2+
from bs4 import BeautifulSoup
3+
import requests, sys
4+
5+
"""
6+
类说明:下载《笔趣看》网小说《一念永恒》
7+
Parameters:
8+
9+
Returns:
10+
11+
Modify:
12+
2017-09-13
13+
"""
14+
class downloader(object):
    """Download the novel "Yi Nian Yong Heng" from the biqukan.com site.

    Workflow: call get_download_url() to collect chapter titles and links,
    then get_contents() for each chapter, and writer() to append the text
    to an output file.

    Modify:
        2017-09-13
    """

    def __init__(self):
        # Site root; chapter hrefs in the listing are relative to this.
        self.server = 'http://www.biqukan.com/'
        # Table-of-contents page of the target novel.
        self.target = 'http://www.biqukan.com/1_1094/'
        self.names = []  # chapter titles
        self.urls = []   # absolute chapter URLs
        self.nums = 0    # number of chapters

    def get_download_url(self):
        """Fetch the table of contents and populate names/urls/nums.

        The first 15 anchors in the 'listmain' div are non-chapter links
        (site chrome / "latest chapters" duplicates), so they are skipped.
        """
        req = requests.get(url=self.target)
        html = req.text
        # Explicit parser: avoids bs4's "no parser specified" warning and
        # keeps parsing deterministic across environments.
        div_bf = BeautifulSoup(html, 'html.parser')
        div = div_bf.find_all('div', class_='listmain')
        a_bf = BeautifulSoup(str(div[0]), 'html.parser')
        a = a_bf.find_all('a')
        self.nums = len(a[15:])  # drop the unwanted entries, count the rest
        for each in a[15:]:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_contents(self, target):
        """Fetch one chapter page and return its body text.

        Parameters:
            target - chapter download URL (string)

        Returns:
            texts - chapter content (string)
        """
        req = requests.get(url=target)
        html = req.text
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('div', class_='showtxt')
        # The site indents paragraphs with runs of 8 non-breaking spaces;
        # replace each run with a blank line between paragraphs.
        texts = texts[0].text.replace('\xa0' * 8, '\n\n')
        return texts

    def writer(self, name, path, text):
        """Append one chapter to the output file.

        Parameters:
            name - chapter title (string)
            path - output file path (string)
            text - chapter content (string)
        """
        # (Removed an unused 'write_flag' local present in the original.)
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')
if __name__ == "__main__":
    # Novel-download driver, kept for reference but disabled:
    # dl = downloader()
    # dl.get_download_url()
    # print('《一念永恒》开始下载:')
    # for i in range(dl.nums):
    #     dl.writer(dl.names[i], '一念永恒.txt', dl.get_contents(dl.urls[i]))
    #     sys.stdout.write("  已下载:%.3f%%" % float(i/dl.nums) + '\r')
    #     sys.stdout.flush()
    # print('《一念永恒》下载完成')

    # Demo: fetch the unsplash.com front page with a browser User-Agent
    # and dump the raw HTML to stdout.
    target_url = 'https://unsplash.com/'
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
    }
    response = requests.get(url=target_url, headers=request_headers)
    print(response.text)

one_hour_spider/unsplash.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# -*- coding:UTF-8 -*-
2+
import requests, json, time, sys
3+
from contextlib import closing
4+
5+
class get_photos(object):
    """Scrape photo ids from unsplash.com's home feed API and download them.

    Modify:
        2017-09-13
    """

    def __init__(self):
        self.photos_id = []  # collected photo id strings
        # Download endpoint template; 'xxx' is replaced by the photo id.
        # NOTE(review): 'force=trues' looks like a typo for 'force=true' —
        # kept byte-identical to preserve behavior; confirm against the API.
        self.download_server = 'https://unsplash.com/photos/xxx/download?force=trues'
        # Paginated JSON feed; each response carries a 'next_page' URL.
        self.target = 'http://unsplash.com/napi/feeds/home'
        # Client-ID token observed in the site's own requests.
        self.headers = {'authorization':'Client-ID c94869b36aa272dd62dfaeefed769d4115fb3189a9d1ec88ed457207747be626'}

    def get_ids(self, pages=6):
        """Collect photo ids from the home feed into self.photos_id.

        Parameters:
            pages - total number of feed pages to fetch (default 6, matching
                    the original's first page plus five follow-up pages)
        """
        next_page = self.target
        for _ in range(pages):
            # NOTE(review): verify=False disables TLS certificate checks;
            # kept to preserve behavior, but enabling verification is safer.
            req = requests.get(url=next_page, headers=self.headers, verify=False)
            html = json.loads(req.text)
            next_page = html['next_page']
            for each in html['photos']:
                self.photos_id.append(each['id'])
            time.sleep(1)  # throttle: be polite to the server

    def download(self, photo_id, filename):
        """Stream one photo to '<filename>.jpg' in the working directory.

        Parameters:
            photo_id - photo id string from get_ids()
            filename - integer used to name the output file
        """
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'}
        target = self.download_server.replace('xxx', photo_id)
        # Bug fix: the browser User-Agent headers built above were never
        # sent (the original request reused self.headers instead).
        with closing(requests.get(url=target, stream=True, verify=False, headers=headers)) as r:
            with open('%d.jpg' % filename, 'ab+') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
                        f.flush()
if __name__ == '__main__':
    client = get_photos()
    print('获取图片连接中:')
    client.get_ids()
    print('图片下载中:')
    # Number files from 1 upward, one per collected photo id.
    for index, photo in enumerate(client.photos_id, start=1):
        print(' 正在下载第%d张图片' % index)
        client.download(photo, index)
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#-*- coding:UTF-8 -*-
2+
import requests,re, json, sys
3+
from bs4 import BeautifulSoup
4+
from urllib import request
5+
6+
class video_downloader():
    """Download an iqiyi video by resolving it through the xfsub.com API.

    Modify:
        2017-09-18
    """

    def __init__(self, url):
        self.server = 'http://api.xfsub.com'
        self.api = 'http://api.xfsub.com/xfsub_api/?url='
        self.get_url_api = 'http://api.xfsub.com/xfsub_api/url.php'
        # Drop the '#...' fragment; the API only needs the base page URL.
        self.url = url.split('#')[0]
        self.target = self.api + self.url
        # One session so cookies persist between get_key() and get_url().
        self.s = requests.session()

    def get_key(self):
        """Fetch the API page and extract key/time/url parameters.

        Stores the parsed parameter dict in self.info.
        """
        req = self.s.get(url=self.target)
        req.encoding = 'utf-8'
        # The page embeds a JS call like: ..."url.php", {...}, — capture the
        # JSON object argument with a regex and parse it.
        self.info = json.loads(re.findall('"url.php",\ (.+),', req.text)[0])

    def get_url(self):
        """Resolve and return the direct video address.

        Returns:
            video_url - video file address (string)
        """
        data = {'time': self.info['time'],
                'key': self.info['key'],
                'url': self.info['url'],
                'type': ''}
        req = self.s.post(url=self.get_url_api, data=data)
        url = self.server + json.loads(req.text)['url']
        req = self.s.get(url)
        # The response is an XML playlist, so parse it as XML.
        # NOTE(review): the 'xml' feature requires lxml to be installed.
        bf = BeautifulSoup(req.text, 'xml')
        video_url = bf.find('file').string  # <file> holds the video address
        return video_url

    def Schedule(self, a, b, c):
        """Reporthook callback for urlretrieve: print download progress.

        Parameters:
            a - number of blocks transferred so far
            b - block size in bytes
            c - total file size in bytes
        """
        per = 100.0 * a * b / c
        # Bug fix: the final callback can overshoot 100% because a*b rounds
        # up to whole blocks; clamp to 100. (The original reset it to 1,
        # so a finished download displayed "1.00%".)
        if per > 100:
            per = 100
        sys.stdout.write(" " + "%.2f%% 已经下载的大小:%ld 文件大小:%ld" % (per, a * b, c) + '\r')
        sys.stdout.flush()

    def video_download(self, url, filename):
        """Download the video, reporting progress via Schedule.

        Parameters:
            url - video address (string)
            filename - output file name (string)
        """
        request.urlretrieve(url=url, filename=filename, reporthook=self.Schedule)
if __name__ == '__main__':
    # Target video page (fragment after '#' is stripped by the downloader).
    page_url = 'http://www.iqiyi.com/v_19rr7qhfg0.html#vfrm=19-9-0-1'
    vd = video_downloader(page_url)
    title = '加勒比海盗5'
    print('%s下载中:' % title)
    vd.get_key()
    movie_url = vd.get_url()
    print(' 获取地址成功:%s' % movie_url)
    vd.video_download(movie_url, title + '.mp4')
    print('\n下载完成!')

0 commit comments

Comments
 (0)