# -*- coding: utf-8 -*-

"""
python_spider.py by xianhu
"""

import urllib.error
import urllib.parse
import urllib.request
import http.cookiejar


# The simplest way: urlopen() fetches a URL directly
response = urllib.request.urlopen("http://www.baidu.com", timeout=10)
html = response.read().decode("utf-8")


# Using the Request class
request = urllib.request.Request("http://www.baidu.com/")
response = urllib.request.urlopen(request, timeout=10)


# Send data by adding a data parameter to urlopen() or Request()
url = "http://localhost/login.php"
data = urllib.parse.urlencode({"act": "login", "email": "xianhu@qq.com", "password": "123456"})
request1 = urllib.request.Request(url, data=data.encode("utf-8"))  # POST: data must be bytes in Python 3
request2 = urllib.request.Request(url + "?%s" % data)  # GET: append the query string to the URL
response = urllib.request.urlopen(request1, timeout=10)


# Send headers by adding a headers parameter to urlopen() or Request()
headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
request = urllib.request.Request(url, data=data.encode("utf-8"), headers=headers)  # pass headers via the constructor
request.add_header("Referer", "http://www.baidu.com")  # add_header() is another way to set a header
response = urllib.request.urlopen(request, timeout=10)


# Defeating "hotlink protection": such a check verifies that the Referer header points at the
# site's own domain, so setting the Referer to that site is enough to pass it
headers = {"Referer": "http://www.baidu.com/"}
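
# Minimal sketch of sending that Referer (the image URL below is hypothetical):
request = urllib.request.Request("http://www.baidu.com/img/example.png", headers=headers)
response = urllib.request.urlopen(request, timeout=10)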


# Exceptions raised: urllib.error.HTTPError and urllib.error.URLError (HTTPError subclasses URLError)
try:
    urllib.request.urlopen(request, timeout=10)
except urllib.error.HTTPError as e:  # catch the subclass first
    print(e.code, e.reason)
except urllib.error.URLError as e:  # e.g. DNS failure or connection refused
    print(e.reason)


# Use a proxy to avoid IP bans or per-IP rate limits
# (an "http" mapping only proxies http:// URLs; add an "https" key to proxy https:// URLs)
proxy = urllib.request.ProxyHandler({"http": "111.123.76.12:8080"})

opener = urllib.request.build_opener(proxy)  # build an opener (an OpenerDirector instance) that uses the proxy
response = opener.open("http://www.baidu.com/")  # open the URL directly with the opener instance

urllib.request.install_opener(opener)  # install the opener globally, then fetch with plain urlopen()
response = urllib.request.urlopen("http://www.baidu.com/")


# Grab an image from a page (the same works for any file on the web): right-click the image,
# copy the address from its properties, then save the bytes to disk
url = "http://ww3.sinaimg.cn/large/7d742c99tw1ee7dac2766j204q04qmxq.jpg"
response = urllib.request.urlopen(url, timeout=120)
with open("test.jpg", "wb") as file_img:
    file_img.write(response.read())
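
# Alternative sketch: the legacy stdlib helper urllib.request.urlretrieve streams the URL
# straight to a file ("test2.jpg" is an arbitrary local filename)
urllib.request.urlretrieve(url, "test2.jpg")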


# Using cookies via http.cookiejar: a CookieJar collects cookies set by responses
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookiejar=cookie_jar))
response = opener.open("https://www.baidu.com/")
for cookie in cookie_jar:
    print(cookie)
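
# Sketch of persisting cookies between runs with the stdlib MozillaCookieJar
# ("cookies.txt" is an assumed, writable path)
cookie_jar2 = http.cookiejar.MozillaCookieJar("cookies.txt")
opener2 = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookiejar=cookie_jar2))
opener2.open("https://www.baidu.com/")
cookie_jar2.save(ignore_discard=True, ignore_expires=True)  # write cookies to disk
# later runs: cookie_jar2.load("cookies.txt", ignore_discard=True, ignore_expires=True)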


# Send cookies captured from a browser, two ways:
# (1) put them directly into the headers; (2) build a Cookie object and add it to the CookieJar
headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
           "Cookie": "PHPSESSID=btqkg9amjrtoeev8coq0m78396; USERINFO=n6nxTHTY%2BJA39z6CpNB4eKN8f0KsYLjAQTwPe%2BhLHLruEbjaeh4ulhWAS5RysUM%2B; "}
request = urllib.request.Request(url, headers=headers)

# http.cookiejar.Cookie() has no default arguments, so every field must be supplied
cookie = http.cookiejar.Cookie(version=0, name="xx", value="xx", port=None, port_specified=False,
                               domain="xx", domain_specified=True, domain_initial_dot=False,
                               path="/", path_specified=True, secure=False, expires=None,
                               discard=True, comment=None, comment_url=None, rest={})
cookie_jar.set_cookie(cookie)
response = opener.open("https://www.baidu.com/")


# Use a proxy and a CookieJar at the same time
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookiejar=cookie_jar))
opener.add_handler(urllib.request.ProxyHandler(proxies={"http": "http://www.example.com:8888/"}))  # add a second handler
response = opener.open("http://www.baidu.com/")
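
# Equivalent sketch: build_opener() accepts multiple handlers in one call
opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cookiejar=cookie_jar),
    urllib.request.ProxyHandler(proxies={"http": "http://www.example.com:8888/"}),
)
response = opener.open("http://www.baidu.com/")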