Skip to content

Commit e7a10a8

Browse files
committed
first
1 parent eff6905 commit e7a10a8

File tree

1 file changed

+59
-0
lines changed

1 file changed

+59
-0
lines changed
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# Download album images from Metal Kingdom by following each album's internal link
2+
3+
from urllib.request import urlopen
4+
from bs4 import BeautifulSoup
5+
import urllib.request
6+
7+
# Site root, prepended to the site-relative links found in the pages.
mainURL = "https://www.metalkingdom.net"
# Listing page whose table cells are scraped further down in this script.
URL = "https://www.metalkingdom.net/top-albums/?start=1"

# Work around pages that refuse crawlers: send a browser-like User-Agent
# header with every request made by this script.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
req = urllib.request.Request(url=URL, headers=headers)

# Running index into `order`; advanced once per image that is saved.
count = 0

# Parse the listing page while the response is open, then release the
# connection instead of leaving it open for the rest of the run.
with urlopen(req) as html:
    bs = BeautifulSoup(html, "html.parser")
19+
20+
def imgDownload(img_url, img_num):
    """Download one image and save it as ``images/<rank>_<name>``.

    The file name is the last path segment of *img_url* with everything up to
    and including its first ``-`` dropped, and with the characters ``:[]%?``
    removed.  ``<rank>`` is ``order[img_num]`` from the module-level list.

    Args:
        img_url: Absolute URL of the image to fetch.
        img_num: Index into the module-level ``order`` list.

    Raises:
        urllib.error.URLError: If the image cannot be fetched.
        IndexError: If ``img_num`` is out of range for ``order``.
    """
    import os

    imgName = img_url.split('/')[-1]
    # Strip the first dash-separated segment.  partition() cuts at the first
    # dash itself, so a name without any dash is kept whole (no IndexError)
    # and the cut cannot land on an earlier occurrence of the second segment.
    _, _, remainder = imgName.partition('-')
    if remainder:
        imgName = remainder
    newimgName = "".join(c for c in imgName if c not in ':[]%?')

    # Read the whole body and close the connection before touching the disk,
    # so a failed download does not leave an empty file behind.
    imageReq = urllib.request.Request(url=img_url, headers=headers)
    with urllib.request.urlopen(imageReq) as imageRes:
        data = imageRes.read()

    # Make sure the target directory exists on the first write.
    os.makedirs('images', exist_ok=True)
    with open('images/' + order[img_num] + '_' + newimgName, 'wb') as fd:
        fd.write(data)
31+
32+
def innerLinkConnection(new_url):
    """Open an album page and download the image(s) it points to.

    For every ``<div id="album-info-left-1">`` on the page, the ``href`` of
    the first ``<a>`` inside it is downloaded.  If that attempt fails, the
    ``src`` of the first ``<img>`` inside the div is downloaded instead.  The
    module-level ``count`` is passed to ``imgDownload`` as the index and is
    advanced by one after each image that is saved.

    Args:
        new_url: Absolute URL of the album page.

    Raises:
        urllib.error.URLError: If the album page cannot be fetched.
        Exception: Whatever the fallback download raises; it is not retried.
    """
    global count

    innerReq = urllib.request.Request(url=new_url, headers=headers)
    # Parse while the response is open, then release the connection.
    with urlopen(innerReq) as page:
        bs2 = BeautifulSoup(page, "html.parser")

    for td2 in bs2.findAll('div', {'id': 'album-info-left-1'}):
        try:
            imgURL = mainURL + td2.find('a').attrs['href']
            imgDownload(imgURL, count)
        except Exception:
            # Best-effort fallback: a missing link, a missing attribute or a
            # failed download all fall back to the inline image.  Catching
            # Exception rather than everything lets Ctrl-C and SystemExit
            # stop the script instead of triggering the fallback.
            imgURL = mainURL + td2.find('img').attrs['src']
            imgDownload(imgURL, count)
        # Reached only when one of the two downloads succeeded.
        count += 1
48+
49+
# File-name prefixes: the text of every <td class="c1"> cell on the listing
# page, left-padded with zeros to at least three characters.
order = [cell.get_text().zfill(3) for cell in bs.find_all('td', {'class': 'c1'})]

# Follow the first link inside every <td class="c2"> cell to its album page
# and download the image found there.
for td in bs.find_all('td', {'class': 'c2'}):
    newURL = mainURL + td.find('a')['href']
    innerLinkConnection(newURL)

0 commit comments

Comments
 (0)