# Download album cover images from Metal Kingdom by following the internal links of its chart page.
2+ 
3+ from  urllib .request  import  urlopen 
4+ from  bs4  import  BeautifulSoup 
5+ import  urllib .request 
6+ 
# Site root; prepended to the site-relative links scraped from the pages.
mainURL = "https://www.metalkingdom.net"
# Chart page whose table cells are crawled below.
URL = "https://www.metalkingdom.net/top-albums/?start=1"

### Handling for pages that block crawling ########################################################
# Every request in this script is sent with these browser-like headers.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
req = urllib.request.Request(url=URL, headers=headers)
####################################################################################################

# Index into `order` for the next image to save; advanced by
# innerLinkConnection after each successful download.
count = 0

# Parse the chart page inside a `with` block so the HTTP response is closed
# as soon as BeautifulSoup has consumed it (it was previously left open).
with urlopen(req) as html:
    bs = BeautifulSoup(html, "html.parser")
19+ 
def imgDownload(img_url, img_num):
    """Download one image and store it in the ``images/`` directory.

    The file is written to ``images/<order[img_num]>_<name>``, where
    ``<name>`` is the last path segment of ``img_url`` with everything up to
    and including its first ``-`` removed and the characters ``:[]%?``
    dropped. A name without any ``-`` is kept whole.

    Args:
        img_url: Absolute URL of the image to fetch.
        img_num: Index into the module-level ``order`` list; the selected
            entry becomes the file-name prefix.

    Raises:
        IndexError: If ``img_num`` is out of range for ``order``.
        urllib.error.URLError: If the image cannot be fetched.
        OSError: If the target file cannot be written.
    """
    # Function-scope import keeps this block self-contained; the file's
    # import section does not bring in `os`.
    import os

    imgName = img_url.split('/')[-1]

    # Strip the leading "<prefix>-" part. partition() cuts at the first '-'
    # itself; the previous find()-based lookup searched for the text of the
    # second segment and cut at the wrong place whenever that text also
    # appeared earlier in the name, and raised IndexError when the name
    # contained no '-' at all.
    _, dash, remainder = imgName.partition('-')
    if dash:
        imgName = remainder

    # Drop the same set of characters the script has always removed.
    newimgName = imgName.translate(str.maketrans('', '', ':[]%?'))
    target = 'images/' + order[img_num] + '_' + newimgName

    # Make sure the output directory exists so open() does not fail on a
    # fresh checkout.
    os.makedirs('images', exist_ok=True)

    imageReq = urllib.request.Request(url=img_url, headers=headers)
    # The response is opened first, so a failed fetch never leaves an empty
    # file behind; both handles are closed on exit.
    with urllib.request.urlopen(imageReq) as imageRes, open(target, 'wb') as fd:
        fd.write(imageRes.read())
31+ 
def innerLinkConnection(new_url):
    """Open an inner page and download the image(s) it links to.

    For every ``<div id="album-info-left-1">`` on the page, the ``href`` of
    its first ``<a>`` is tried first; if that attempt fails for any ordinary
    reason (no link present, download error, ...), the ``src`` of its first
    ``<img>`` is used instead. Each saved image advances the module-level
    ``count``, which ``imgDownload`` uses as the index into ``order``.

    Args:
        new_url: Absolute URL of the page to open.

    Raises:
        Exception: Whatever the fallback attempt raises is propagated
            unchanged (e.g. ``AttributeError`` when the div has no ``<img>``).
    """
    global count

    innerReq = urllib.request.Request(url=new_url, headers=headers)
    # Close the HTTP response once the page has been parsed.
    with urlopen(innerReq) as response:
        bs2 = BeautifulSoup(response, "html.parser")

    for td2 in bs2.findAll('div', {'id': 'album-info-left-1'}):
        try:
            imgURL = mainURL + td2.find('a').attrs['href']
            imgDownload(imgURL, count)
        except Exception:
            # `except Exception` rather than a bare `except:` so that
            # KeyboardInterrupt / SystemExit stop the crawl instead of
            # silently triggering the fallback download.
            imgURL = mainURL + td2.find('img').attrs['src']
            imgDownload(imgURL, count)
        # Reached only when one of the two downloads succeeded.
        count = count + 1
48+ 
# File-name prefixes: the text of every 'c1' cell on the chart page,
# left-padded with zeros to at least three characters.
order = [cell.get_text().zfill(3) for cell in bs.find_all('td', class_='c1')]

# Visit the page linked from each 'c2' cell and download its image(s).
for td in bs.find_all('td', class_='c2'):
    link = td.find('a')
    newURL = mainURL + link['href']
    innerLinkConnection(newURL)
0 commit comments