# -*- coding: utf-8 -*-
-# __author__ = "Hong Nguyen Nam"
-# __copyright__ = "Copyright 2021, The Browser Clone"
+# __author__ = "Hong Nguyen Nam (Jeremy Nguyen)"
+# __copyright__ = "Copyright 2022, The Browser Clone"
# __license__ = "GPL"
-# __version__ = "1.0.0"
-# __maintainer__ = "Hong Nguyen Nam"
+# __version__ = "2.1.0"
+# __maintainer__ = "Hong Nguyen Nam (Jeremy Nguyen)"
# __email__ = "a2FpdG9raWQxNDEyLmNvbmFuQGdtYWlsLmNvbQ=="
-__path_driver__ = 'chromedriver'
+__path_driver__ = '/Users/Hacker1945/Desktop/clone-web/chromedriver'
__black_list_type__ = ['.php']
__status_code__ = [200, 404]
__clone_all__ = False
-__zip__ = False
-__clone_url__ = 'https://pixinvent.com/demo/vuexy-html-bootstrap-admin-template/html/ltr/vertical-menu-template/index.html'
+__zip__ = True
+__clone_url__ = 'https://themesbrand.com/velzon/html/default/index.html'

-
-from seleniumwire import webdriver
-from selenium.webdriver.common.keys import Keys
-import time
-from urllib.parse import urlparse
-import urllib.request
-import urllib.parse
+# https://unistudio.co/html/renox/main/index.html
+# https://themesbrand.com/velzon/html/default/index.html
import os
import os.path
-import requests
import re
+import shutil
+import time
+from urllib.parse import urlparse
+
+import requests
from bs4 import BeautifulSoup
+from seleniumwire import webdriver
from tqdm import tqdm
from zipfile36 import ZipFile
-import shutil


-class File():
+class File:
    info_url = ''
+
    def __init__(self, url):
        self.url = url
        self.info_url = self.extract_info_url(url, True)
        self.check_exists(url)

-
-    def extract_info_url(self, url, main=False):
+    def extract_info_url(self, url, main=False):
        data_url = urlparse(url)
        domain = data_url.netloc
        path_file = domain.replace('.', '') + os.path.split(data_url.path)[0] + '/'
@@ -49,54 +48,47 @@ def extract_info_url(self, url, main=False):
            file_name = 'index.html'
        return {"domain": domain, "path": path_file, "file_name": file_name, "scheme": scheme, "url": url_ori}

-
    def download_file(self, url):
-        black_list = ['', '/']
        info_url = self.extract_info_url(url)
        if url == self.url:
            info_url = self.extract_info_url(url, True)
-
+
        if info_url['file_name'][-4:] not in __black_list_type__:
            file_name = info_url['file_name']
-            if info_url['file_name'] in black_list:
+            black_list = ['', '/']
+            if file_name in black_list:
                file_name = 'index.html'
            path_file = info_url['path'] + file_name
-            if os.path.exists(path_file) == False:
+            if not os.path.exists(path_file):
                r = requests.get(url)
                os.makedirs(os.path.dirname(path_file), exist_ok=True)
                with open(path_file, 'wb') as f:
                    f.write(r.content)

-
    def check_invalid(self, file_name):
        regex = r"[a-z-0-9]+.html"
        matches = re.finditer(regex, file_name, re.MULTILINE)
-        for matchNum, match in enumerate(matches, start=1):
+        for match in matches:
            return match.group()
-

    def check_exists(self, url):
        info_url = self.extract_info_url(url)
        path_file = info_url['path'] + info_url['file_name']
        if info_url['domain'] == self.info_url['domain']:
-            if os.path.exists(path_file) == False:
-                return True
-            else:
-                return False
+            return os.path.exists(path_file) == False
        else:
            return False
-
-
-    def get_href_a_tag(self, pagesource):
+
+    def get_href_a_tag(self, page_source):
        result = []
-        source = BeautifulSoup(pagesource, 'html.parser')
+        source = BeautifulSoup(page_source, 'html.parser')
        try:
            data_a = source.find_all("a")
-        except:
+        except Exception:
            data_a = None
        a_tag_list = []
        for a in data_a:
-            if a.get('href') != '' and a.get('href') != '#' and str(a.get('href')) not in a_tag_list and self.check_invalid(str(a.get('href'))) != None:
+            if a.get('href') != '' and a.get('href') != '#' and str(a.get('href')) not in a_tag_list and self.check_invalid(str(a.get('href'))) is not None:
                a_tag_list.append(a.get('href'))

        for href in a_tag_list:
@@ -107,108 +99,126 @@ def get_href_a_tag(self, pagesource):
                link = self.info_url['url']
                for text in cut:
                    if text != '':
-                        link = link.replace(str(text) + '/', '')
+                        link = link.replace(f'{str(text)}/', '')
                result.append(link + href.replace('../', ''))
            elif href[:1] == '/':
                link = re.split('[\/]+', self.info_url['url'])[:2]
-                link = str(link[0]) + '//' + str(link[1])
+                link = f'{str(link[0])}//{str(link[1])}'
                result.append(link + href)
            else:
                result.append(self.info_url['url'] + href)
            if domain == self.info_url['domain']:
                result.append(href)
        return result

-
    def get_all_file_paths(self, directory):
        file_paths = []
        for root, directories, files in os.walk(directory):
            for filename in files:
                filepath = os.path.join(root, filename)
                file_paths.append(filepath)
-        return file_paths
-
+        return file_paths

    def zip(self, path_folder):
-        print('Begin zipped file ' + str(path_folder) + '.zip')
+        print(f'Compressing files... {str(path_folder)}.zip')
        directory = path_folder
        file_paths = self.get_all_file_paths(directory)
-        with ZipFile(path_folder + '.zip', 'w') as zip:
+        with ZipFile(f'{path_folder}.zip', 'w') as zip:
            for file in file_paths:
                zip.write(file)
        print('All files zipped successfully!')
-
+

class BrowserClone(File):
    driver = ''
    page_source = ''
    all_tab = []
    url_down = []
-
-
+
    def __init__(self, url):
+        super().__init__(url)
        self.url = url
        self.open_browser()
-
-
+
    def open_browser(self):
        print('============================== Begin ==============================')
        options = webdriver.ChromeOptions()
        options.add_argument("--incognito")
-        # options.add_argument("--headless")
-        options.add_argument(f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36")
+        options.add_argument("--headless")
+        # options.add_argument(f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36")
+        # Configure the driver to run in the background on Ubuntu/Debian
+        options.add_argument("--no-sandbox")
+        # Bypass the proxy for faster page loading
+        # options.add_argument('--no-proxy-server')
+        # options.add_argument("--proxy-server='direct://'")
+        # options.add_argument("--proxy-bypass-list=*")
+
+        options.add_experimental_option("useAutomationExtension", False)
+        options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        options.page_load_strategy = 'none'
+
        path_chrome_driver = __path_driver__
        self.driver = webdriver.Chrome(chrome_options=options, executable_path=path_chrome_driver)
        self.driver.get(self.url)
+        time.sleep(30)
+        for _ in range(5):
+            try:
+                self.driver.switch_to.alert.accept()
+            except Exception:
+                continue
        self.page_source = self.driver.page_source
        super().__init__(self.url)
        self.extract_file()
-        print('Get all link clone ...')
+        print('Getting all the links to crawl ...')
        url_tab_data = super().get_href_a_tag(self.page_source)
        for url_tab in url_tab_data:
            self.all_tab.append(url_tab)
            self.extract_html(url_tab)
-
-
-        # clone options
-        if __clone_all__ == True:
+
+        # clone options
+        if __clone_all__:
            data = list(set(self.all_tab))
            for url in data:
                self.driver.get(url)
                self.extract_file()

-
        print('Get all link clone done!')
        print('Save files...')
        self.extract_file(True)
        print('Save files Done!')

-        if __zip__ == True:
+        if __zip__:
            url_info = super().extract_info_url(self.url, True)
            folder = './' + url_info['domain'].replace('.', '')
            super().zip(folder)
            try:
                shutil.rmtree(folder, ignore_errors=True)
            except OSError as e:
-                print("Error: %s : %s" % (folder, e.strerror))
+                print(f"Error: {folder} : {e.strerror}")
        print('============================== End Game ==============================')

-
    def extract_html(self, url):
        super().__init__(url)
        self.driver.get(url)
+        for _ in range(5):
+            try:
+                self.driver.switch_to.alert.accept()
+            except Exception:
+                continue
        self.page_source = self.driver.page_source
        url_tab_data = super().get_href_a_tag(self.page_source)
        for url_tab in url_tab_data:
            self.all_tab.append(url_tab)
-

-    def extract_file(self, down=False):
+    def extract_file(self, down=False):
        for request in self.driver.requests:
-            if request.response:
-                if request.response.status_code in __status_code__ and request.url not in self.url_down:
-                    self.url_down.append(request.url)
-        if down == True:
+            if (
+                request.response
+                and request.response.status_code in __status_code__
+                and request.url not in self.url_down
+            ):
+                self.url_down.append(request.url)
+        if down:
            super().__init__(self.url)
            data = list(set(self.url_down))
            with tqdm(total=len(data)) as pbar:
@@ -218,4 +228,4 @@ def extract_file(self, down=False):
                    pbar.update(1)


-BrowserClone(__clone_url__)
+BrowserClone(__clone_url__)