
Commit 1bf0877

REFACTOR: block alert notifications
1 parent 7eddc07 commit 1bf0877

File tree

1 file changed (+77, -67 lines)


main.py

Lines changed: 77 additions & 67 deletions
@@ -1,43 +1,42 @@
 # -*- coding: utf-8 -*-
-# __author__ = "Hong Nguyen Nam"
-# __copyright__ = "Copyright 2021, The Browser Clone"
+# __author__ = "Hong Nguyen Nam (Jeremy Nguyen)"
+# __copyright__ = "Copyright 2022, The Browser Clone"
 # __license__ = "GPL"
-# __version__ = "1.0.0"
-# __maintainer__ = "Hong Nguyen Nam"
+# __version__ = "2.1.0"
+# __maintainer__ = "Hong Nguyen Nam (Jeremy Nguyen)"
 # __email__ = "a2FpdG9raWQxNDEyLmNvbmFuQGdtYWlsLmNvbQ=="
-__path_driver__ = 'chromedriver'
+__path_driver__ = '/Users/Hacker1945/Desktop/clone-web/chromedriver'
 __black_list_type__ = ['.php']
 __status_code__ = [200, 404]
 __clone_all__ = False
-__zip__ = False
-__clone_url__ = 'https://pixinvent.com/demo/vuexy-html-bootstrap-admin-template/html/ltr/vertical-menu-template/index.html'
+__zip__ = True
+__clone_url__ = 'https://themesbrand.com/velzon/html/default/index.html'
 
-
-from seleniumwire import webdriver
-from selenium.webdriver.common.keys import Keys
-import time
-from urllib.parse import urlparse
-import urllib.request
-import urllib.parse
+# https://unistudio.co/html/renox/main/index.html
+# https://themesbrand.com/velzon/html/default/index.html
 import os
 import os.path
-import requests
 import re
+import shutil
+import time
+from urllib.parse import urlparse
+
+import requests
 from bs4 import BeautifulSoup
+from seleniumwire import webdriver
 from tqdm import tqdm
 from zipfile36 import ZipFile
-import shutil
 
 
-class File():
+class File:
     info_url = ''
+
     def __init__(self, url):
         self.url = url
         self.info_url = self.extract_info_url(url, True)
         self.check_exists(url)
 
-
-    def extract_info_url(self, url, main = False):
+    def extract_info_url(self, url, main=False):
         data_url = urlparse(url)
         domain = data_url.netloc
         path_file = domain.replace('.', '') + os.path.split(data_url.path)[0] + '/'
@@ -49,54 +48,47 @@ def extract_info_url(self, url, main = False):
             file_name = 'index.html'
         return {"domain": domain, "path": path_file, "file_name": file_name, "scheme": scheme, "url": url_ori}
 
-
     def download_file(self, url):
-        black_list = ['', '/']
         info_url = self.extract_info_url(url)
         if url == self.url:
             info_url = self.extract_info_url(url, True)
-
+
         if info_url['file_name'][-4:] not in __black_list_type__:
             file_name = info_url['file_name']
-            if info_url['file_name'] in black_list:
+            black_list = ['', '/']
+            if file_name in black_list:
                 file_name = 'index.html'
             path_file = info_url['path'] + file_name
-            if os.path.exists(path_file) == False:
+            if not os.path.exists(path_file):
                 r = requests.get(url)
                 os.makedirs(os.path.dirname(path_file), exist_ok=True)
                 with open(path_file, 'wb') as f:
                     f.write(r.content)
 
-
     def check_invalid(self, file_name):
         regex = r"[a-z-0-9]+.html"
         matches = re.finditer(regex, file_name, re.MULTILINE)
-        for matchNum, match in enumerate(matches, start=1):
+        for match in matches:
             return match.group()
-
 
     def check_exists(self, url):
         info_url = self.extract_info_url(url)
         path_file = info_url['path'] + info_url['file_name']
         if info_url['domain'] == self.info_url['domain']:
-            if os.path.exists(path_file) == False:
-                return True
-            else:
-                return False
+            return os.path.exists(path_file) == False
         else:
             return False
-
-
-    def get_href_a_tag(self, pagesource):
+
+    def get_href_a_tag(self, page_source):
         result = []
-        source = BeautifulSoup(pagesource,'html.parser')
+        source = BeautifulSoup(page_source, 'html.parser')
         try:
             data_a = source.find_all("a")
-        except:
+        except Exception:
             data_a = None
         a_tag_list = []
         for a in data_a:
-            if a.get('href') != '' and a.get('href') != '#' and str(a.get('href')) not in a_tag_list and self.check_invalid(str(a.get('href'))) != None:
+            if a.get('href') != '' and a.get('href') != '#' and str(a.get('href')) not in a_tag_list and self.check_invalid(str(a.get('href'))) is not None:
                 a_tag_list.append(a.get('href'))
 
         for href in a_tag_list:
@@ -107,108 +99,126 @@ def get_href_a_tag(self, pagesource):
                 link = self.info_url['url']
                 for text in cut:
                     if text != '':
-                        link = link.replace(str(text)+'/', '')
+                        link = link.replace(f'{str(text)}/', '')
                 result.append(link + href.replace('../', ''))
             elif href[:1] == '/':
                 link = re.split('[\/]+', self.info_url['url'])[:2]
-                link = str(link[0]) + '//' + str(link[1])
+                link = f'{str(link[0])}//{str(link[1])}'
                 result.append(link + href)
             else:
                 result.append(self.info_url['url'] + href)
             if domain == self.info_url['domain']:
                 result.append(href)
         return result
 
-
     def get_all_file_paths(self, directory):
         file_paths = []
         for root, directories, files in os.walk(directory):
             for filename in files:
                 filepath = os.path.join(root, filename)
                 file_paths.append(filepath)
-        return file_paths
-
+        return file_paths
 
     def zip(self, path_folder):
-        print('Begin zipped file '+ str(path_folder) + '.zip')
+        print(f'Compression files... {str(path_folder)}.zip')
         directory = path_folder
         file_paths = self.get_all_file_paths(directory)
-        with ZipFile(path_folder + '.zip','w') as zip:
+        with ZipFile(f'{path_folder}.zip', 'w') as zip:
             for file in file_paths:
                 zip.write(file)
         print('All files zipped successfully!')
-
+
 
 class BrowserClone(File):
     driver = ''
     page_source = ''
     all_tab = []
     url_down = []
-
-
+
     def __init__(self, url):
+        super().__init__(url)
         self.url = url
         self.open_browser()
-
-
+
     def open_browser(self):
         print('============================== Begin ==============================')
         options = webdriver.ChromeOptions()
         options.add_argument("--incognito")
-        # options.add_argument("--headless")
-        options.add_argument(f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36")
+        options.add_argument("--headless")
+        # options.add_argument(f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36")
+        # Configure to run in the background on ubuntu/debian
+        options.add_argument("--no-sandbox")
+        # Bypass network to fast loading
+        # options.add_argument('--no-proxy-server')
+        # options.add_argument("--proxy-server='direct://'")
+        # options.add_argument("--proxy-bypass-list=*")
+
+        options.add_experimental_option("useAutomationExtension", False)
+        options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        options.page_load_strategy = 'none'
+
         path_chrome_driver = __path_driver__
         self.driver = webdriver.Chrome(chrome_options=options, executable_path=path_chrome_driver)
         self.driver.get(self.url)
+        time.sleep(30)
+        for _ in range(5):
+            try:
+                self.driver.switch_to.alert.accept()
+            except Exception:
+                continue
         self.page_source = self.driver.page_source
         super().__init__(self.url)
         self.extract_file()
-        print('Get all link clone...')
+        print('Getting all the links to crawl...')
         url_tab_data = super().get_href_a_tag(self.page_source)
         for url_tab in url_tab_data:
             self.all_tab.append(url_tab)
             self.extract_html(url_tab)
-
-
-        # clone options
-        if __clone_all__ == True:
+
+        # clone options
+        if __clone_all__:
             data = list(set(self.all_tab))
             for url in data:
                 self.driver.get(url)
                 self.extract_file()
 
-
         print('Get all link clone done!')
         print('Save files...')
         self.extract_file(True)
         print('Save files Done!')
 
-        if __zip__ == True:
+        if __zip__:
             url_info = super().extract_info_url(self.url, True)
             folder = './' + url_info['domain'].replace('.', '')
             super().zip(folder)
             try:
                 shutil.rmtree(folder, ignore_errors=True)
             except OSError as e:
-                print("Error: %s : %s" % (folder, e.strerror))
+                print(f"Error: {folder} : {e.strerror}")
         print('============================== End Game ==============================')
 
-
     def extract_html(self, url):
         super().__init__(url)
         self.driver.get(url)
+        for _ in range(5):
+            try:
+                self.driver.switch_to.alert.accept()
+            except Exception:
+                continue
         self.page_source = self.driver.page_source
         url_tab_data = super().get_href_a_tag(self.page_source)
         for url_tab in url_tab_data:
             self.all_tab.append(url_tab)
-
 
-    def extract_file(self, down = False):
+    def extract_file(self, down=False):
         for request in self.driver.requests:
-            if request.response:
-                if request.response.status_code in __status_code__ and request.url not in self.url_down:
-                    self.url_down.append(request.url)
-                    if down == True:
+            if (
+                request.response
+                and request.response.status_code in __status_code__
+                and request.url not in self.url_down
+            ):
+                self.url_down.append(request.url)
+                if down:
                     super().__init__(self.url)
                     data = list(set(self.url_down))
                     with tqdm(total=len(data)) as pbar:
@@ -218,4 +228,4 @@ def extract_file(self, down = False):
                             pbar.update(1)
 
 
-BrowserClone(__clone_url__)
+BrowserClone(__clone_url__)
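Note: the substance of this commit is the retry loop added after each driver.get() call in open_browser and extract_html, which accepts any JavaScript alert/confirm dialog that would otherwise block reading page_source. Below is a minimal standalone sketch of that pattern. It uses plain Selenium rather than selenium-wire, catches the narrower NoAlertPresentException instead of a bare Exception, and the URL and driver setup are placeholders, not code from this repository.

from selenium import webdriver
from selenium.common.exceptions import NoAlertPresentException


def dismiss_alerts(driver, attempts=5):
    # Accept any alert/confirm dialog currently blocking the page.
    # Mirrors the commit's `for _ in range(5)` retry loop.
    for _ in range(attempts):
        try:
            driver.switch_to.alert.accept()
        except NoAlertPresentException:
            # No alert is open; keep polling up to `attempts` times.
            continue


if __name__ == '__main__':
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)  # assumes a local Chrome; Selenium 4+ resolves chromedriver automatically
    driver.get('https://example.com')           # placeholder URL
    dismiss_alerts(driver)
    page_source = driver.page_source
    driver.quit()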
