Skip to content

Commit ee57ea8

Browse files
feat: add web scraping project using Selenium
1 parent 4e1da44 commit ee57ea8

File tree

11 files changed

+1345
-1
lines changed

11 files changed

+1345
-1
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
.DS_Store
1+
**/.DS_Store
22
.env
33
__pycache__

Reminder/.DS_Store

-6 KB
Binary file not shown.

Reminder/2_functions/.DS_Store

-6 KB
Binary file not shown.

Reminder/9_importing/.DS_Store

-6 KB
Binary file not shown.

Reminder/9_importing/Web Scraping by Selenium/Usaha_ Taman Langit Pangalengan 360 SNI CHSE 9042 Kementerian Pariwisata dan Ekonomi Kreatif.html

Lines changed: 756 additions & 0 deletions
Large diffs are not rendered by default.
17 MB
Binary file not shown.
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
from selenium import webdriver
2+
from selenium.webdriver.chrome.service import Service
3+
from selenium.webdriver.common.by import By # the way we want to find the element inside the web page
4+
from selenium.webdriver.common.keys import Keys # keys mean keys we want to press in our keyboards
5+
from selenium.webdriver.support.ui import WebDriverWait
6+
from selenium.webdriver.support.ui import Select
7+
from selenium.webdriver.support import expected_conditions as EC
8+
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException
9+
from miscellaneous import get_data_adm
10+
import time
11+
12+
# Web-driver setup: Selenium drives a real Chrome instance through the
# chromedriver binary (download: https://sites.google.com/chromium.org/driver/).
# The same pattern works with any browser given its matching driver.
service = Service(executable_path="Reminder/9_importing/Web Scraping by Selenium/chromedriver")
driver = webdriver.Chrome(service=service)
driver.set_page_load_timeout(10)

driver.get("https://google.com")

# Wait up to 5 s for the Google search box; let the wait raise if it never appears.
WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.CLASS_NAME, "gLFyf"))
)
input_element = driver.find_element(By.CLASS_NAME, "gLFyf")
input_element.clear()  # clear any pre-filled text in the search box
input_element.send_keys("kemenparekraf chse" + Keys.ENTER)

# Follow the first result whose link text contains "CHSE Kemenparekraf".
# (Use .find_elements instead to collect every matching link.)
link = driver.find_element(By.PARTIAL_LINK_TEXT, "CHSE Kemenparekraf")
link.click()

# Open the "Daya Tarik Wisata" (tourist attractions) category.
WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.LINK_TEXT, "Daya Tarik Wisata"))
)
driver.find_element(By.LINK_TEXT, "Daya Tarik Wisata").click()

# Open the filter dialog.
WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.LINK_TEXT, "Filter"))
)
driver.find_element(By.LINK_TEXT, "Filter").click()
time.sleep(1)

# Click the province dropdown twice (toggle open/closed) as in the original
# manual interaction, then select 'JAWA BARAT' by its visible text.
WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.ID, "province_id"))
)
driver.find_element(By.ID, "province_id").click()
driver.find_element(By.ID, "province_id").click()

province_dropdown = driver.find_element(By.ID, "province_id")
select = Select(province_dropdown)
select.select_by_visible_text("JAWA BARAT")

# Apply the filter after choosing the province.
WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.XPATH, "//*[@id='filtersModal']/div/div/div[3]/div/a[2]"))
)
driver.find_element(By.XPATH, "//*[@id='filtersModal']/div/div/div[3]/div/a[2]").click()

# Let the page refresh with the filtered results.
time.sleep(2)

# Accumulator for every scraped record. latitude/longitude are reserved for a
# later enrichment step and remain empty in this script.
data = {
    'objek_wisata': [],
    'alamat': [],
    'provinsi': [],
    'kab_kota': [],
    'kecamatan': [],
    'desa': [],
    'latitude': [],
    'longitude': []
}

# Fields copied from each detail page into the accumulator.
ADM_FIELDS = ('objek_wisata', 'alamat', 'provinsi', 'kab_kota', 'kecamatan', 'desa')

# Walk through up to 24 result pages.
for i in range(24):
    try:
        print(f'\n++++++++ Accessing Page {i+1} ++++++++')
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "card-title"))
        )

        tourism_places = [element.text for element in driver.find_elements(By.CLASS_NAME, "card-title")]
        print(f"Destinasi Wisata: {', '.join(tourism_places)}")

        for tourism_place in tourism_places:
            try:
                WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, tourism_place))
                )

                current_url = driver.current_url

                # Visit the destination's detail page.
                print(f"Attempt to visit: {tourism_place}")
                link = driver.find_element(By.PARTIAL_LINK_TEXT, tourism_place)

                # Scroll the link into view when it is off-screen.
                if not link.is_displayed():
                    driver.execute_script("arguments[0].scrollIntoView(true);", link)

                # Fall back to a JavaScript click when a native click is blocked.
                try:
                    link.click()
                except ElementClickInterceptedException:
                    print(f"Element click intercepted for {tourism_place}. Trying JavaScript click.")
                    driver.execute_script("arguments[0].click();", link)

                # Confirm navigation actually happened before scraping.
                WebDriverWait(driver, 10).until(lambda d: d.current_url != current_url)
                print(f"Attempt successful WITHOUT timeout: {driver.current_url}")

                # Scrape the administrative data from the new page.
                data_temp = get_data_adm(driver.current_url)
                for field in ADM_FIELDS:
                    data[field].append(data_temp[field])
                time.sleep(2)

                # Return to the listing page.
                driver.back()
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "card-title"))
                )

            except TimeoutException:
                # Some detail pages exceed the 10 s page-load budget: stop the
                # load and scrape from whatever URL we did reach.
                print(f"Attempt successful WITH timeout: {driver.current_url}")
                print("Page load timeout. Stopping...")
                driver.execute_script("window.stop();")

                data_temp = get_data_adm(driver.current_url)
                for field in ADM_FIELDS:
                    data[field].append(data_temp[field])

                # Return to the listing page.
                driver.back()
                time.sleep(1)

        # Advance to the next result page.
        try:
            print(data)
            print(f"Total current objek wisata: {len(data['objek_wisata'])}")

            WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "bi-caret-right-fill"))
            )
            button_next = driver.find_element(By.CLASS_NAME, "bi-caret-right-fill")
            button_next.click()
            time.sleep(2)

        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; narrowed so Ctrl-C still works.
            print("Next button not found. Possibly last page.")
            break

    except TimeoutException:
        print("Timeout occurred while processing the page.")
        driver.save_screenshot("timeout_error.png")
        break

driver.quit()

print(data)

# BUG FIX: file.write() requires a string; the original passed the dict
# directly, which raised TypeError at the very end and lost the scraped data.
with open('/Users/diardanoraihan/Work/GITHUB/Python_Projects/Reminder/9_importing/Web Scraping by Selenium/scrapped_data.txt', 'w') as file:
    file.write(str(data))
199+
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
4+
# URL target
5+
# url = "https://chse.kemenparekraf.go.id/id/detail-tersertifikasi/11863-taman-langit-pangalengan-360"
6+
7+
8+
9+
10+
def get_data_adm(url):
    """
    Scrape a CHSE Kemenparekraf detail page for a destination's name,
    address, and administrative location.

    Parameters:
    - url: str, URL of a certified-destination detail page, e.g.
      https://chse.kemenparekraf.go.id/id/detail-tersertifikasi/...

    Return:
    - dict with keys 'objek_wisata', 'alamat', 'provinsi', 'kab_kota',
      'kecamatan', 'desa'

    Raises:
    - RuntimeError when the page cannot be fetched or parsed.
    """
    # Fetch the page content.
    response = requests.get(url)
    if response.status_code != 200:
        # BUG FIX: was exit(), which killed the whole process from inside a
        # library helper; raise instead so the caller decides what to do.
        raise RuntimeError(f"Failed to retrieve page. Status code: {response.status_code}")

    # Parse the HTML with BeautifulSoup.
    soup = BeautifulSoup(response.text, "html.parser")

    try:
        # Destination name.
        objek_wisata = soup.find("h1", class_="h2").text.strip()

        # Street address.
        alamat = soup.find("span", class_="d-block").text.strip()

        # Administrative-area table: province, regency/city, district, village.
        data_table = soup.find_all("dd", class_="col-6")
        provinsi = data_table[1].string
        kab_kota = data_table[2].string
        kecamatan = data_table[3].string
        desa = data_table[4].string

    except Exception as e:
        # BUG FIX: the original printed the error and fell through, then hit a
        # NameError on the unbound locals below, hiding the real cause.
        raise RuntimeError(f"An error occurred: {e}") from e

    return {
        'objek_wisata': objek_wisata,
        'alamat': alamat,
        'provinsi': provinsi,
        'kab_kota': kab_kota,
        'kecamatan': kecamatan,
        'desa': desa
    }
62+
63+
64+
def dms_to_decimal(coord):
    """
    Convert a DMS (degree, minute, second) coordinate to decimal degrees.

    Parameters:
    - coord: str, coordinate such as 7°13'52.9"S or 107°31'33.2"E

    Return:
    - float, decimal degrees; negative for South (S) and West (W)
    """
    # Split off the hemisphere letter (S, N, W, E) at the end.
    value = coord[:-1]
    direction = coord[-1]

    # Replace the DMS punctuation with spaces and split into the three parts.
    # (The original stripped the same symbols twice — once before split and
    # again per part — which was redundant.)
    d, m, s = (float(part) for part in
               value.replace("°", " ").replace("'", " ").replace('"', " ").split())

    decimal = d + (m / 60) + (s / 3600)

    # South and West hemispheres are negative.
    if direction in ['S', 'W']:
        decimal = -decimal

    return decimal
89+
90+
def get_data_coord(url):
    """
    Scrape a page for a DMS coordinate pair held in a div.place-name element,
    e.g. <div class="place-name">7°13'52.9"S 107°31'33.2"E</div>.

    Parameters:
    - url: str, URL of the page containing the coordinate

    Return:
    - dict with 'latitude' and 'longitude' as DMS strings

    Raises:
    - RuntimeError when the page cannot be fetched.
    """
    # Fetch the page content.
    response = requests.get(url)
    if response.status_code != 200:
        # BUG FIX: was exit(), which killed the whole process from inside a
        # library helper; raise instead so the caller decides what to do.
        raise RuntimeError(f"Failed to retrieve page. Status code: {response.status_code}")

    # Parse the HTML with BeautifulSoup.
    soup = BeautifulSoup(response.text, "html.parser")

    coord = soup.find('div', class_='place-name').text

    # Split once instead of twice; latitude comes first, then longitude.
    parts = coord.split()
    lat, long = parts[0], parts[1]

    return {
        'latitude': lat,
        'longitude': long
    }
115+
116+
# BUG FIX: the original literal dms_to_decimal("7°13'52.9"S") left the inner
# double quote unescaped — a SyntaxError that prevented the module from
# importing at all. Escape the quote so the demo call parses.
dms_to_decimal("7°13'52.9\"S")
15.1 KB
Binary file not shown.

Reminder/9_importing/Web Scraping by Selenium/preprocess.ipynb

Lines changed: 272 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)