
Commit 9329c00

Merge pull request #4 from testdrivenio/base-update
update base
2 parents 748c898 + 8324ca6

File tree

3 files changed: +55 -51 lines changed


project/scrapers/scraper.py

Lines changed: 44 additions & 43 deletions
@@ -1,82 +1,83 @@
-import requests
+import csv
+from pathlib import Path
 
+import requests
+from bs4 import BeautifulSoup
 from selenium import webdriver
 from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-from bs4 import BeautifulSoup
+from selenium.webdriver.support.ui import WebDriverWait
+
+BASE_DIR = Path(__file__).resolve(strict=True).parent.parent
 
 
 def get_driver():
-    # initialize options
     options = webdriver.ChromeOptions()
-    # pass in headless argument to options
-    options.add_argument('--headless')
+    options.add_argument("--headless")
+
     # initialize driver
     driver = webdriver.Chrome(chrome_options=options)
     return driver
 
 
-def connect_to_base(browser, page_number):
-    base_url = f'https://news.ycombinator.com/news?p={page_number}'
+def connect_to_base(browser):
+    base_url = "https://en.wikipedia.org/wiki/Special:Random"
     connection_attempts = 0
     while connection_attempts < 3:
         try:
             browser.get(base_url)
-            # wait for table element with id = 'hnmain' to load
+            # wait for table element with id = 'content' to load
             # before returning True
             WebDriverWait(browser, 5).until(
-                EC.presence_of_element_located((By.ID, 'hnmain'))
+                EC.presence_of_element_located((By.ID, "content"))
             )
             return True
-        except Exception as ex:
+        except Exception as e:
+            print(e)
             connection_attempts += 1
-            print(f'Error connecting to {base_url}.')
-            print(f'Attempt #{connection_attempts}.')
+            print(f"Error connecting to {base_url}.")
+            print(f"Attempt #{connection_attempts}.")
     return False
 
 
 def parse_html(html):
     # create soup object
-    soup = BeautifulSoup(html, 'html.parser')
+    soup = BeautifulSoup(html, "html.parser")
     output_list = []
-    # parse soup object to get article id, rank, score, and title
-    tr_blocks = soup.find_all('tr', class_='athing')
-    article = 0
-    for tr in tr_blocks:
-        article_id = tr.get('id')
-        article_url = tr.find_all('a')[1]['href']
-        # check if article is a hacker news article
-        if 'item?id=' in article_url:
-            article_url = f'https://news.ycombinator.com/{article_url}'
-        load_time = get_load_time(article_url)
-        try:
-            score = soup.find(id=f'score_{article_id}').string
-        except Exception as ex:
-            score = '0 points'
-        article_info = {
-            'id': article_id,
-            'load_time': load_time,
-            'rank': tr.span.string,
-            'score': score,
-            'title': tr.find(class_='storylink').string,
-            'url': article_url
-        }
-        # appends article_info to output_list
-        output_list.append(article_info)
-        article += 1
+    # parse soup object to get wikipedia article url, title, and last modified date
+    article_url = soup.find("link", {"rel": "canonical"})["href"]
+    article_title = soup.find("h1", {"id": "firstHeading"}).text
+    article_last_modified = soup.find("li", {"id": "footer-info-lastmod"}).text
+    article_info = {
+        "url": article_url,
+        "title": article_title,
+        "last_modified": article_last_modified,
+    }
+    output_list.append(article_info)
     return output_list
 
 
 def get_load_time(article_url):
     try:
         # set headers
-        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
+        }
         # make get request to article_url
         response = requests.get(
-            article_url, headers=headers, stream=True, timeout=3.000)
+            article_url, headers=headers, stream=True, timeout=3.000
+        )
         # get page load time
         load_time = response.elapsed.total_seconds()
-    except Exception as ex:
-        load_time = 'Loading Error'
+    except Exception as e:
+        print(e)
+        load_time = "Loading Error"
     return load_time
+
+
+def write_to_file(output_list, filename):
+    for row in output_list:
+        with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
+            fieldnames = ["url", "title", "last_modified"]
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            writer.writerow(row)
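
One detail worth noting about the new write_to_file helper: it opens the CSV in append mode once per row and only ever calls writerow(), so the output file never gets a header row. A minimal sketch of reading the results back therefore has to supply the fieldnames explicitly. This snippet is not part of the commit, and "output.csv" is just a hypothetical filename:

import csv

# read back rows appended by write_to_file(); fieldnames must be given
# explicitly because the scraper never calls writeheader()
with open("output.csv", newline="") as csvfile:  # hypothetical filename
    reader = csv.DictReader(csvfile, fieldnames=["url", "title", "last_modified"])
    for row in reader:
        print(row["title"], row["url"])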

project/script.py

Lines changed: 8 additions & 5 deletions
@@ -4,18 +4,21 @@
 from scrapers.scraper import get_driver, connect_to_base, parse_html
 
 
-def run_process(browser, page_number=1):
-    if connect_to_base(browser, page_number):
-        print(f'Scraping page {page_number}...')
+
+def run_process(browser):
+    if connect_to_base(browser):
+        print(f'Scraping random Wikipedia page...')
         sleep(2)
         html = browser.page_source
         return parse_html(html)
     else:
+        print("Error connecting to Wikipedia")
         return False
 
 
 if __name__ == '__main__':
     browser = get_driver()
-    data = run_process(browser, sys.argv[1])
+    data = run_process(browser)
+    print(data)
     browser.quit()
-    print(f'Finished page {sys.argv[1]}')
+    print(f'Finished!')
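
As committed, the script only prints the parsed result; nothing in this diff calls the new write_to_file helper. If you wanted the run to persist the scraped row as well, a minimal sketch (not part of the commit, with "output.csv" as a hypothetical filename) could extend the __main__ block like this:

from scrapers.scraper import get_driver, connect_to_base, parse_html, write_to_file

if __name__ == '__main__':
    browser = get_driver()
    data = run_process(browser)
    print(data)
    if data:
        # append the scraped row to a CSV in the project directory (hypothetical filename)
        write_to_file(data, "output.csv")
    browser.quit()
    print('Finished!')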

requirements.txt

Lines changed: 3 additions & 3 deletions
@@ -1,3 +1,3 @@
-beautifulsoup4==4.9.3
-requests==2.25.1
-selenium==3.141.0
+beautifulsoup4==4.10.0
+requests==2.27.1
+selenium==4.1.3
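
One interaction worth flagging with the jump to Selenium 4: get_driver() in scraper.py still passes chrome_options= to webdriver.Chrome, a keyword Selenium 4 deprecates in favor of options= (it still works under 4.1.x with a deprecation warning but was dropped in later 4.x releases). A hedged sketch of the Selenium 4 style, assuming the same headless setup as above:

from selenium import webdriver

def get_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    # Selenium 4 prefers options= over the deprecated chrome_options= keyword
    return webdriver.Chrome(options=options)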
