1 | | -import requests |
| 1 | +import csv |
| 2 | +from pathlib import Path |
2 | 3 |
| 4 | +import requests |
| 5 | +from bs4 import BeautifulSoup |
3 | 6 | from selenium import webdriver |
4 | 7 | from selenium.webdriver.common.by import By |
5 | | -from selenium.webdriver.support.ui import WebDriverWait |
6 | 8 | from selenium.webdriver.support import expected_conditions as EC |
7 | | -from bs4 import BeautifulSoup |
| 9 | +from selenium.webdriver.support.ui import WebDriverWait |
| 10 | + |
| 11 | +BASE_DIR = Path(__file__).resolve(strict=True).parent.parent |
8 | 12 |
9 | 13 |
10 | 14 | def get_driver(): |
11 | | - # initialize options |
12 | 15 | options = webdriver.ChromeOptions() |
13 | | - # pass in headless argument to options |
14 | | - options.add_argument('--headless') |
| 16 | + options.add_argument("--headless") |
| 17 | + |
15 | 18 | # initialize driver |
16 | | -    driver = webdriver.Chrome(chrome_options=options)
| 19 | +    driver = webdriver.Chrome(options=options)
17 | 20 | return driver |
18 | 21 |
19 | 22 |
20 | | -def connect_to_base(browser, page_number): |
21 | | - base_url = f'https://news.ycombinator.com/news?p={page_number}' |
| 23 | +def connect_to_base(browser): |
| 24 | + base_url = "https://en.wikipedia.org/wiki/Special:Random" |
22 | 25 | connection_attempts = 0 |
23 | 26 | while connection_attempts < 3: |
24 | 27 | try: |
25 | 28 | browser.get(base_url) |
26 | | - # wait for table element with id = 'hnmain' to load |
| 29 | +            # wait for the element with id = 'content' to load
27 | 30 | # before returning True |
28 | 31 | WebDriverWait(browser, 5).until( |
29 | | - EC.presence_of_element_located((By.ID, 'hnmain')) |
| 32 | + EC.presence_of_element_located((By.ID, "content")) |
30 | 33 | ) |
31 | 34 | return True |
32 | | - except Exception as ex: |
| 35 | + except Exception as e: |
| 36 | + print(e) |
33 | 37 | connection_attempts += 1 |
34 | | - print(f'Error connecting to {base_url}.') |
35 | | - print(f'Attempt #{connection_attempts}.') |
| 38 | + print(f"Error connecting to {base_url}.") |
| 39 | + print(f"Attempt #{connection_attempts}.") |
36 | 40 | return False |
37 | 41 |
38 | 42 |
39 | 43 | def parse_html(html): |
40 | 44 | # create soup object |
41 | | - soup = BeautifulSoup(html, 'html.parser') |
| 45 | + soup = BeautifulSoup(html, "html.parser") |
42 | 46 | output_list = [] |
43 | | - # parse soup object to get article id, rank, score, and title |
44 | | - tr_blocks = soup.find_all('tr', class_='athing') |
45 | | - article = 0 |
46 | | - for tr in tr_blocks: |
47 | | - article_id = tr.get('id') |
48 | | - article_url = tr.find_all('a')[1]['href'] |
49 | | - # check if article is a hacker news article |
50 | | - if 'item?id=' in article_url: |
51 | | - article_url = f'https://news.ycombinator.com/{article_url}' |
52 | | - load_time = get_load_time(article_url) |
53 | | - try: |
54 | | - score = soup.find(id=f'score_{article_id}').string |
55 | | - except Exception as ex: |
56 | | - score = '0 points' |
57 | | - article_info = { |
58 | | - 'id': article_id, |
59 | | - 'load_time': load_time, |
60 | | - 'rank': tr.span.string, |
61 | | - 'score': score, |
62 | | - 'title': tr.find(class_='storylink').string, |
63 | | - 'url': article_url |
64 | | - } |
65 | | - # appends article_info to output_list |
66 | | - output_list.append(article_info) |
67 | | - article += 1 |
| 47 | +    # parse soup object to get the Wikipedia article url, title, and last-modified date
| 48 | + article_url = soup.find("link", {"rel": "canonical"})["href"] |
| 49 | + article_title = soup.find("h1", {"id": "firstHeading"}).text |
| 50 | + article_last_modified = soup.find("li", {"id": "footer-info-lastmod"}).text |
| 51 | + article_info = { |
| 52 | + "url": article_url, |
| 53 | + "title": article_title, |
| 54 | + "last_modified": article_last_modified, |
| 55 | + } |
| 56 | + output_list.append(article_info) |
68 | 57 | return output_list |
69 | 58 |
70 | 59 |
71 | 60 | def get_load_time(article_url): |
72 | 61 | try: |
73 | 62 | # set headers |
74 | | - headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} |
| 63 | + headers = { |
| 64 | + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36" |
| 65 | + } |
75 | 66 | # make get request to article_url |
76 | 67 | response = requests.get( |
77 | | - article_url, headers=headers, stream=True, timeout=3.000) |
| 68 | + article_url, headers=headers, stream=True, timeout=3.000 |
| 69 | + ) |
78 | 70 | # get page load time |
79 | 71 | load_time = response.elapsed.total_seconds() |
80 | | - except Exception as ex: |
81 | | - load_time = 'Loading Error' |
| 72 | + except Exception as e: |
| 73 | + print(e) |
| 74 | + load_time = "Loading Error" |
82 | 75 | return load_time |
| 76 | + |
| 77 | + |
| 78 | +def write_to_file(output_list, filename):
| 79 | +    fieldnames = ["url", "title", "last_modified"]
| 80 | +    # open the output CSV once in append mode; newline="" prevents blank rows on Windows
| 81 | +    with open(BASE_DIR.joinpath(filename), "a", newline="") as csvfile:
| 82 | +        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
| 83 | +        writer.writerows(output_list)
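
For context, a minimal sketch of how these functions might be wired together. The runner script is not part of this diff, so the module name (`script.py`) and the output filename below are assumptions:

```python
# run.py -- hypothetical runner, not part of this diff
# assumes the functions above live in script.py and that "output.csv" is the target file
from script import connect_to_base, get_driver, parse_html, write_to_file

if __name__ == "__main__":
    browser = get_driver()
    try:
        if connect_to_base(browser):
            # parse the rendered page and append the article info to the CSV
            output_list = parse_html(browser.page_source)
            write_to_file(output_list, "output.csv")
        else:
            print("Error connecting to Wikipedia")
    finally:
        browser.quit()
```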