Skip to content

Commit 80b3f67

Browse files
Hasith RathnayakeHasith Rathnayake
authored and committed
I have utilized some spidering to avoid downloading the files here.
1 parent 2751856 commit 80b3f67

File tree

2 files changed

+50
-0
lines changed

2 files changed

+50
-0
lines changed
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#read a textfile from a website
2+
import urllib.request, urllib.parse, urllib.error
3+
from urllib.request import Request, urlopen
4+
import sqlite3
5+
import ssl
6+
import re
7+
8+
# Mailbox file fetched when the user just presses Enter at the prompt.
DEFAULT_URL = 'https://www.py4e.com/code3/mbox.txt'
# SQLite file that receives the per-organization message counts.
DB_PATH = 'organizationalCount.sqlite'


def build_ssl_context():
    """Return an SSL context that ignores certificate errors for https."""
    # NOTE(review): certificate and hostname verification are switched off on
    # purpose here (as in the original script); do not reuse this context for
    # anything that handles sensitive data.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    return ctx


def fetch_lines(url, ctx):
    """Download *url* and return its decoded text as a list of lines.

    A browser-like User-Agent header is sent; per the original author's note
    the plain urllib request is blocked by the default site.
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the HTTP response once the body is read.
    with urlopen(req, context=ctx) as response:
        return response.read().decode().splitlines()


def extract_org(line):
    """Return the organization (domain) of a ``From `` line, else None.

    Only lines starting with ``'From '`` are considered. The second
    whitespace-separated field is treated as the sender address and the text
    after its first ``@`` is returned. Lines with no address field, or with an
    address that has no ``@``, yield None instead of raising IndexError.
    """
    if not line.startswith('From '):
        return None
    fields = line.split()
    if len(fields) < 2:
        return None
    address_parts = fields[1].split('@')
    if len(address_parts) < 2:
        return None
    return address_parts[1]


def count_organizations(lines, cur):
    """Tally sender organizations from *lines* into the Counts table.

    For every line that yields an organization, either insert a new row with
    a count of 1 or increment the existing row. The caller is responsible for
    committing the transaction.
    """
    for line in lines:
        org = extract_org(line)
        if org is None:
            continue
        cur.execute('SELECT count FROM Counts WHERE org=?', (org,))
        if cur.fetchone() is None:
            cur.execute('INSERT INTO Counts(org,count) VALUES(?,1)', (org,))
        else:
            cur.execute('UPDATE Counts SET count=count+1 WHERE org=?', (org,))


def main():
    """Prompt for a URL, count messages per organization, print the totals."""
    # ask for url; an empty answer selects the default mailbox file
    url = input('Enter URL for txt file: ')
    if not url:
        url = DEFAULT_URL

    connection = sqlite3.connect(DB_PATH)
    try:
        # create db: start from an empty Counts table on every run
        cur = connection.cursor()
        cur.execute('DROP TABLE IF EXISTS Counts')
        cur.execute('CREATE TABLE Counts(org TEXT,count INTEGER)')
        connection.commit()

        # Fetch the URL the user actually chose (previously the default URL
        # was hard-coded in the request and the prompt had no effect).
        lines = fetch_lines(url, build_ssl_context())
        count_organizations(lines, cur)
        # A single commit after the loop leaves the same final table contents
        # without paying for a transaction per message.
        connection.commit()

        # print out results
        for org, count in cur.execute(
                'SELECT org, count FROM Counts ORDER BY count DESC'):
            print(str(org), count)

        cur.close()
    finally:
        # Release the database file even if the download or parsing fails.
        connection.close()


if __name__ == '__main__':
    main()
8 KB
Binary file not shown.

0 commit comments

Comments
 (0)