|
"""Count sender organizations in a remote mbox-style text file.

The script downloads a text file, looks at every line that starts with
``'From '``, takes the domain part of the address on that line, and stores
how many times each domain occurs in the ``Counts`` table of a local SQLite
database.  The table is then printed, highest count first.
"""
import urllib.request
import urllib.parse
import urllib.error
from urllib.request import Request, urlopen
from collections import Counter
from contextlib import closing
import sqlite3
import ssl
import re

# Used when the user just presses Enter at the prompt.
DEFAULT_URL = 'https://www.py4e.com/code3/mbox.txt'
DB_PATH = 'organizationalCount.sqlite'


def insecure_ssl_context():
    """Return an SSL context that ignores certificate errors for https.

    NOTE(review): hostname checking and certificate verification are both
    switched off, exactly as the original script did.  That is acceptable
    for a throwaway exercise but leaves the download open to tampering;
    drop this context if the target site has a valid certificate.
    """
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    return ctx


def fetch_lines(url, context=None):
    """Download *url* and return its decoded content as a list of lines.

    A browser-like ``User-Agent`` header is sent because the default one
    is rejected by the site this script was written for.
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the HTTP response even if read() fails.
    with urlopen(req, context=context) as response:
        return response.read().decode().splitlines()


def extract_org(line):
    """Return the organization (address domain) of a ``'From '`` line.

    Returns ``None`` for lines that do not start with ``'From '`` and for
    malformed lines (no address, no ``@``, or an empty domain) instead of
    raising ``IndexError``.
    """
    if not line.startswith('From '):
        return None
    fields = line.split()
    if len(fields) < 2:
        return None
    address_parts = fields[1].split('@')
    if len(address_parts) < 2:
        return None
    return address_parts[1] or None


def count_orgs(lines):
    """Return a ``Counter`` mapping organization -> number of 'From ' lines.

    Keys appear in first-seen order, so rows are later inserted in the same
    order the line-by-line SQL version would have created them.
    """
    orgs = (extract_org(line) for line in lines)
    return Counter(org for org in orgs if org is not None)


def store_counts(connection, counts):
    """Recreate the ``Counts`` table on *connection* and fill it from *counts*.

    Any existing ``Counts`` table is dropped first, so each run starts from
    an empty table.
    """
    cur = connection.cursor()
    try:
        cur.execute('DROP TABLE IF EXISTS Counts')
        cur.execute('CREATE TABLE Counts(org TEXT,count INTEGER)')
        cur.executemany('INSERT INTO Counts(org,count) VALUES(?,?)',
                        list(counts.items()))
        connection.commit()
    finally:
        cur.close()


def main():
    """Prompt for a URL, rebuild the counts database and print the result."""
    # Fall back to the sample file when nothing is entered.
    url = input('Enter URL for txt file: ') or DEFAULT_URL

    # Fetch and parse before touching the database so that a failed
    # download does not wipe the results of a previous run.
    lines = fetch_lines(url, context=insecure_ssl_context())
    counts = count_orgs(lines)

    with closing(sqlite3.connect(DB_PATH)) as connection:
        store_counts(connection, counts)
        query = 'SELECT org, count FROM Counts ORDER BY count DESC'
        for org, count in connection.execute(query):
            print(org, count)


if __name__ == '__main__':
    main()
0 commit comments