pyaf
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md‎
Lines changed: 5 additions & 0 deletions b/‎README.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎get_the_mails.py‎
Lines changed: 38 additions & 0 deletions b/‎get_the_mails.py‎
Lines changed: 38 additions & 0 deletions
@@ -87,3 +87,4 @@ ENV/
 
 # Rope project settings
 .ropeproject
+old/
@@ -1,2 +1,7 @@
 # BHU_mail
+
 A python script to get the emails of all the professors of BHU.
+
+Just run `get_the_mails.py`: it crawls the contact section of the BHU website, downloads all the docs containing the details of every department, then uses a regex to extract the emails and writes them to `results.txt`.
+
+Enjoy!
@@ -0,0 +1,38 @@
+import re
+import urllib2
+import urlparse
+
+import httplib2
+from bs4 import BeautifulSoup, SoupStrainer
+
+http = httplib2.Http()
+website = 'http://www.bhu.ac.in/telephone/'
+doc_links = []
+emails = []
+
+status, response = http.request(website)
+
+for link in BeautifulSoup(response,"lxml", parse_only=SoupStrainer('a')):
+ if link.has_attr('href'):
+ if 'doc' in link['href']:
+ doc_links.append(website+link['href'])
+
+print "Got doc links.."
+print doc_links
+
+for i in doc_links:
+ try:
+ doc = urllib2.urlopen(i)
+ doc_data = doc.read()
+ match = match = re.findall(r'[\w\.-]+@[\w\.-]+', doc_data)
+ emails.extend(match)
+ print "done", i
+ except Exception as e:
+ print e
+
+print 'Yo, got all emails'
+
+print 'writing data in results.txt'
+with open('results.txt', "w") as f:
+ for email in emails:
+ f.write(email)
+ print(email)
+ f.write("\n")
Original file line number	Diff line number	Diff line change
`@@ -87,3 +87,4 @@ ENV/`
`87`	`87`
`88`	`88`	`# Rope project settings`
`89`	`89`	`.ropeproject`
	`90`	`+old/`