Skip to content

jose-blockchain/garlopy

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

6 Commits
 
 
 
 
 
 
 
 

Repository files navigation

garlopy

A Scrapely clone (HTML web scraping learned from examples with basic machine learning) using BeautifulSoup

Example

import unittest

from garlopy import GarlopyScraper


class TestGarlopy(unittest.TestCase):
    """Train-then-scrape example tests for ``GarlopyScraper``.

    Each test trains the scraper on one annotated HTML page and checks
    that scraping a second, similarly structured page yields the
    expected field values.
    """

    def setUp(self):
        """Create a fresh scraper (and a sample sequence) for every test."""
        self.seq = range(10)
        self.s = GarlopyScraper()

    def _test_scraper(self, html1, data1, html2, data2):
        """Train on ``(html1, data1)``, scrape ``html2``, compare to ``data2``.

        Args:
            html1: HTML document used as the training example.
            data1: Mapping of field name to the value annotated in ``html1``.
            html2: HTML document to scrape after training.
            data2: Expected scrape result for ``html2``.

        The scraped ``name``, ``venue_name`` and ``date`` values are
        whitespace-stripped element by element before the comparison.
        NOTE(review): this assumes ``scrape_html`` returns a mutable mapping
        of field name to a list of strings -- confirm against GarlopyScraper.
        """
        scraper = self.s
        scraper.train_html(html1, data1)
        data2_scraped = scraper.scrape_html(html2)

        # Single-argument print(...) emits the same text on Python 2 and 3.
        print(data2)
        print(data2_scraped)
        print('+' * 80)

        # Normalise surrounding whitespace so layout differences between the
        # two pages do not cause spurious mismatches.
        for field in ('name', 'date'):
            if field in data2_scraped:
                data2_scraped[field] = [e.strip() for e in data2_scraped[field]]

        if 'venue_name' in data2_scraped:
            venue_names = data2_scraped['venue_name']
            # A real conditional is required here: the old "x and [...] or
            # None" form turned an empty list into None because [] is falsy.
            data2_scraped['venue_name'] = (
                [e.strip() for e in venue_names]
                if venue_names is not None
                else None
            )

        print('*' * 80)
        print(data2)
        print('-' * 80)
        print(data2_scraped)
        print('*' * 80)

        self.assertEqual(data2, data2_scraped)

    def test_basic(self):
        """A single ``<p>`` field learned from one page is found in another."""
        # Train on a page whose only paragraph holds the name, then scrape a
        # page with the same structure but a different name.
        html1 = ''' <html> <p>Hector</p> </html> '''
        html2 = ''' <html> <p>Jorge</p> </html> '''
        data1 = {'name': 'Hector'}
        data2 = {'name': ['Jorge']}
        self._test_scraper(html1, data1, html2, data2)

About

A Scrapely scraper clone (machine learning HTML scraping from examples) using BeautifulSoup

Topics

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages