Python Forum

Full Version: Web Scraping Error : Not getting expected result
Hi,

I am a beginner in Python programming. Of late, I have started learning web scraping, after completing a Python programming course on Udemy. I was trying to scrape this website but unfortunately could not get the CSV file. Moreover, the total number is also not printing. There may be more errors. Can you please figure them out and help me with a simple explanation? I am just a couple of days old in web scraping.

from bs4 import BeautifulSoup
import requests
import pandas as pd

url = 'https://www.programmableweb.com/apis/directory'
api_dict = {}
api_no = 0

while True:
    response = requests.get(url)
    data = response.text
    soup = BeautifulSoup(data, 'html.parser')
    apis = soup.find_all('td', {'class': 'views-field views-field-title col-md-3'})
    for api in apis:
        name = api.find('a').text
        api_no += 1
        #print(name)
    url_tag = soup.find('a', {'title': 'Go to next page'})
    if url_tag.get('href'):
        url = 'https://www.programmableweb.com' + url_tag.get('href')
        #print(url)
    else:
        break

print('Total APIs: ', api_no)
api_dict_df = pd.DataFrame.from_dict(api_dict, orient='index', columns=['API name'])
api_dict_df.head()
api_dict_df.to_csv('api_detail.csv')
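Two things stand out in the posted code: `api_dict` is never filled inside the loop, so the CSV comes out empty, and on the last page `soup.find('a', {'title': 'Go to next page'})` returns `None`, so `url_tag.get('href')` raises an `AttributeError` before the total ever prints. A minimal sketch of a fixed loop follows; it is not a drop-in replacement, since the page-fetching step is passed in as a function so the pagination logic reads the same whether it is fed live pages or test HTML. The CSS class and the link title are the ones the posted code already targets.

```python
from bs4 import BeautifulSoup

def collect_apis(get_html, start_url):
    """Walk the paginated directory, collecting API names.

    get_html(url) must return an HTML string; with requests this
    would be: lambda u: requests.get(u).text
    """
    api_dict = {}
    url = start_url
    while True:
        soup = BeautifulSoup(get_html(url), 'html.parser')
        # matching one class ('views-field-title') is enough for bs4's filter
        for td in soup.find_all('td', {'class': 'views-field-title'}):
            link = td.find('a')
            api_dict[link.text] = link.get('href')  # actually fill the dict
        next_link = soup.find('a', {'title': 'Go to next page'})
        if next_link is None:  # last page has no "next" link at all
            break
        url = 'https://www.programmableweb.com' + next_link.get('href')
    return api_dict
```

With the dict populated, `print('Total APIs:', len(api_dict))` gives the count, and `pd.DataFrame.from_dict(api_dict, orient='index', columns=['API name']).to_csv('api_detail.csv')` writes the file.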
This code will show what's available in the group you're interested in (it prints out all the tr's and td's).
All you have to do is select the elements that you want, and then extract that data.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import sys

def scrape_page():
    url = 'https://www.programmableweb.com/apis/directory'
    api_dict = {}
    api_no = 0
    # while True:
    response = requests.get(url)
    if response.status_code == 200:
        data = response.text
    else:
        print(f"Unable to fetch page, bad status: {response.status_code}")
        sys.exit(-1)
    soup = BeautifulSoup(data, 'html.parser')
    tbody = soup.select('.views-table > tbody:nth-child(2)')[0]
    trs = tbody.find_all('tr')
    for n, tr in enumerate(trs):
        tds = tr.find_all('td')
        for n1, td in enumerate(tds):
            print(f"\n--------------------- tr_{n}, td_{n1} ---------------------")
            print(td)

scrape_page()
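To go from the printed td's to the CSV the original post wants, you can pull the name and link out of each title cell and feed a dict into pandas. A minimal sketch of that "select and extract" step, run here against a small inline HTML sample shaped like the directory table (the API names in the sample are made up for illustration; with the code above, the markup would come from the response instead):

```python
from bs4 import BeautifulSoup
import pandas as pd

# illustrative sample with the same structure as the directory's title cells
sample_html = """
<table class="views-table"><tbody>
  <tr><td class="views-field views-field-title col-md-3">
      <a href="/api/google-maps">Google Maps</a></td></tr>
  <tr><td class="views-field views-field-title col-md-3">
      <a href="/api/twitter">Twitter</a></td></tr>
</tbody></table>
"""

soup = BeautifulSoup(sample_html, 'html.parser')
api_dict = {}
for td in soup.find_all('td', {'class': 'views-field-title'}):
    link = td.find('a')
    api_dict[link.text] = link.get('href')  # name -> relative URL

api_df = pd.DataFrame.from_dict(api_dict, orient='index', columns=['URL'])
api_df.index.name = 'API name'
api_df.to_csv('api_detail.csv')  # the file the original post is after
print('Total APIs:', len(api_df))
```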
(Oct-08-2019, 03:52 AM)Larz60+ Wrote: [ -> ]This code will show what's available in the group you're interested in (it prints out all the tr's and td's).
All you have to do is select the elements that you want, and then extract that data.

Thank you very much for the solution. As you can understand, I am just a few days old in web scraping. Could you also suggest one good tutorial that I should learn from?

Regards,
Ravi
(Oct-08-2019, 07:53 AM)adminravi Wrote: [ -> ]I am just a few days old in web scraping. Can you also suggest one good tutorial which I should learn from?
I have a couple of tutorials here on this topic.
Web-Scraping part-1
Web-Scraping part-2