|
"""Scrape Indeed (India) job postings into a CSV.

Written By : Parvez Alam

For each (job title, city) pair this script pages through Indeed search
results, collects title, company, summary, location, salary and posting
date into a pandas DataFrame, and finally writes it to job_listing.csv.
"""

import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Indeed pages results 10 at a time; pull up to this many rows per city.
max_results_per_city = 100

city_set = ['mumbai', 'bangalore', 'hyderabad', 'pune']

# '+' is the query-string word separator Indeed expects in the q= parameter.
job_title_set = [
    'full+stack+developer',
    'front+end+developer',
    'back+end+developer',
    'software+engineer',
    'data+scientist',
    'machine+learning+engineer',
    'android+developer',
    'ios+developer',
]

columns = ["job_title", "company_name", "summary", "city", "location", "salary", "date"]


def _first_text(div, tag, attrs, default="Not Available"):
    """Return the stripped text of the first descendant of *div* matching
    *tag*/*attrs*, or *default* when nothing matches."""
    node = div.find(tag, attrs=attrs)
    return node.text.strip() if node is not None else default


def _job_title_of(div):
    """Return the posting title from the jobTitle anchor's title attribute."""
    anchor = div.find("a", attrs={"data-tn-element": "jobTitle"})
    if anchor is not None and anchor.has_attr("title"):
        return anchor["title"]
    return "Not Available"


def _company_of(div):
    """Return the company name, falling back to the result-link-source span
    (sponsored/aggregated results) when the company span is absent."""
    name = _first_text(div, "span", {"class": "company"}, default="")
    if not name:
        name = _first_text(div, "span", {"class": "result-link-source"}, default="")
    return name or "Not Available"


sample_df = pd.DataFrame(columns=columns)

for job_title in job_title_set:
    for city in city_set:
        for start in range(0, max_results_per_city, 10):
            url = "http://www.indeed.co.in/jobs?q=%s&l=%s&start=%s" % (job_title, city, start)
            page = requests.get(url)
            print(url)
            time.sleep(1)  # throttle so we don't hammer the server

            # NOTE: ``from_encoding`` was dropped from this call — it is
            # ignored (and warns) when the input is an already-decoded str
            # such as page.text.
            soup = BeautifulSoup(page.text, "lxml")

            for div in soup.find_all("div", attrs={"class": "row"}):
                print(job_title + ' ' + city + ' ' + str(start))
                print(len(sample_df))

                # Build exactly one value per column.  The original appended
                # inside ``for ... in find_all(...)`` loops guarded by bare
                # ``except:``, so a posting with zero or multiple matches
                # produced a list of the wrong length and
                # ``sample_df.loc[num] = job_post`` raised ValueError.
                job_post = [
                    _job_title_of(div),
                    _company_of(div),
                    _first_text(div, "span", {"class": "summary"}),
                    city,
                    _first_text(div, "span", {"class": "location"}),
                    _first_text(div, "span", {"class": "no-wrap"}),  # salary
                    _first_text(div, "span", {"class": "date"}),     # posting date
                ]

                # Row index starts at 1, matching the original behaviour.
                sample_df.loc[len(sample_df) + 1] = job_post

sample_df.to_csv("job_listing.csv", encoding="utf-8")
0 commit comments