prathimacode-hub · prathimacode-hub · Oct 30, 2022 · Oct 8, 2022 · Oct 8, 2022 · Oct 8, 2022
diff --git a/WebScrapingScripts/Flipkart Mobiles Scraping/Images/Screenshot_20221008_070029.png b/WebScrapingScripts/Flipkart Mobiles Scraping/Images/Screenshot_20221008_070029.png
diff --git a/WebScrapingScripts/Flipkart Mobiles Scraping/Images/Screenshot_20221008_070628.png b/WebScrapingScripts/Flipkart Mobiles Scraping/Images/Screenshot_20221008_070628.png
diff --git a/WebScrapingScripts/Flipkart Mobiles Scraping/Images/Screenshot_20221008_071226.png b/WebScrapingScripts/Flipkart Mobiles Scraping/Images/Screenshot_20221008_071226.png
diff --git a/WebScrapingScripts/Flipkart Mobiles Scraping/Images/readme.md b/WebScrapingScripts/Flipkart Mobiles Scraping/Images/readme.md
@@ -0,0 +1 @@
+images
diff --git a/WebScrapingScripts/Flipkart Mobiles Scraping/README.md b/WebScrapingScripts/Flipkart Mobiles Scraping/README.md
@@ -0,0 +1,50 @@
+# Overview
+We will be scraping data from Flipkart website and then will use the data to create a DataFrame, which used for data analysis.
+
+## Web-Scraping
+ Web scraping is an automatic method to obtain large amounts of data from websites. 
+
+ Most of this data is unstructured data in an HTML format which is then converted into structured data in a spreadsheet or a 
+ database so that it can be used in various applications.
+
+## Data Frame
+ A data frame is a table or a two-dimensional array-like structure in which each column contains values of one variable 
+ and each row contains one set of values from each column.
+
+## Python Libraries Required
+
+- BeautifulSoup - It has been used to extract the HTML elements from website.
+- Urllib - It has been to send and recieve the request in order to fetch the data from FlipKart.
+- Pandas - It is used to create and store dataframes into .csv format.
+
+## Setup Packages
+ - pip install requests
+ - pip install pandas
+ - pip install bs4
+
+## Workflow
+1. Go to the url : <a href = 'https://www.flipkart.com/search?p%5B%5D=facets.brand%255B%255D%3DSamsung&sid=tyy%2F4io&sort=recency_desc&wid=1.productCard.PMU_V2_1'> FlipKart Samsung Mobiles </a>
+
+
+<img width="631" alt="image" src="https://user-images.githubusercontent.com/76874762/194710211-de97ec25-1e4a-432e-9c41-edb2e3660b64.png">
+
+
+
+2. Right click to Inspect & Hover over to see the HTML code.
+
+<img width="902" alt="Screenshot_20221008_070628" src="https://user-images.githubusercontent.com/76874762/194710338-3b62fd8a-31da-4e2d-8552-14d8ba6953f1.png">
+
+
+3. Import libraries to start scraping web data
+
+4. Hover over elements you want to scrape data from and use BeautifulSoup library methods to extract data from tags and class names.
+
+5. Create a Dataframe from extracted data and proceed for further analysis.
+
+
+## Output
+<img width="602" alt="image" src="https://user-images.githubusercontent.com/76874762/194710541-a29228c9-e3e0-41e2-a120-17e2968f7ad8.png">
+
+### Author
+
+<a href='https://singhmansi25.github.io'> Mansi Singh </a>
diff --git a/WebScrapingScripts/Flipkart Mobiles Scraping/requirements.txt b/WebScrapingScripts/Flipkart Mobiles Scraping/requirements.txt
@@ -0,0 +1,4 @@
+pandas
+bs4
+requests
+pip
diff --git a/WebScrapingScripts/Flipkart Mobiles Scraping/webscraping.py b/WebScrapingScripts/Flipkart Mobiles Scraping/webscraping.py
@@ -0,0 +1,114 @@
+# HTML Page Read and Upload
+# Import useful libraries and classes.
+
+from urllib.request import urlopen as uReq
+from bs4 import BeautifulSoup as soup
+
+# html page upload and read in web_page variable.
+my_url= "https://www.flipkart.com/search?p%5B%5D=facets.brand%255B%255D%3DSamsung&sid=tyy%2F4io&sort=recency_desc&wid=1.productCard.PMU_V2_1"
+web_page= uReq(my_url)
+page_html= web_page.read()
+
+# Parsing
+# html parser. It is to beautify the HTML code.
+page_soup= soup(page_html)
+
+# Extraction of information
+# read class attribute from web page in containers variable.
+# Print the length of containers.
+
+containers= page_soup.findAll("div", {"class": "_2kHMtA"})
+print(len(containers))
+
+# Extracting Product Name
+# The product_name_list contains the name of product extracted using find function using div tag and
+# class name
+
+product_name_list = []
+
+for box in containers:
+ # name of product is extracted using div tag with class name as given in website
+ product_name = box.find("div", class_="_4rR01T")
+ # the extracted names is stored in a list
+ product_name_list.append(product_name.string)
+
+# Extracting Ratings
+# The rating_list contains the ratings of product extracted using find function using div tag and class name
+
+rating_list = []
+
+for box in containers:
+ # rating of product is extracted using div tag with class name as given in website, if the rating is None, then 0.0 is used.
+ rating = box.find("div", class_="_3LWZlK")
+ if rating != None:
+ rating_list.append(rating.text)
+ else:
+ rating_list.append('0.0')
+# Extracting Price of Product
+# The price_list contains the price of product extracted using find function using div tag and class name
+price_list = []
+
+for box in containers:
+ # price of product is extracted using div tag with class name as given in website
+ price = box.find("div", class_="_30jeq3")
+ # the extracted price is stored in a list after string Rupees sign.
+ price_list.append(price.string.strip('₹'))
+
+# The container in website contains a list of information of the product which is
+# extracted using find function using li tag and class name
+# here n is length of containers in 1 page.
+
+n = len(containers)
+
+# list to store RAM of phones
+ram_list = []
+# list to store ROM of phones
+rom_list = []
+# list to store Display Screen of phones
+display_list = []
+# list to store Camera Specification of phones
+camera_list = []
+# list to store Battery Life of phones
+battery_life_list = []
+# list to store Warranty Period of phones
+warranty_list = []
+# temporary list to store the all the list of phones's specifications
+temp_list = []
+
+
+for box in containers:
+ # one list out of all product list is extracted using li tag with class name as given in website
+ temp_box = box.findAll("li", class_="rgWa7D")
+ temp_list.append(temp_box)
+
+for i in range(n):
+ # this loop extracts the values stored in the list of one container.
+ # since in the website the RAM & ROM of phoes are listed together
+ # so it is stored in a list and then splitted as per given splittor element.
+ split_list = temp_list[i][0].string.split('|')
+ # the extracted RAM is stored in a list
+ ram_list.append(split_list[0])
+ # the extracted ROM is stored in a list
+ rom_list.append(split_list[1])
+ # the extracted display is stored in a list
+ display_list.append(temp_list[i][1].string)
+ # the extracted camera is stored in a list
+ camera_list.append(temp_list[i][2].string)
+ # the extracted battery is stored in a list
+ battery_life_list.append(temp_list[i][3].string)
+ # the extracted warranty is stored in a list
+ warranty_list.append(temp_list[i][-1].string)
+
+# Creating Pandas DataFrame from Data scraped from Web
+# Importing Pandas to create a DataFrame
+import pandas as pd
+# Creating a Dictionary to store List values and creating DataFrame
+dictionary = {'Product_Name':product_name_list, 'Ratings':rating_list, 'Price':price_list, 'RAM_Storage':ram_list,
+ 'ROM_Storage':rom_list, 'Display_Screen':display_list, 'Camera':camera_list, 'Battery_Life':battery_life_list,
+ 'Warranty_Life':warranty_list}
+dataframe = pd.DataFrame(dictionary)
+# Head of DataFrame
+dataframe.head()
+
+# Tail of DataFrame
+dataframe.tail()