11import math
22import re
3+ import datefinder
34import pandas as pd
45from bs4 import BeautifulSoup
56from p_tqdm import p_map
@@ -34,9 +35,11 @@ def extractPage(url: str) -> str:
3435 pageNotLoaded = False
3536 reviewers = []
3637 ratings = []
38+ ratingsDate = []
3739 reviewDescriptions = []
3840 reviewTitles = []
3941 reviewrsSpan = productPage .findAll ("span" , {"class" : "a-profile-name" })
42+ reviewDate = productPage .findAll ("span" , {"class" : "review-date" })
4043 ratingsSpan = productPage .findAll ("i" , {"class" : "review-rating" })
4144 reviewTitlesSpan = productPage .findAll ("a" , {"class" : "review-title-content" })
4245 reviewDescriptionSpan = productPage .findAll (
@@ -48,6 +51,8 @@ def extractPage(url: str) -> str:
4851 for i in range (2 , len (reviewrsSpan )):
4952 reviewers .append (reviewrsSpan [i ].get_text ())
5053 ratings .append (int (ratingsSpan [i ].get_text ()[0 ]))
54+ matches = datefinder .find_dates (reviewDate [i ].get_text ())
55+ ratingsDate .append (list (matches )[0 ].strftime ("%m/%d/%Y" ))
5156
5257 for i in range (0 , len (reviewTitlesSpan )):
5358 reviewTitles .append (reviewTitlesSpan [i ].get_text ())
@@ -63,6 +68,7 @@ def extractPage(url: str) -> str:
6368 "ratings" : ratings ,
6469 "reviewTitles" : reviewTitles ,
6570 "reviewDescriptions" : reviewDescriptions ,
71+ "date" : ratingsDate ,
6672 }
6773
6874
@@ -107,6 +113,7 @@ def scrape_reviews(url):
107113 productReviewsData ["Rating" ] = res ["ratings" ]
108114 productReviewsData ["Title" ] = res ["reviewTitles" ]
109115 productReviewsData ["Description" ] = res ["reviewDescriptions" ]
116+ productReviewsData ["Date" ] = res ["date" ]
110117 # productReviewsData["link"] = url
111118 # productReviewsData["Product Title"] = pageTitle
112119
0 commit comments